Diffstat (limited to 'kernel')
124 files changed, 5767 insertions, 3039 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 17ea6d4a9a24..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o | |||
| 57 | obj-$(CONFIG_USER_NS) += user_namespace.o | 57 | obj-$(CONFIG_USER_NS) += user_namespace.o |
| 58 | obj-$(CONFIG_PID_NS) += pid_namespace.o | 58 | obj-$(CONFIG_PID_NS) += pid_namespace.o |
| 59 | obj-$(CONFIG_IKCONFIG) += configs.o | 59 | obj-$(CONFIG_IKCONFIG) += configs.o |
| 60 | obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o | ||
| 61 | obj-$(CONFIG_SMP) += stop_machine.o | 60 | obj-$(CONFIG_SMP) += stop_machine.o |
| 62 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o | 61 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o |
| 63 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o | 62 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o |
diff --git a/kernel/audit.c b/kernel/audit.c
index cebb11db4d34..72ab759a0b43 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb) | |||
| 429 | * This function doesn't consume an skb as might be expected since it has to | 429 | * This function doesn't consume an skb as might be expected since it has to |
| 430 | * copy it anyways. | 430 | * copy it anyways. |
| 431 | */ | 431 | */ |
| 432 | static void kauditd_send_multicast_skb(struct sk_buff *skb) | 432 | static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) |
| 433 | { | 433 | { |
| 434 | struct sk_buff *copy; | 434 | struct sk_buff *copy; |
| 435 | struct audit_net *aunet = net_generic(&init_net, audit_net_id); | 435 | struct audit_net *aunet = net_generic(&init_net, audit_net_id); |
| @@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb) | |||
| 448 | * no reason for new multicast clients to continue with this | 448 | * no reason for new multicast clients to continue with this |
| 449 | * non-compliance. | 449 | * non-compliance. |
| 450 | */ | 450 | */ |
| 451 | copy = skb_copy(skb, GFP_KERNEL); | 451 | copy = skb_copy(skb, gfp_mask); |
| 452 | if (!copy) | 452 | if (!copy) |
| 453 | return; | 453 | return; |
| 454 | 454 | ||
| 455 | nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); | 455 | nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask); |
| 456 | } | 456 | } |
| 457 | 457 | ||
| 458 | /* | 458 | /* |
| @@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy) | |||
| 499 | set_freezable(); | 499 | set_freezable(); |
| 500 | while (!kthread_should_stop()) { | 500 | while (!kthread_should_stop()) { |
| 501 | struct sk_buff *skb; | 501 | struct sk_buff *skb; |
| 502 | DECLARE_WAITQUEUE(wait, current); | ||
| 503 | 502 | ||
| 504 | flush_hold_queue(); | 503 | flush_hold_queue(); |
| 505 | 504 | ||
| @@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy) | |||
| 514 | audit_printk_skb(skb); | 513 | audit_printk_skb(skb); |
| 515 | continue; | 514 | continue; |
| 516 | } | 515 | } |
| 517 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 518 | add_wait_queue(&kauditd_wait, &wait); | ||
| 519 | 516 | ||
| 520 | if (!skb_queue_len(&audit_skb_queue)) { | 517 | wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); |
| 521 | try_to_freeze(); | ||
| 522 | schedule(); | ||
| 523 | } | ||
| 524 | |||
| 525 | __set_current_state(TASK_RUNNING); | ||
| 526 | remove_wait_queue(&kauditd_wait, &wait); | ||
| 527 | } | 518 | } |
| 528 | return 0; | 519 | return 0; |
| 529 | } | 520 | } |
| @@ -842,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 842 | s.backlog_limit = audit_backlog_limit; | 833 | s.backlog_limit = audit_backlog_limit; |
| 843 | s.lost = atomic_read(&audit_lost); | 834 | s.lost = atomic_read(&audit_lost); |
| 844 | s.backlog = skb_queue_len(&audit_skb_queue); | 835 | s.backlog = skb_queue_len(&audit_skb_queue); |
| 845 | s.version = AUDIT_VERSION_LATEST; | 836 | s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; |
| 846 | s.backlog_wait_time = audit_backlog_wait_time; | 837 | s.backlog_wait_time = audit_backlog_wait_time; |
| 847 | audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); | 838 | audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); |
| 848 | break; | 839 | break; |
| @@ -1109,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb) | |||
| 1109 | } | 1100 | } |
| 1110 | 1101 | ||
| 1111 | /* Run custom bind function on netlink socket group connect or bind requests. */ | 1102 | /* Run custom bind function on netlink socket group connect or bind requests. */ |
| 1112 | static int audit_bind(int group) | 1103 | static int audit_bind(struct net *net, int group) |
| 1113 | { | 1104 | { |
| 1114 | if (!capable(CAP_AUDIT_READ)) | 1105 | if (!capable(CAP_AUDIT_READ)) |
| 1115 | return -EPERM; | 1106 | return -EPERM; |
| @@ -1949,7 +1940,7 @@ void audit_log_end(struct audit_buffer *ab) | |||
| 1949 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | 1940 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); |
| 1950 | 1941 | ||
| 1951 | nlh->nlmsg_len = ab->skb->len; | 1942 | nlh->nlmsg_len = ab->skb->len; |
| 1952 | kauditd_send_multicast_skb(ab->skb); | 1943 | kauditd_send_multicast_skb(ab->skb, ab->gfp_mask); |
| 1953 | 1944 | ||
| 1954 | /* | 1945 | /* |
| 1955 | * The original kaudit unicast socket sends up messages with | 1946 | * The original kaudit unicast socket sends up messages with |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 80f29e015570..2e0c97427b33 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
| @@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk) | |||
| 174 | struct fsnotify_mark *entry = &chunk->mark; | 174 | struct fsnotify_mark *entry = &chunk->mark; |
| 175 | struct list_head *list; | 175 | struct list_head *list; |
| 176 | 176 | ||
| 177 | if (!entry->i.inode) | 177 | if (!entry->inode) |
| 178 | return; | 178 | return; |
| 179 | list = chunk_hash(entry->i.inode); | 179 | list = chunk_hash(entry->inode); |
| 180 | list_add_rcu(&chunk->hash, list); | 180 | list_add_rcu(&chunk->hash, list); |
| 181 | } | 181 | } |
| 182 | 182 | ||
| @@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) | |||
| 188 | 188 | ||
| 189 | list_for_each_entry_rcu(p, list, hash) { | 189 | list_for_each_entry_rcu(p, list, hash) { |
| 190 | /* mark.inode may have gone NULL, but who cares? */ | 190 | /* mark.inode may have gone NULL, but who cares? */ |
| 191 | if (p->mark.i.inode == inode) { | 191 | if (p->mark.inode == inode) { |
| 192 | atomic_long_inc(&p->refs); | 192 | atomic_long_inc(&p->refs); |
| 193 | return p; | 193 | return p; |
| 194 | } | 194 | } |
| @@ -231,7 +231,7 @@ static void untag_chunk(struct node *p) | |||
| 231 | new = alloc_chunk(size); | 231 | new = alloc_chunk(size); |
| 232 | 232 | ||
| 233 | spin_lock(&entry->lock); | 233 | spin_lock(&entry->lock); |
| 234 | if (chunk->dead || !entry->i.inode) { | 234 | if (chunk->dead || !entry->inode) { |
| 235 | spin_unlock(&entry->lock); | 235 | spin_unlock(&entry->lock); |
| 236 | if (new) | 236 | if (new) |
| 237 | free_chunk(new); | 237 | free_chunk(new); |
| @@ -258,7 +258,7 @@ static void untag_chunk(struct node *p) | |||
| 258 | goto Fallback; | 258 | goto Fallback; |
| 259 | 259 | ||
| 260 | fsnotify_duplicate_mark(&new->mark, entry); | 260 | fsnotify_duplicate_mark(&new->mark, entry); |
| 261 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { | 261 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { |
| 262 | fsnotify_put_mark(&new->mark); | 262 | fsnotify_put_mark(&new->mark); |
| 263 | goto Fallback; | 263 | goto Fallback; |
| 264 | } | 264 | } |
| @@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
| 386 | chunk_entry = &chunk->mark; | 386 | chunk_entry = &chunk->mark; |
| 387 | 387 | ||
| 388 | spin_lock(&old_entry->lock); | 388 | spin_lock(&old_entry->lock); |
| 389 | if (!old_entry->i.inode) { | 389 | if (!old_entry->inode) { |
| 390 | /* old_entry is being shot, lets just lie */ | 390 | /* old_entry is being shot, lets just lie */ |
| 391 | spin_unlock(&old_entry->lock); | 391 | spin_unlock(&old_entry->lock); |
| 392 | fsnotify_put_mark(old_entry); | 392 | fsnotify_put_mark(old_entry); |
| @@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
| 395 | } | 395 | } |
| 396 | 396 | ||
| 397 | fsnotify_duplicate_mark(chunk_entry, old_entry); | 397 | fsnotify_duplicate_mark(chunk_entry, old_entry); |
| 398 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { | 398 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { |
| 399 | spin_unlock(&old_entry->lock); | 399 | spin_unlock(&old_entry->lock); |
| 400 | fsnotify_put_mark(chunk_entry); | 400 | fsnotify_put_mark(chunk_entry); |
| 401 | fsnotify_put_mark(old_entry); | 401 | fsnotify_put_mark(old_entry); |
| @@ -611,7 +611,7 @@ void audit_trim_trees(void) | |||
| 611 | list_for_each_entry(node, &tree->chunks, list) { | 611 | list_for_each_entry(node, &tree->chunks, list) { |
| 612 | struct audit_chunk *chunk = find_chunk(node); | 612 | struct audit_chunk *chunk = find_chunk(node); |
| 613 | /* this could be NULL if the watch is dying else where... */ | 613 | /* this could be NULL if the watch is dying else where... */ |
| 614 | struct inode *inode = chunk->mark.i.inode; | 614 | struct inode *inode = chunk->mark.inode; |
| 615 | node->index |= 1U<<31; | 615 | node->index |= 1U<<31; |
| 616 | if (iterate_mounts(compare_root, inode, root_mnt)) | 616 | if (iterate_mounts(compare_root, inode, root_mnt)) |
| 617 | node->index &= ~(1U<<31); | 617 | node->index &= ~(1U<<31); |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 3598e13f2a65..4f68a326d92e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
| @@ -442,19 +442,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 442 | if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { | 442 | if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { |
| 443 | f->type = AUDIT_LOGINUID_SET; | 443 | f->type = AUDIT_LOGINUID_SET; |
| 444 | f->val = 0; | 444 | f->val = 0; |
| 445 | } | 445 | entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; |
| 446 | |||
| 447 | if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) { | ||
| 448 | struct pid *pid; | ||
| 449 | rcu_read_lock(); | ||
| 450 | pid = find_vpid(f->val); | ||
| 451 | if (!pid) { | ||
| 452 | rcu_read_unlock(); | ||
| 453 | err = -ESRCH; | ||
| 454 | goto exit_free; | ||
| 455 | } | ||
| 456 | f->val = pid_nr(pid); | ||
| 457 | rcu_read_unlock(); | ||
| 458 | } | 446 | } |
| 459 | 447 | ||
| 460 | err = audit_field_valid(entry, f); | 448 | err = audit_field_valid(entry, f); |
| @@ -630,6 +618,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | |||
| 630 | data->buflen += data->values[i] = | 618 | data->buflen += data->values[i] = |
| 631 | audit_pack_string(&bufp, krule->filterkey); | 619 | audit_pack_string(&bufp, krule->filterkey); |
| 632 | break; | 620 | break; |
| 621 | case AUDIT_LOGINUID_SET: | ||
| 622 | if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { | ||
| 623 | data->fields[i] = AUDIT_LOGINUID; | ||
| 624 | data->values[i] = AUDIT_UID_UNSET; | ||
| 625 | break; | ||
| 626 | } | ||
| 627 | /* fallthrough if set */ | ||
| 633 | default: | 628 | default: |
| 634 | data->values[i] = f->val; | 629 | data->values[i] = f->val; |
| 635 | } | 630 | } |
| @@ -646,6 +641,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
| 646 | int i; | 641 | int i; |
| 647 | 642 | ||
| 648 | if (a->flags != b->flags || | 643 | if (a->flags != b->flags || |
| 644 | a->pflags != b->pflags || | ||
| 649 | a->listnr != b->listnr || | 645 | a->listnr != b->listnr || |
| 650 | a->action != b->action || | 646 | a->action != b->action || |
| 651 | a->field_count != b->field_count) | 647 | a->field_count != b->field_count) |
| @@ -764,6 +760,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) | |||
| 764 | new = &entry->rule; | 760 | new = &entry->rule; |
| 765 | new->vers_ops = old->vers_ops; | 761 | new->vers_ops = old->vers_ops; |
| 766 | new->flags = old->flags; | 762 | new->flags = old->flags; |
| 763 | new->pflags = old->pflags; | ||
| 767 | new->listnr = old->listnr; | 764 | new->listnr = old->listnr; |
| 768 | new->action = old->action; | 765 | new->action = old->action; |
| 769 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) | 766 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e420a0c41b5f..072566dd0caf 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -72,6 +72,8 @@ | |||
| 72 | #include <linux/fs_struct.h> | 72 | #include <linux/fs_struct.h> |
| 73 | #include <linux/compat.h> | 73 | #include <linux/compat.h> |
| 74 | #include <linux/ctype.h> | 74 | #include <linux/ctype.h> |
| 75 | #include <linux/string.h> | ||
| 76 | #include <uapi/linux/limits.h> | ||
| 75 | 77 | ||
| 76 | #include "audit.h" | 78 | #include "audit.h" |
| 77 | 79 | ||
| @@ -1861,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, | |||
| 1861 | } | 1863 | } |
| 1862 | 1864 | ||
| 1863 | list_for_each_entry_reverse(n, &context->names_list, list) { | 1865 | list_for_each_entry_reverse(n, &context->names_list, list) { |
| 1864 | /* does the name pointer match? */ | 1866 | if (!n->name || strcmp(n->name->name, name->name)) |
| 1865 | if (!n->name || n->name->name != name->name) | ||
| 1866 | continue; | 1867 | continue; |
| 1867 | 1868 | ||
| 1868 | /* match the correct record type */ | 1869 | /* match the correct record type */ |
| @@ -1877,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, | |||
| 1877 | } | 1878 | } |
| 1878 | 1879 | ||
| 1879 | out_alloc: | 1880 | out_alloc: |
| 1880 | /* unable to find the name from a previous getname(). Allocate a new | 1881 | /* unable to find an entry with both a matching name and type */ |
| 1881 | * anonymous entry. | 1882 | n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); |
| 1882 | */ | ||
| 1883 | n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); | ||
| 1884 | if (!n) | 1883 | if (!n) |
| 1885 | return; | 1884 | return; |
| 1885 | /* unfortunately, while we may have a path name to record with the | ||
| 1886 | * inode, we can't always rely on the string lasting until the end of | ||
| 1887 | * the syscall so we need to create our own copy, it may fail due to | ||
| 1888 | * memory allocation issues, but we do our best */ | ||
| 1889 | if (name) { | ||
| 1890 | /* we can't use getname_kernel() due to size limits */ | ||
| 1891 | size_t len = strlen(name->name) + 1; | ||
| 1892 | struct filename *new = __getname(); | ||
| 1893 | |||
| 1894 | if (unlikely(!new)) | ||
| 1895 | goto out; | ||
| 1896 | |||
| 1897 | if (len <= (PATH_MAX - sizeof(*new))) { | ||
| 1898 | new->name = (char *)(new) + sizeof(*new); | ||
| 1899 | new->separate = false; | ||
| 1900 | } else if (len <= PATH_MAX) { | ||
| 1901 | /* this looks odd, but is due to final_putname() */ | ||
| 1902 | struct filename *new2; | ||
| 1903 | |||
| 1904 | new2 = kmalloc(sizeof(*new2), GFP_KERNEL); | ||
| 1905 | if (unlikely(!new2)) { | ||
| 1906 | __putname(new); | ||
| 1907 | goto out; | ||
| 1908 | } | ||
| 1909 | new2->name = (char *)new; | ||
| 1910 | new2->separate = true; | ||
| 1911 | new = new2; | ||
| 1912 | } else { | ||
| 1913 | /* we should never get here, but let's be safe */ | ||
| 1914 | __putname(new); | ||
| 1915 | goto out; | ||
| 1916 | } | ||
| 1917 | strlcpy((char *)new->name, name->name, len); | ||
| 1918 | new->uptr = NULL; | ||
| 1919 | new->aname = n; | ||
| 1920 | n->name = new; | ||
| 1921 | n->name_put = true; | ||
| 1922 | } | ||
| 1886 | out: | 1923 | out: |
| 1887 | if (parent) { | 1924 | if (parent) { |
| 1888 | n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; | 1925 | n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; |
| @@ -1897,6 +1934,11 @@ out: | |||
| 1897 | audit_copy_inode(n, dentry, inode); | 1934 | audit_copy_inode(n, dentry, inode); |
| 1898 | } | 1935 | } |
| 1899 | 1936 | ||
| 1937 | void __audit_file(const struct file *file) | ||
| 1938 | { | ||
| 1939 | __audit_inode(NULL, file->f_path.dentry, 0); | ||
| 1940 | } | ||
| 1941 | |||
| 1900 | /** | 1942 | /** |
| 1901 | * __audit_inode_child - collect inode info for created/removed objects | 1943 | * __audit_inode_child - collect inode info for created/removed objects |
| 1902 | * @parent: inode of dentry parent | 1944 | * @parent: inode of dentry parent |
| @@ -2373,7 +2415,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, | |||
| 2373 | ax->d.next = context->aux; | 2415 | ax->d.next = context->aux; |
| 2374 | context->aux = (void *)ax; | 2416 | context->aux = (void *)ax; |
| 2375 | 2417 | ||
| 2376 | dentry = dget(bprm->file->f_dentry); | 2418 | dentry = dget(bprm->file->f_path.dentry); |
| 2377 | get_vfs_caps_from_disk(dentry, &vcaps); | 2419 | get_vfs_caps_from_disk(dentry, &vcaps); |
| 2378 | dput(dentry); | 2420 | dput(dentry); |
| 2379 | 2421 | ||
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6ae7df..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
| @@ -1,5 +1,5 @@ | |||
| 1 | obj-y := core.o | 1 | obj-y := core.o |
| 2 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o | 2 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o |
| 3 | ifdef CONFIG_TEST_BPF | 3 | ifdef CONFIG_TEST_BPF |
| 4 | obj-$(CONFIG_BPF_SYSCALL) += test_stub.o | 4 | obj-$(CONFIG_BPF_SYSCALL) += test_stub.o |
| 5 | endif | 5 | endif |
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..9eb4d8a7cd87
--- /dev/null
+++ b/kernel/bpf/arraymap.c
| @@ -0,0 +1,156 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | ||
| 12 | #include <linux/bpf.h> | ||
| 13 | #include <linux/err.h> | ||
| 14 | #include <linux/vmalloc.h> | ||
| 15 | #include <linux/slab.h> | ||
| 16 | #include <linux/mm.h> | ||
| 17 | |||
| 18 | struct bpf_array { | ||
| 19 | struct bpf_map map; | ||
| 20 | u32 elem_size; | ||
| 21 | char value[0] __aligned(8); | ||
| 22 | }; | ||
| 23 | |||
| 24 | /* Called from syscall */ | ||
| 25 | static struct bpf_map *array_map_alloc(union bpf_attr *attr) | ||
| 26 | { | ||
| 27 | struct bpf_array *array; | ||
| 28 | u32 elem_size, array_size; | ||
| 29 | |||
| 30 | /* check sanity of attributes */ | ||
| 31 | if (attr->max_entries == 0 || attr->key_size != 4 || | ||
| 32 | attr->value_size == 0) | ||
| 33 | return ERR_PTR(-EINVAL); | ||
| 34 | |||
| 35 | elem_size = round_up(attr->value_size, 8); | ||
| 36 | |||
| 37 | /* check round_up into zero and u32 overflow */ | ||
| 38 | if (elem_size == 0 || | ||
| 39 | attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) | ||
| 40 | return ERR_PTR(-ENOMEM); | ||
| 41 | |||
| 42 | array_size = sizeof(*array) + attr->max_entries * elem_size; | ||
| 43 | |||
| 44 | /* allocate all map elements and zero-initialize them */ | ||
| 45 | array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); | ||
| 46 | if (!array) { | ||
| 47 | array = vzalloc(array_size); | ||
| 48 | if (!array) | ||
| 49 | return ERR_PTR(-ENOMEM); | ||
| 50 | } | ||
| 51 | |||
| 52 | /* copy mandatory map attributes */ | ||
| 53 | array->map.key_size = attr->key_size; | ||
| 54 | array->map.value_size = attr->value_size; | ||
| 55 | array->map.max_entries = attr->max_entries; | ||
| 56 | |||
| 57 | array->elem_size = elem_size; | ||
| 58 | |||
| 59 | return &array->map; | ||
| 60 | } | ||
| 61 | |||
| 62 | /* Called from syscall or from eBPF program */ | ||
| 63 | static void *array_map_lookup_elem(struct bpf_map *map, void *key) | ||
| 64 | { | ||
| 65 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
| 66 | u32 index = *(u32 *)key; | ||
| 67 | |||
| 68 | if (index >= array->map.max_entries) | ||
| 69 | return NULL; | ||
| 70 | |||
| 71 | return array->value + array->elem_size * index; | ||
| 72 | } | ||
| 73 | |||
| 74 | /* Called from syscall */ | ||
| 75 | static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | ||
| 76 | { | ||
| 77 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
| 78 | u32 index = *(u32 *)key; | ||
| 79 | u32 *next = (u32 *)next_key; | ||
| 80 | |||
| 81 | if (index >= array->map.max_entries) { | ||
| 82 | *next = 0; | ||
| 83 | return 0; | ||
| 84 | } | ||
| 85 | |||
| 86 | if (index == array->map.max_entries - 1) | ||
| 87 | return -ENOENT; | ||
| 88 | |||
| 89 | *next = index + 1; | ||
| 90 | return 0; | ||
| 91 | } | ||
| 92 | |||
| 93 | /* Called from syscall or from eBPF program */ | ||
| 94 | static int array_map_update_elem(struct bpf_map *map, void *key, void *value, | ||
| 95 | u64 map_flags) | ||
| 96 | { | ||
| 97 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
| 98 | u32 index = *(u32 *)key; | ||
| 99 | |||
| 100 | if (map_flags > BPF_EXIST) | ||
| 101 | /* unknown flags */ | ||
| 102 | return -EINVAL; | ||
| 103 | |||
| 104 | if (index >= array->map.max_entries) | ||
| 105 | /* all elements were pre-allocated, cannot insert a new one */ | ||
| 106 | return -E2BIG; | ||
| 107 | |||
| 108 | if (map_flags == BPF_NOEXIST) | ||
| 109 | /* all elements already exist */ | ||
| 110 | return -EEXIST; | ||
| 111 | |||
| 112 | memcpy(array->value + array->elem_size * index, value, array->elem_size); | ||
| 113 | return 0; | ||
| 114 | } | ||
| 115 | |||
| 116 | /* Called from syscall or from eBPF program */ | ||
| 117 | static int array_map_delete_elem(struct bpf_map *map, void *key) | ||
| 118 | { | ||
| 119 | return -EINVAL; | ||
| 120 | } | ||
| 121 | |||
| 122 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ | ||
| 123 | static void array_map_free(struct bpf_map *map) | ||
| 124 | { | ||
| 125 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
| 126 | |||
| 127 | /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, | ||
| 128 | * so the programs (can be more than one that used this map) were | ||
| 129 | * disconnected from events. Wait for outstanding programs to complete | ||
| 130 | * and free the array | ||
| 131 | */ | ||
| 132 | synchronize_rcu(); | ||
| 133 | |||
| 134 | kvfree(array); | ||
| 135 | } | ||
| 136 | |||
| 137 | static struct bpf_map_ops array_ops = { | ||
| 138 | .map_alloc = array_map_alloc, | ||
| 139 | .map_free = array_map_free, | ||
| 140 | .map_get_next_key = array_map_get_next_key, | ||
| 141 | .map_lookup_elem = array_map_lookup_elem, | ||
| 142 | .map_update_elem = array_map_update_elem, | ||
| 143 | .map_delete_elem = array_map_delete_elem, | ||
| 144 | }; | ||
| 145 | |||
| 146 | static struct bpf_map_type_list tl = { | ||
| 147 | .ops = &array_ops, | ||
| 148 | .type = BPF_MAP_TYPE_ARRAY, | ||
| 149 | }; | ||
| 150 | |||
| 151 | static int __init register_array_map(void) | ||
| 152 | { | ||
| 153 | bpf_register_map_type(&tl); | ||
| 154 | return 0; | ||
| 155 | } | ||
| 156 | late_initcall(register_array_map); | ||
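The array map above is exercised entirely through the new bpf(2) syscall commands (BPF_MAP_CREATE, BPF_MAP_UPDATE_ELEM, BPF_MAP_LOOKUP_ELEM). The following minimal userspace sketch is not part of the patch; it assumes the uapi <linux/bpf.h> from this series is installed and that the libc headers define __NR_bpf, and it trims error handling to the essentials.

	/* Sketch: create a BPF_MAP_TYPE_ARRAY (keys must be 4 bytes), store one
	 * element and read it back through the syscall interface. */
	#include <linux/bpf.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int bpf(int cmd, union bpf_attr *attr)
	{
		return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
	}

	int main(void)
	{
		union bpf_attr attr;
		uint32_t key = 3;
		uint64_t value = 42, out = 0;
		int map_fd;

		memset(&attr, 0, sizeof(attr));
		attr.map_type = BPF_MAP_TYPE_ARRAY;
		attr.key_size = sizeof(key);		/* array_map_alloc() insists on 4 */
		attr.value_size = sizeof(value);
		attr.max_entries = 16;			/* all 16 slots are pre-allocated */
		map_fd = bpf(BPF_MAP_CREATE, &attr);
		if (map_fd < 0)
			return 1;

		memset(&attr, 0, sizeof(attr));
		attr.map_fd = map_fd;
		attr.key = (uint64_t)(unsigned long)&key;
		attr.value = (uint64_t)(unsigned long)&value;
		attr.flags = BPF_ANY;			/* array slots always exist */
		if (bpf(BPF_MAP_UPDATE_ELEM, &attr))
			return 1;

		attr.value = (uint64_t)(unsigned long)&out;
		if (bpf(BPF_MAP_LOOKUP_ELEM, &attr))
			return 1;
		printf("slot %u = %llu\n", key, (unsigned long long)out);
		return 0;
	}

Deleting from an array map is expected to fail: array_map_delete_elem() above always returns -EINVAL, since the elements are statically allocated.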
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d6594e457a25..a64e7a207d2b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
| @@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, | |||
| 163 | 163 | ||
| 164 | void bpf_jit_binary_free(struct bpf_binary_header *hdr) | 164 | void bpf_jit_binary_free(struct bpf_binary_header *hdr) |
| 165 | { | 165 | { |
| 166 | module_free(NULL, hdr); | 166 | module_memfree(hdr); |
| 167 | } | 167 | } |
| 168 | #endif /* CONFIG_BPF_JIT */ | 168 | #endif /* CONFIG_BPF_JIT */ |
| 169 | 169 | ||
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..b3ba43674310
--- /dev/null
+++ b/kernel/bpf/hashtab.c
| @@ -0,0 +1,367 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | ||
| 12 | #include <linux/bpf.h> | ||
| 13 | #include <linux/jhash.h> | ||
| 14 | #include <linux/filter.h> | ||
| 15 | #include <linux/vmalloc.h> | ||
| 16 | |||
| 17 | struct bpf_htab { | ||
| 18 | struct bpf_map map; | ||
| 19 | struct hlist_head *buckets; | ||
| 20 | spinlock_t lock; | ||
| 21 | u32 count; /* number of elements in this hashtable */ | ||
| 22 | u32 n_buckets; /* number of hash buckets */ | ||
| 23 | u32 elem_size; /* size of each element in bytes */ | ||
| 24 | }; | ||
| 25 | |||
| 26 | /* each htab element is struct htab_elem + key + value */ | ||
| 27 | struct htab_elem { | ||
| 28 | struct hlist_node hash_node; | ||
| 29 | struct rcu_head rcu; | ||
| 30 | u32 hash; | ||
| 31 | char key[0] __aligned(8); | ||
| 32 | }; | ||
| 33 | |||
| 34 | /* Called from syscall */ | ||
| 35 | static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | ||
| 36 | { | ||
| 37 | struct bpf_htab *htab; | ||
| 38 | int err, i; | ||
| 39 | |||
| 40 | htab = kzalloc(sizeof(*htab), GFP_USER); | ||
| 41 | if (!htab) | ||
| 42 | return ERR_PTR(-ENOMEM); | ||
| 43 | |||
| 44 | /* mandatory map attributes */ | ||
| 45 | htab->map.key_size = attr->key_size; | ||
| 46 | htab->map.value_size = attr->value_size; | ||
| 47 | htab->map.max_entries = attr->max_entries; | ||
| 48 | |||
| 49 | /* check sanity of attributes. | ||
| 50 | * value_size == 0 may be allowed in the future to use map as a set | ||
| 51 | */ | ||
| 52 | err = -EINVAL; | ||
| 53 | if (htab->map.max_entries == 0 || htab->map.key_size == 0 || | ||
| 54 | htab->map.value_size == 0) | ||
| 55 | goto free_htab; | ||
| 56 | |||
| 57 | /* hash table size must be power of 2 */ | ||
| 58 | htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); | ||
| 59 | |||
| 60 | err = -E2BIG; | ||
| 61 | if (htab->map.key_size > MAX_BPF_STACK) | ||
| 62 | /* eBPF programs initialize keys on stack, so they cannot be | ||
| 63 | * larger than max stack size | ||
| 64 | */ | ||
| 65 | goto free_htab; | ||
| 66 | |||
| 67 | err = -ENOMEM; | ||
| 68 | /* prevent zero size kmalloc and check for u32 overflow */ | ||
| 69 | if (htab->n_buckets == 0 || | ||
| 70 | htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) | ||
| 71 | goto free_htab; | ||
| 72 | |||
| 73 | htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), | ||
| 74 | GFP_USER | __GFP_NOWARN); | ||
| 75 | |||
| 76 | if (!htab->buckets) { | ||
| 77 | htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); | ||
| 78 | if (!htab->buckets) | ||
| 79 | goto free_htab; | ||
| 80 | } | ||
| 81 | |||
| 82 | for (i = 0; i < htab->n_buckets; i++) | ||
| 83 | INIT_HLIST_HEAD(&htab->buckets[i]); | ||
| 84 | |||
| 85 | spin_lock_init(&htab->lock); | ||
| 86 | htab->count = 0; | ||
| 87 | |||
| 88 | htab->elem_size = sizeof(struct htab_elem) + | ||
| 89 | round_up(htab->map.key_size, 8) + | ||
| 90 | htab->map.value_size; | ||
| 91 | return &htab->map; | ||
| 92 | |||
| 93 | free_htab: | ||
| 94 | kfree(htab); | ||
| 95 | return ERR_PTR(err); | ||
| 96 | } | ||
| 97 | |||
| 98 | static inline u32 htab_map_hash(const void *key, u32 key_len) | ||
| 99 | { | ||
| 100 | return jhash(key, key_len, 0); | ||
| 101 | } | ||
| 102 | |||
| 103 | static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) | ||
| 104 | { | ||
| 105 | return &htab->buckets[hash & (htab->n_buckets - 1)]; | ||
| 106 | } | ||
| 107 | |||
| 108 | static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, | ||
| 109 | void *key, u32 key_size) | ||
| 110 | { | ||
| 111 | struct htab_elem *l; | ||
| 112 | |||
| 113 | hlist_for_each_entry_rcu(l, head, hash_node) | ||
| 114 | if (l->hash == hash && !memcmp(&l->key, key, key_size)) | ||
| 115 | return l; | ||
| 116 | |||
| 117 | return NULL; | ||
| 118 | } | ||
| 119 | |||
| 120 | /* Called from syscall or from eBPF program */ | ||
| 121 | static void *htab_map_lookup_elem(struct bpf_map *map, void *key) | ||
| 122 | { | ||
| 123 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 124 | struct hlist_head *head; | ||
| 125 | struct htab_elem *l; | ||
| 126 | u32 hash, key_size; | ||
| 127 | |||
| 128 | /* Must be called with rcu_read_lock. */ | ||
| 129 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 130 | |||
| 131 | key_size = map->key_size; | ||
| 132 | |||
| 133 | hash = htab_map_hash(key, key_size); | ||
| 134 | |||
| 135 | head = select_bucket(htab, hash); | ||
| 136 | |||
| 137 | l = lookup_elem_raw(head, hash, key, key_size); | ||
| 138 | |||
| 139 | if (l) | ||
| 140 | return l->key + round_up(map->key_size, 8); | ||
| 141 | |||
| 142 | return NULL; | ||
| 143 | } | ||
| 144 | |||
| 145 | /* Called from syscall */ | ||
| 146 | static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | ||
| 147 | { | ||
| 148 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 149 | struct hlist_head *head; | ||
| 150 | struct htab_elem *l, *next_l; | ||
| 151 | u32 hash, key_size; | ||
| 152 | int i; | ||
| 153 | |||
| 154 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 155 | |||
| 156 | key_size = map->key_size; | ||
| 157 | |||
| 158 | hash = htab_map_hash(key, key_size); | ||
| 159 | |||
| 160 | head = select_bucket(htab, hash); | ||
| 161 | |||
| 162 | /* lookup the key */ | ||
| 163 | l = lookup_elem_raw(head, hash, key, key_size); | ||
| 164 | |||
| 165 | if (!l) { | ||
| 166 | i = 0; | ||
| 167 | goto find_first_elem; | ||
| 168 | } | ||
| 169 | |||
| 170 | /* key was found, get next key in the same bucket */ | ||
| 171 | next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), | ||
| 172 | struct htab_elem, hash_node); | ||
| 173 | |||
| 174 | if (next_l) { | ||
| 175 | /* if next elem in this hash list is non-zero, just return it */ | ||
| 176 | memcpy(next_key, next_l->key, key_size); | ||
| 177 | return 0; | ||
| 178 | } | ||
| 179 | |||
| 180 | /* no more elements in this hash list, go to the next bucket */ | ||
| 181 | i = hash & (htab->n_buckets - 1); | ||
| 182 | i++; | ||
| 183 | |||
| 184 | find_first_elem: | ||
| 185 | /* iterate over buckets */ | ||
| 186 | for (; i < htab->n_buckets; i++) { | ||
| 187 | head = select_bucket(htab, i); | ||
| 188 | |||
| 189 | /* pick first element in the bucket */ | ||
| 190 | next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), | ||
| 191 | struct htab_elem, hash_node); | ||
| 192 | if (next_l) { | ||
| 193 | /* if it's not empty, just return it */ | ||
| 194 | memcpy(next_key, next_l->key, key_size); | ||
| 195 | return 0; | ||
| 196 | } | ||
| 197 | } | ||
| 198 | |||
| 199 | /* iterated over all buckets and all elements */ | ||
| 200 | return -ENOENT; | ||
| 201 | } | ||
| 202 | |||
| 203 | /* Called from syscall or from eBPF program */ | ||
| 204 | static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, | ||
| 205 | u64 map_flags) | ||
| 206 | { | ||
| 207 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 208 | struct htab_elem *l_new, *l_old; | ||
| 209 | struct hlist_head *head; | ||
| 210 | unsigned long flags; | ||
| 211 | u32 key_size; | ||
| 212 | int ret; | ||
| 213 | |||
| 214 | if (map_flags > BPF_EXIST) | ||
| 215 | /* unknown flags */ | ||
| 216 | return -EINVAL; | ||
| 217 | |||
| 218 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 219 | |||
| 220 | /* allocate new element outside of lock */ | ||
| 221 | l_new = kmalloc(htab->elem_size, GFP_ATOMIC); | ||
| 222 | if (!l_new) | ||
| 223 | return -ENOMEM; | ||
| 224 | |||
| 225 | key_size = map->key_size; | ||
| 226 | |||
| 227 | memcpy(l_new->key, key, key_size); | ||
| 228 | memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); | ||
| 229 | |||
| 230 | l_new->hash = htab_map_hash(l_new->key, key_size); | ||
| 231 | |||
| 232 | /* bpf_map_update_elem() can be called in_irq() */ | ||
| 233 | spin_lock_irqsave(&htab->lock, flags); | ||
| 234 | |||
| 235 | head = select_bucket(htab, l_new->hash); | ||
| 236 | |||
| 237 | l_old = lookup_elem_raw(head, l_new->hash, key, key_size); | ||
| 238 | |||
| 239 | if (!l_old && unlikely(htab->count >= map->max_entries)) { | ||
| 240 | /* if elem with this 'key' doesn't exist and we've reached | ||
| 241 | * max_entries limit, fail insertion of new elem | ||
| 242 | */ | ||
| 243 | ret = -E2BIG; | ||
| 244 | goto err; | ||
| 245 | } | ||
| 246 | |||
| 247 | if (l_old && map_flags == BPF_NOEXIST) { | ||
| 248 | /* elem already exists */ | ||
| 249 | ret = -EEXIST; | ||
| 250 | goto err; | ||
| 251 | } | ||
| 252 | |||
| 253 | if (!l_old && map_flags == BPF_EXIST) { | ||
| 254 | /* elem doesn't exist, cannot update it */ | ||
| 255 | ret = -ENOENT; | ||
| 256 | goto err; | ||
| 257 | } | ||
| 258 | |||
| 259 | /* add new element to the head of the list, so that concurrent | ||
| 260 | * search will find it before old elem | ||
| 261 | */ | ||
| 262 | hlist_add_head_rcu(&l_new->hash_node, head); | ||
| 263 | if (l_old) { | ||
| 264 | hlist_del_rcu(&l_old->hash_node); | ||
| 265 | kfree_rcu(l_old, rcu); | ||
| 266 | } else { | ||
| 267 | htab->count++; | ||
| 268 | } | ||
| 269 | spin_unlock_irqrestore(&htab->lock, flags); | ||
| 270 | |||
| 271 | return 0; | ||
| 272 | err: | ||
| 273 | spin_unlock_irqrestore(&htab->lock, flags); | ||
| 274 | kfree(l_new); | ||
| 275 | return ret; | ||
| 276 | } | ||
| 277 | |||
| 278 | /* Called from syscall or from eBPF program */ | ||
| 279 | static int htab_map_delete_elem(struct bpf_map *map, void *key) | ||
| 280 | { | ||
| 281 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 282 | struct hlist_head *head; | ||
| 283 | struct htab_elem *l; | ||
| 284 | unsigned long flags; | ||
| 285 | u32 hash, key_size; | ||
| 286 | int ret = -ENOENT; | ||
| 287 | |||
| 288 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 289 | |||
| 290 | key_size = map->key_size; | ||
| 291 | |||
| 292 | hash = htab_map_hash(key, key_size); | ||
| 293 | |||
| 294 | spin_lock_irqsave(&htab->lock, flags); | ||
| 295 | |||
| 296 | head = select_bucket(htab, hash); | ||
| 297 | |||
| 298 | l = lookup_elem_raw(head, hash, key, key_size); | ||
| 299 | |||
| 300 | if (l) { | ||
| 301 | hlist_del_rcu(&l->hash_node); | ||
| 302 | htab->count--; | ||
| 303 | kfree_rcu(l, rcu); | ||
| 304 | ret = 0; | ||
| 305 | } | ||
| 306 | |||
| 307 | spin_unlock_irqrestore(&htab->lock, flags); | ||
| 308 | return ret; | ||
| 309 | } | ||
| 310 | |||
| 311 | static void delete_all_elements(struct bpf_htab *htab) | ||
| 312 | { | ||
| 313 | int i; | ||
| 314 | |||
| 315 | for (i = 0; i < htab->n_buckets; i++) { | ||
| 316 | struct hlist_head *head = select_bucket(htab, i); | ||
| 317 | struct hlist_node *n; | ||
| 318 | struct htab_elem *l; | ||
| 319 | |||
| 320 | hlist_for_each_entry_safe(l, n, head, hash_node) { | ||
| 321 | hlist_del_rcu(&l->hash_node); | ||
| 322 | htab->count--; | ||
| 323 | kfree(l); | ||
| 324 | } | ||
| 325 | } | ||
| 326 | } | ||
| 327 | |||
| 328 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ | ||
| 329 | static void htab_map_free(struct bpf_map *map) | ||
| 330 | { | ||
| 331 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 332 | |||
| 333 | /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, | ||
| 334 | * so the programs (can be more than one that used this map) were | ||
| 335 | * disconnected from events. Wait for outstanding critical sections in | ||
| 336 | * these programs to complete | ||
| 337 | */ | ||
| 338 | synchronize_rcu(); | ||
| 339 | |||
| 340 | /* some of kfree_rcu() callbacks for elements of this map may not have | ||
| 341 | * executed. It's ok. Proceed to free residual elements and map itself | ||
| 342 | */ | ||
| 343 | delete_all_elements(htab); | ||
| 344 | kvfree(htab->buckets); | ||
| 345 | kfree(htab); | ||
| 346 | } | ||
| 347 | |||
| 348 | static struct bpf_map_ops htab_ops = { | ||
| 349 | .map_alloc = htab_map_alloc, | ||
| 350 | .map_free = htab_map_free, | ||
| 351 | .map_get_next_key = htab_map_get_next_key, | ||
| 352 | .map_lookup_elem = htab_map_lookup_elem, | ||
| 353 | .map_update_elem = htab_map_update_elem, | ||
| 354 | .map_delete_elem = htab_map_delete_elem, | ||
| 355 | }; | ||
| 356 | |||
| 357 | static struct bpf_map_type_list tl = { | ||
| 358 | .ops = &htab_ops, | ||
| 359 | .type = BPF_MAP_TYPE_HASH, | ||
| 360 | }; | ||
| 361 | |||
| 362 | static int __init register_htab_map(void) | ||
| 363 | { | ||
| 364 | bpf_register_map_type(&tl); | ||
| 365 | return 0; | ||
| 366 | } | ||
| 367 | late_initcall(register_htab_map); | ||
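The flag handling in htab_map_update_elem() (BPF_NOEXIST, BPF_EXIST, BPF_ANY) and the bucket walk in htab_map_get_next_key() are easiest to see from the syscall side. The sketch below is illustrative only, not taken from the patch, and makes the same assumptions as the array-map example earlier (uapi <linux/bpf.h>, a defined __NR_bpf, minimal error handling).

	/* Sketch: hash-map update flags and a full key walk via
	 * BPF_MAP_GET_NEXT_KEY. */
	#include <linux/bpf.h>
	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int bpf(int cmd, union bpf_attr *attr)
	{
		return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
	}

	static int map_update(int fd, uint32_t *key, uint64_t *val, uint64_t flags)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.map_fd = fd;
		attr.key = (uint64_t)(unsigned long)key;
		attr.value = (uint64_t)(unsigned long)val;
		attr.flags = flags;
		return bpf(BPF_MAP_UPDATE_ELEM, &attr);
	}

	int main(void)
	{
		union bpf_attr attr;
		uint32_t key = 1, next;
		uint64_t val = 100;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.map_type = BPF_MAP_TYPE_HASH;
		attr.key_size = sizeof(key);
		attr.value_size = sizeof(val);
		attr.max_entries = 8;
		fd = bpf(BPF_MAP_CREATE, &attr);
		if (fd < 0)
			return 1;

		/* first insert succeeds; a second BPF_NOEXIST insert hits the
		 * "elem already exists" path and fails with EEXIST */
		map_update(fd, &key, &val, BPF_NOEXIST);
		if (map_update(fd, &key, &val, BPF_NOEXIST) && errno == EEXIST)
			puts("duplicate insert rejected");

		/* BPF_EXIST on a missing key takes the -ENOENT path */
		key = 2;
		if (map_update(fd, &key, &val, BPF_EXIST) && errno == ENOENT)
			puts("update of missing key rejected");

		/* walk all keys: starting from a key that is not in the map makes
		 * get_next_key begin at the first bucket; -ENOENT ends the walk */
		key = 0;
		memset(&attr, 0, sizeof(attr));
		attr.map_fd = fd;
		attr.key = (uint64_t)(unsigned long)&key;
		attr.next_key = (uint64_t)(unsigned long)&next;
		while (bpf(BPF_MAP_GET_NEXT_KEY, &attr) == 0) {
			printf("key %u\n", next);
			key = next;
		}
		return 0;
	}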
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
| @@ -0,0 +1,89 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | ||
| 12 | #include <linux/bpf.h> | ||
| 13 | #include <linux/rcupdate.h> | ||
| 14 | |||
| 15 | /* If kernel subsystem is allowing eBPF programs to call this function, | ||
| 16 | * inside its own verifier_ops->get_func_proto() callback it should return | ||
| 17 | * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments | ||
| 18 | * | ||
| 19 | * Different map implementations will rely on rcu in map methods | ||
| 20 | * lookup/update/delete, therefore eBPF programs must run under rcu lock | ||
| 21 | * if program is allowed to access maps, so check rcu_read_lock_held in | ||
| 22 | * all three functions. | ||
| 23 | */ | ||
| 24 | static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 25 | { | ||
| 26 | /* verifier checked that R1 contains a valid pointer to bpf_map | ||
| 27 | * and R2 points to a program stack and map->key_size bytes were | ||
| 28 | * initialized | ||
| 29 | */ | ||
| 30 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 31 | void *key = (void *) (unsigned long) r2; | ||
| 32 | void *value; | ||
| 33 | |||
| 34 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 35 | |||
| 36 | value = map->ops->map_lookup_elem(map, key); | ||
| 37 | |||
| 38 | /* lookup() returns either pointer to element value or NULL | ||
| 39 | * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type | ||
| 40 | */ | ||
| 41 | return (unsigned long) value; | ||
| 42 | } | ||
| 43 | |||
| 44 | struct bpf_func_proto bpf_map_lookup_elem_proto = { | ||
| 45 | .func = bpf_map_lookup_elem, | ||
| 46 | .gpl_only = false, | ||
| 47 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | ||
| 48 | .arg1_type = ARG_CONST_MAP_PTR, | ||
| 49 | .arg2_type = ARG_PTR_TO_MAP_KEY, | ||
| 50 | }; | ||
| 51 | |||
| 52 | static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 53 | { | ||
| 54 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 55 | void *key = (void *) (unsigned long) r2; | ||
| 56 | void *value = (void *) (unsigned long) r3; | ||
| 57 | |||
| 58 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 59 | |||
| 60 | return map->ops->map_update_elem(map, key, value, r4); | ||
| 61 | } | ||
| 62 | |||
| 63 | struct bpf_func_proto bpf_map_update_elem_proto = { | ||
| 64 | .func = bpf_map_update_elem, | ||
| 65 | .gpl_only = false, | ||
| 66 | .ret_type = RET_INTEGER, | ||
| 67 | .arg1_type = ARG_CONST_MAP_PTR, | ||
| 68 | .arg2_type = ARG_PTR_TO_MAP_KEY, | ||
| 69 | .arg3_type = ARG_PTR_TO_MAP_VALUE, | ||
| 70 | .arg4_type = ARG_ANYTHING, | ||
| 71 | }; | ||
| 72 | |||
| 73 | static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 74 | { | ||
| 75 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 76 | void *key = (void *) (unsigned long) r2; | ||
| 77 | |||
| 78 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 79 | |||
| 80 | return map->ops->map_delete_elem(map, key); | ||
| 81 | } | ||
| 82 | |||
| 83 | struct bpf_func_proto bpf_map_delete_elem_proto = { | ||
| 84 | .func = bpf_map_delete_elem, | ||
| 85 | .gpl_only = false, | ||
| 86 | .ret_type = RET_INTEGER, | ||
| 87 | .arg1_type = ARG_CONST_MAP_PTR, | ||
| 88 | .arg2_type = ARG_PTR_TO_MAP_KEY, | ||
| 89 | }; | ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c16032..088ac0b1b106 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
| @@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 169 | if (copy_from_user(key, ukey, map->key_size) != 0) | 169 | if (copy_from_user(key, ukey, map->key_size) != 0) |
| 170 | goto free_key; | 170 | goto free_key; |
| 171 | 171 | ||
| 172 | err = -ESRCH; | 172 | err = -ENOENT; |
| 173 | rcu_read_lock(); | 173 | rcu_read_lock(); |
| 174 | value = map->ops->map_lookup_elem(map, key); | 174 | value = map->ops->map_lookup_elem(map, key); |
| 175 | if (!value) | 175 | if (!value) |
| @@ -190,7 +190,7 @@ err_put: | |||
| 190 | return err; | 190 | return err; |
| 191 | } | 191 | } |
| 192 | 192 | ||
| 193 | #define BPF_MAP_UPDATE_ELEM_LAST_FIELD value | 193 | #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags |
| 194 | 194 | ||
| 195 | static int map_update_elem(union bpf_attr *attr) | 195 | static int map_update_elem(union bpf_attr *attr) |
| 196 | { | 196 | { |
| @@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr) | |||
| 231 | * therefore all map accessors rely on this fact, so do the same here | 231 | * therefore all map accessors rely on this fact, so do the same here |
| 232 | */ | 232 | */ |
| 233 | rcu_read_lock(); | 233 | rcu_read_lock(); |
| 234 | err = map->ops->map_update_elem(map, key, value); | 234 | err = map->ops->map_update_elem(map, key, value, attr->flags); |
| 235 | rcu_read_unlock(); | 235 | rcu_read_unlock(); |
| 236 | 236 | ||
| 237 | free_value: | 237 | free_value: |
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff4003e..0ceae1e6e8b5 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
| @@ -18,26 +18,18 @@ struct bpf_context { | |||
| 18 | u64 arg2; | 18 | u64 arg2; |
| 19 | }; | 19 | }; |
| 20 | 20 | ||
| 21 | static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 22 | { | ||
| 23 | return 0; | ||
| 24 | } | ||
| 25 | |||
| 26 | static struct bpf_func_proto test_funcs[] = { | ||
| 27 | [BPF_FUNC_unspec] = { | ||
| 28 | .func = test_func, | ||
| 29 | .gpl_only = true, | ||
| 30 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | ||
| 31 | .arg1_type = ARG_CONST_MAP_PTR, | ||
| 32 | .arg2_type = ARG_PTR_TO_MAP_KEY, | ||
| 33 | }, | ||
| 34 | }; | ||
| 35 | |||
| 36 | static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) | 21 | static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) |
| 37 | { | 22 | { |
| 38 | if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs)) | 23 | switch (func_id) { |
| 24 | case BPF_FUNC_map_lookup_elem: | ||
| 25 | return &bpf_map_lookup_elem_proto; | ||
| 26 | case BPF_FUNC_map_update_elem: | ||
| 27 | return &bpf_map_update_elem_proto; | ||
| 28 | case BPF_FUNC_map_delete_elem: | ||
| 29 | return &bpf_map_delete_elem_proto; | ||
| 30 | default: | ||
| 39 | return NULL; | 31 | return NULL; |
| 40 | return &test_funcs[func_id]; | 32 | } |
| 41 | } | 33 | } |
| 42 | 34 | ||
| 43 | static const struct bpf_context_access { | 35 | static const struct bpf_context_access { |
| @@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = { | |||
| 78 | .type = BPF_PROG_TYPE_UNSPEC, | 70 | .type = BPF_PROG_TYPE_UNSPEC, |
| 79 | }; | 71 | }; |
| 80 | 72 | ||
| 81 | static struct bpf_map *test_map_alloc(union bpf_attr *attr) | ||
| 82 | { | ||
| 83 | struct bpf_map *map; | ||
| 84 | |||
| 85 | map = kzalloc(sizeof(*map), GFP_USER); | ||
| 86 | if (!map) | ||
| 87 | return ERR_PTR(-ENOMEM); | ||
| 88 | |||
| 89 | map->key_size = attr->key_size; | ||
| 90 | map->value_size = attr->value_size; | ||
| 91 | map->max_entries = attr->max_entries; | ||
| 92 | return map; | ||
| 93 | } | ||
| 94 | |||
| 95 | static void test_map_free(struct bpf_map *map) | ||
| 96 | { | ||
| 97 | kfree(map); | ||
| 98 | } | ||
| 99 | |||
| 100 | static struct bpf_map_ops test_map_ops = { | ||
| 101 | .map_alloc = test_map_alloc, | ||
| 102 | .map_free = test_map_free, | ||
| 103 | }; | ||
| 104 | |||
| 105 | static struct bpf_map_type_list tl_map = { | ||
| 106 | .ops = &test_map_ops, | ||
| 107 | .type = BPF_MAP_TYPE_UNSPEC, | ||
| 108 | }; | ||
| 109 | |||
| 110 | static int __init register_test_ops(void) | 73 | static int __init register_test_ops(void) |
| 111 | { | 74 | { |
| 112 | bpf_register_map_type(&tl_map); | ||
| 113 | bpf_register_prog_type(&tl_prog); | 75 | bpf_register_prog_type(&tl_prog); |
| 114 | return 0; | 76 | return 0; |
| 115 | } | 77 | } |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9f81818f2941..a28e09c7825d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
| @@ -153,22 +153,19 @@ struct reg_state { | |||
| 153 | 153 | ||
| 154 | enum bpf_stack_slot_type { | 154 | enum bpf_stack_slot_type { |
| 155 | STACK_INVALID, /* nothing was stored in this stack slot */ | 155 | STACK_INVALID, /* nothing was stored in this stack slot */ |
| 156 | STACK_SPILL, /* 1st byte of register spilled into stack */ | 156 | STACK_SPILL, /* register spilled into stack */ |
| 157 | STACK_SPILL_PART, /* other 7 bytes of register spill */ | ||
| 158 | STACK_MISC /* BPF program wrote some data into this slot */ | 157 | STACK_MISC /* BPF program wrote some data into this slot */ |
| 159 | }; | 158 | }; |
| 160 | 159 | ||
| 161 | struct bpf_stack_slot { | 160 | #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ |
| 162 | enum bpf_stack_slot_type stype; | ||
| 163 | struct reg_state reg_st; | ||
| 164 | }; | ||
| 165 | 161 | ||
| 166 | /* state of the program: | 162 | /* state of the program: |
| 167 | * type of all registers and stack info | 163 | * type of all registers and stack info |
| 168 | */ | 164 | */ |
| 169 | struct verifier_state { | 165 | struct verifier_state { |
| 170 | struct reg_state regs[MAX_BPF_REG]; | 166 | struct reg_state regs[MAX_BPF_REG]; |
| 171 | struct bpf_stack_slot stack[MAX_BPF_STACK]; | 167 | u8 stack_slot_type[MAX_BPF_STACK]; |
| 168 | struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; | ||
| 172 | }; | 169 | }; |
| 173 | 170 | ||
| 174 | /* linked list of verifier states used to prune search */ | 171 | /* linked list of verifier states used to prune search */ |
| @@ -259,10 +256,10 @@ static void print_verifier_state(struct verifier_env *env) | |||
| 259 | env->cur_state.regs[i].map_ptr->key_size, | 256 | env->cur_state.regs[i].map_ptr->key_size, |
| 260 | env->cur_state.regs[i].map_ptr->value_size); | 257 | env->cur_state.regs[i].map_ptr->value_size); |
| 261 | } | 258 | } |
| 262 | for (i = 0; i < MAX_BPF_STACK; i++) { | 259 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { |
| 263 | if (env->cur_state.stack[i].stype == STACK_SPILL) | 260 | if (env->cur_state.stack_slot_type[i] == STACK_SPILL) |
| 264 | verbose(" fp%d=%s", -MAX_BPF_STACK + i, | 261 | verbose(" fp%d=%s", -MAX_BPF_STACK + i, |
| 265 | reg_type_str[env->cur_state.stack[i].reg_st.type]); | 262 | reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); |
| 266 | } | 263 | } |
| 267 | verbose("\n"); | 264 | verbose("\n"); |
| 268 | } | 265 | } |
| @@ -539,8 +536,10 @@ static int bpf_size_to_bytes(int bpf_size) | |||
| 539 | static int check_stack_write(struct verifier_state *state, int off, int size, | 536 | static int check_stack_write(struct verifier_state *state, int off, int size, |
| 540 | int value_regno) | 537 | int value_regno) |
| 541 | { | 538 | { |
| 542 | struct bpf_stack_slot *slot; | ||
| 543 | int i; | 539 | int i; |
| 540 | /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, | ||
| 541 | * so it's aligned access and [off, off + size) are within stack limits | ||
| 542 | */ | ||
| 544 | 543 | ||
| 545 | if (value_regno >= 0 && | 544 | if (value_regno >= 0 && |
| 546 | (state->regs[value_regno].type == PTR_TO_MAP_VALUE || | 545 | (state->regs[value_regno].type == PTR_TO_MAP_VALUE || |
| @@ -548,30 +547,24 @@ static int check_stack_write(struct verifier_state *state, int off, int size, | |||
| 548 | state->regs[value_regno].type == PTR_TO_CTX)) { | 547 | state->regs[value_regno].type == PTR_TO_CTX)) { |
| 549 | 548 | ||
| 550 | /* register containing pointer is being spilled into stack */ | 549 | /* register containing pointer is being spilled into stack */ |
| 551 | if (size != 8) { | 550 | if (size != BPF_REG_SIZE) { |
| 552 | verbose("invalid size of register spill\n"); | 551 | verbose("invalid size of register spill\n"); |
| 553 | return -EACCES; | 552 | return -EACCES; |
| 554 | } | 553 | } |
| 555 | 554 | ||
| 556 | slot = &state->stack[MAX_BPF_STACK + off]; | ||
| 557 | slot->stype = STACK_SPILL; | ||
| 558 | /* save register state */ | 555 | /* save register state */ |
| 559 | slot->reg_st = state->regs[value_regno]; | 556 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = |
| 560 | for (i = 1; i < 8; i++) { | 557 | state->regs[value_regno]; |
| 561 | slot = &state->stack[MAX_BPF_STACK + off + i]; | ||
| 562 | slot->stype = STACK_SPILL_PART; | ||
| 563 | slot->reg_st.type = UNKNOWN_VALUE; | ||
| 564 | slot->reg_st.map_ptr = NULL; | ||
| 565 | } | ||
| 566 | } else { | ||
| 567 | 558 | ||
| 559 | for (i = 0; i < BPF_REG_SIZE; i++) | ||
| 560 | state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; | ||
| 561 | } else { | ||
| 568 | /* regular write of data into stack */ | 562 | /* regular write of data into stack */ |
| 569 | for (i = 0; i < size; i++) { | 563 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = |
| 570 | slot = &state->stack[MAX_BPF_STACK + off + i]; | 564 | (struct reg_state) {}; |
| 571 | slot->stype = STACK_MISC; | 565 | |
| 572 | slot->reg_st.type = UNKNOWN_VALUE; | 566 | for (i = 0; i < size; i++) |
| 573 | slot->reg_st.map_ptr = NULL; | 567 | state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; |
| 574 | } | ||
| 575 | } | 568 | } |
| 576 | return 0; | 569 | return 0; |
| 577 | } | 570 | } |
| @@ -579,19 +572,18 @@ static int check_stack_write(struct verifier_state *state, int off, int size, | |||
| 579 | static int check_stack_read(struct verifier_state *state, int off, int size, | 572 | static int check_stack_read(struct verifier_state *state, int off, int size, |
| 580 | int value_regno) | 573 | int value_regno) |
| 581 | { | 574 | { |
| 575 | u8 *slot_type; | ||
| 582 | int i; | 576 | int i; |
| 583 | struct bpf_stack_slot *slot; | ||
| 584 | 577 | ||
| 585 | slot = &state->stack[MAX_BPF_STACK + off]; | 578 | slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; |
| 586 | 579 | ||
| 587 | if (slot->stype == STACK_SPILL) { | 580 | if (slot_type[0] == STACK_SPILL) { |
| 588 | if (size != 8) { | 581 | if (size != BPF_REG_SIZE) { |
| 589 | verbose("invalid size of register spill\n"); | 582 | verbose("invalid size of register spill\n"); |
| 590 | return -EACCES; | 583 | return -EACCES; |
| 591 | } | 584 | } |
| 592 | for (i = 1; i < 8; i++) { | 585 | for (i = 1; i < BPF_REG_SIZE; i++) { |
| 593 | if (state->stack[MAX_BPF_STACK + off + i].stype != | 586 | if (slot_type[i] != STACK_SPILL) { |
| 594 | STACK_SPILL_PART) { | ||
| 595 | verbose("corrupted spill memory\n"); | 587 | verbose("corrupted spill memory\n"); |
| 596 | return -EACCES; | 588 | return -EACCES; |
| 597 | } | 589 | } |
| @@ -599,12 +591,12 @@ static int check_stack_read(struct verifier_state *state, int off, int size, | |||
| 599 | 591 | ||
| 600 | if (value_regno >= 0) | 592 | if (value_regno >= 0) |
| 601 | /* restore register state from stack */ | 593 | /* restore register state from stack */ |
| 602 | state->regs[value_regno] = slot->reg_st; | 594 | state->regs[value_regno] = |
| 595 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE]; | ||
| 603 | return 0; | 596 | return 0; |
| 604 | } else { | 597 | } else { |
| 605 | for (i = 0; i < size; i++) { | 598 | for (i = 0; i < size; i++) { |
| 606 | if (state->stack[MAX_BPF_STACK + off + i].stype != | 599 | if (slot_type[i] != STACK_MISC) { |
| 607 | STACK_MISC) { | ||
| 608 | verbose("invalid read from stack off %d+%d size %d\n", | 600 | verbose("invalid read from stack off %d+%d size %d\n", |
| 609 | off, i, size); | 601 | off, i, size); |
| 610 | return -EACCES; | 602 | return -EACCES; |
| @@ -747,7 +739,7 @@ static int check_stack_boundary(struct verifier_env *env, | |||
| 747 | } | 739 | } |
| 748 | 740 | ||
| 749 | for (i = 0; i < access_size; i++) { | 741 | for (i = 0; i < access_size; i++) { |
| 750 | if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) { | 742 | if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { |
| 751 | verbose("invalid indirect read from stack off %d+%d size %d\n", | 743 | verbose("invalid indirect read from stack off %d+%d size %d\n", |
| 752 | off, i, access_size); | 744 | off, i, access_size); |
| 753 | return -EACCES; | 745 | return -EACCES; |
| @@ -1180,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1180 | return 0; | 1172 | return 0; |
| 1181 | } | 1173 | } |
| 1182 | 1174 | ||
| 1175 | /* verify safety of LD_ABS|LD_IND instructions: | ||
| 1176 | * - they can only appear in the programs where ctx == skb | ||
| 1177 | * - since they are wrappers of function calls, they scratch R1-R5 registers, | ||
| 1178 | * preserve R6-R9, and store return value into R0 | ||
| 1179 | * | ||
| 1180 | * Implicit input: | ||
| 1181 | * ctx == skb == R6 == CTX | ||
| 1182 | * | ||
| 1183 | * Explicit input: | ||
| 1184 | * SRC == any register | ||
| 1185 | * IMM == 32-bit immediate | ||
| 1186 | * | ||
| 1187 | * Output: | ||
| 1188 | * R0 - 8/16/32-bit skb data converted to cpu endianness | ||
| 1189 | */ | ||
| 1190 | static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) | ||
| 1191 | { | ||
| 1192 | struct reg_state *regs = env->cur_state.regs; | ||
| 1193 | u8 mode = BPF_MODE(insn->code); | ||
| 1194 | struct reg_state *reg; | ||
| 1195 | int i, err; | ||
| 1196 | |||
| 1197 | if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { | ||
| 1198 | verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); | ||
| 1199 | return -EINVAL; | ||
| 1200 | } | ||
| 1201 | |||
| 1202 | if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || | ||
| 1203 | (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { | ||
| 1204 | verbose("BPF_LD_ABS uses reserved fields\n"); | ||
| 1205 | return -EINVAL; | ||
| 1206 | } | ||
| 1207 | |||
| 1208 | /* check whether implicit source operand (register R6) is readable */ | ||
| 1209 | err = check_reg_arg(regs, BPF_REG_6, SRC_OP); | ||
| 1210 | if (err) | ||
| 1211 | return err; | ||
| 1212 | |||
| 1213 | if (regs[BPF_REG_6].type != PTR_TO_CTX) { | ||
| 1214 | verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); | ||
| 1215 | return -EINVAL; | ||
| 1216 | } | ||
| 1217 | |||
| 1218 | if (mode == BPF_IND) { | ||
| 1219 | /* check explicit source operand */ | ||
| 1220 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 1221 | if (err) | ||
| 1222 | return err; | ||
| 1223 | } | ||
| 1224 | |||
| 1225 | /* reset caller saved regs to unreadable */ | ||
| 1226 | for (i = 0; i < CALLER_SAVED_REGS; i++) { | ||
| 1227 | reg = regs + caller_saved[i]; | ||
| 1228 | reg->type = NOT_INIT; | ||
| 1229 | reg->imm = 0; | ||
| 1230 | } | ||
| 1231 | |||
| 1232 | /* mark destination R0 register as readable, since it contains | ||
| 1233 | * the value fetched from the packet | ||
| 1234 | */ | ||
| 1235 | regs[BPF_REG_0].type = UNKNOWN_VALUE; | ||
| 1236 | return 0; | ||
| 1237 | } | ||
| 1238 | |||
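A minimal sketch of the program shape check_ld_abs() now accepts (illustrative, not part of this patch; macro names as in include/linux/filter.h): R6 must already hold the skb context, and the fetched bytes land in R0.

	#include <linux/filter.h>

	/* Illustrative socket-filter fragment: load the 16-bit value at
	 * skb offset 12 (the ethertype of an Ethernet frame) and return
	 * it as the filter verdict.  R6 = ctx is the implicit input that
	 * check_ld_abs() verifies.
	 */
	static const struct bpf_insn ld_abs_example[] = {
		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),	/* R6 = ctx (skb) */
		BPF_LD_ABS(BPF_H, 12),			/* R0 = 16 bits at skb+12 */
		BPF_EXIT_INSN(),			/* return R0 */
	};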
| 1183 | /* non-recursive DFS pseudo code | 1239 | /* non-recursive DFS pseudo code |
| 1184 | * 1 procedure DFS-iterative(G,v): | 1240 | * 1 procedure DFS-iterative(G,v): |
| 1185 | * 2 label v as discovered | 1241 | * 2 label v as discovered |
| @@ -1417,12 +1473,33 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | |||
| 1417 | } | 1473 | } |
| 1418 | 1474 | ||
| 1419 | for (i = 0; i < MAX_BPF_STACK; i++) { | 1475 | for (i = 0; i < MAX_BPF_STACK; i++) { |
| 1420 | if (memcmp(&old->stack[i], &cur->stack[i], | 1476 | if (old->stack_slot_type[i] == STACK_INVALID) |
| 1421 | sizeof(old->stack[0])) != 0) { | 1477 | continue; |
| 1422 | if (old->stack[i].stype == STACK_INVALID) | 1478 | if (old->stack_slot_type[i] != cur->stack_slot_type[i]) |
| 1423 | continue; | 1479 | /* Ex: old explored (safe) state has STACK_SPILL in |
| 1480 | * this stack slot, but current has STACK_MISC -> ||
| 1481 | * these verifier states are not equivalent, ||
| 1482 | * return false to continue verification of this path | ||
| 1483 | */ | ||
| 1424 | return false; | 1484 | return false; |
| 1425 | } | 1485 | if (i % BPF_REG_SIZE) |
| 1486 | continue; | ||
| 1487 | if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], | ||
| 1488 | &cur->spilled_regs[i / BPF_REG_SIZE], | ||
| 1489 | sizeof(old->spilled_regs[0]))) | ||
| 1490 | /* when explored and current stack slot types are | ||
| 1491 | * the same, check that stored pointers types | ||
| 1492 | * are the same as well. | ||
| 1493 | * Ex: explored safe path could have stored | ||
| 1494 | * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} | ||
| 1495 | * but current path has stored: | ||
| 1496 | * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} | ||
| 1497 | * such verifier states are not equivalent. | ||
| 1498 | * return false to continue verification of this path | ||
| 1499 | */ | ||
| 1500 | return false; | ||
| 1501 | else | ||
| 1502 | continue; | ||
| 1426 | } | 1503 | } |
| 1427 | return true; | 1504 | return true; |
| 1428 | } | 1505 | } |
| @@ -1664,8 +1741,10 @@ process_bpf_exit: | |||
| 1664 | u8 mode = BPF_MODE(insn->code); | 1741 | u8 mode = BPF_MODE(insn->code); |
| 1665 | 1742 | ||
| 1666 | if (mode == BPF_ABS || mode == BPF_IND) { | 1743 | if (mode == BPF_ABS || mode == BPF_IND) { |
| 1667 | verbose("LD_ABS is not supported yet\n"); | 1744 | err = check_ld_abs(env, insn); |
| 1668 | return -EINVAL; | 1745 | if (err) |
| 1746 | return err; | ||
| 1747 | |||
| 1669 | } else if (mode == BPF_IMM) { | 1748 | } else if (mode == BPF_IMM) { |
| 1670 | err = check_ld_imm(env, insn); | 1749 | err = check_ld_imm(env, insn); |
| 1671 | if (err) | 1750 | if (err) |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 136eceadeed1..04cfe8ace520 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -277,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, | |||
| 277 | if (!(cgrp->root->subsys_mask & (1 << ss->id))) | 277 | if (!(cgrp->root->subsys_mask & (1 << ss->id))) |
| 278 | return NULL; | 278 | return NULL; |
| 279 | 279 | ||
| 280 | /* | ||
| 281 | * This function is used while updating css associations and thus | ||
| 282 | * can't test the csses directly. Use ->child_subsys_mask. | ||
| 283 | */ | ||
| 280 | while (cgroup_parent(cgrp) && | 284 | while (cgroup_parent(cgrp) && |
| 281 | !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) | 285 | !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) |
| 282 | cgrp = cgroup_parent(cgrp); | 286 | cgrp = cgroup_parent(cgrp); |
| @@ -284,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, | |||
| 284 | return cgroup_css(cgrp, ss); | 288 | return cgroup_css(cgrp, ss); |
| 285 | } | 289 | } |
| 286 | 290 | ||
| 291 | /** | ||
| 292 | * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem | ||
| 293 | * @cgrp: the cgroup of interest | ||
| 294 | * @ss: the subsystem of interest | ||
| 295 | * | ||
| 296 | * Find and get the effective css of @cgrp for @ss. The effective css is | ||
| 297 | * defined as the matching css of the nearest ancestor including self which | ||
| 298 | * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, | ||
| 299 | * the root css is returned, so this function always returns a valid css. | ||
| 300 | * The returned css must be put using css_put(). | ||
| 301 | */ | ||
| 302 | struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, | ||
| 303 | struct cgroup_subsys *ss) | ||
| 304 | { | ||
| 305 | struct cgroup_subsys_state *css; | ||
| 306 | |||
| 307 | rcu_read_lock(); | ||
| 308 | |||
| 309 | do { | ||
| 310 | css = cgroup_css(cgrp, ss); | ||
| 311 | |||
| 312 | if (css && css_tryget_online(css)) | ||
| 313 | goto out_unlock; | ||
| 314 | cgrp = cgroup_parent(cgrp); | ||
| 315 | } while (cgrp); | ||
| 316 | |||
| 317 | css = init_css_set.subsys[ss->id]; | ||
| 318 | css_get(css); | ||
| 319 | out_unlock: | ||
| 320 | rcu_read_unlock(); | ||
| 321 | return css; | ||
| 322 | } | ||
| 323 | |||
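A short usage sketch for the new helper; the memory controller is picked only as an example, and the caller context is assumed:

	/* Hedged sketch: resolve and use the effective memory css of @cgrp. */
	static void example_use_e_css(struct cgroup *cgrp)
	{
		struct cgroup_subsys_state *css;

		/* always returns a valid css, falling back to the root css */
		css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
		/* ... use css ... */
		css_put(css);
	}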
| 287 | /* convenient tests for these bits */ | 324 | /* convenient tests for these bits */ |
| 288 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) | 325 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) |
| 289 | { | 326 | { |
| @@ -1019,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp) | |||
| 1019 | } | 1056 | } |
| 1020 | 1057 | ||
| 1021 | /** | 1058 | /** |
| 1022 | * cgroup_refresh_child_subsys_mask - update child_subsys_mask | 1059 | * cgroup_calc_child_subsys_mask - calculate child_subsys_mask |
| 1023 | * @cgrp: the target cgroup | 1060 | * @cgrp: the target cgroup |
| 1061 | * @subtree_control: the new subtree_control mask to consider | ||
| 1024 | * | 1062 | * |
| 1025 | * On the default hierarchy, a subsystem may request other subsystems to be | 1063 | * On the default hierarchy, a subsystem may request other subsystems to be |
| 1026 | * enabled together through its ->depends_on mask. In such cases, more | 1064 | * enabled together through its ->depends_on mask. In such cases, more |
| 1027 | * subsystems than specified in "cgroup.subtree_control" may be enabled. | 1065 | * subsystems than specified in "cgroup.subtree_control" may be enabled. |
| 1028 | * | 1066 | * |
| 1029 | * This function determines which subsystems need to be enabled given the | 1067 | * This function calculates which subsystems need to be enabled if |
| 1030 | * current @cgrp->subtree_control and records it in | 1068 | * @subtree_control is to be applied to @cgrp. The returned mask is always |
| 1031 | * @cgrp->child_subsys_mask. The resulting mask is always a superset of | 1069 | * a superset of @subtree_control and follows the usual hierarchy rules. |
| 1032 | * @cgrp->subtree_control and follows the usual hierarchy rules. | ||
| 1033 | */ | 1070 | */ |
| 1034 | static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) | 1071 | static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, |
| 1072 | unsigned int subtree_control) | ||
| 1035 | { | 1073 | { |
| 1036 | struct cgroup *parent = cgroup_parent(cgrp); | 1074 | struct cgroup *parent = cgroup_parent(cgrp); |
| 1037 | unsigned int cur_ss_mask = cgrp->subtree_control; | 1075 | unsigned int cur_ss_mask = subtree_control; |
| 1038 | struct cgroup_subsys *ss; | 1076 | struct cgroup_subsys *ss; |
| 1039 | int ssid; | 1077 | int ssid; |
| 1040 | 1078 | ||
| 1041 | lockdep_assert_held(&cgroup_mutex); | 1079 | lockdep_assert_held(&cgroup_mutex); |
| 1042 | 1080 | ||
| 1043 | if (!cgroup_on_dfl(cgrp)) { | 1081 | if (!cgroup_on_dfl(cgrp)) |
| 1044 | cgrp->child_subsys_mask = cur_ss_mask; | 1082 | return cur_ss_mask; |
| 1045 | return; | ||
| 1046 | } | ||
| 1047 | 1083 | ||
| 1048 | while (true) { | 1084 | while (true) { |
| 1049 | unsigned int new_ss_mask = cur_ss_mask; | 1085 | unsigned int new_ss_mask = cur_ss_mask; |
| @@ -1067,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) | |||
| 1067 | cur_ss_mask = new_ss_mask; | 1103 | cur_ss_mask = new_ss_mask; |
| 1068 | } | 1104 | } |
| 1069 | 1105 | ||
| 1070 | cgrp->child_subsys_mask = cur_ss_mask; | 1106 | return cur_ss_mask; |
| 1107 | } | ||
| 1108 | |||
| 1109 | /** | ||
| 1110 | * cgroup_refresh_child_subsys_mask - update child_subsys_mask | ||
| 1111 | * @cgrp: the target cgroup | ||
| 1112 | * | ||
| 1113 | * Update @cgrp->child_subsys_mask according to the current | ||
| 1114 | * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). | ||
| 1115 | */ | ||
| 1116 | static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) | ||
| 1117 | { | ||
| 1118 | cgrp->child_subsys_mask = | ||
| 1119 | cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); | ||
| 1071 | } | 1120 | } |
| 1072 | 1121 | ||
| 1073 | /** | 1122 | /** |
| @@ -1860,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb) | |||
| 1860 | * | 1909 | * |
| 1861 | * And don't kill the default root. | 1910 | * And don't kill the default root. |
| 1862 | */ | 1911 | */ |
| 1863 | if (css_has_online_children(&root->cgrp.self) || | 1912 | if (!list_empty(&root->cgrp.self.children) || |
| 1864 | root == &cgrp_dfl_root) | 1913 | root == &cgrp_dfl_root) |
| 1865 | cgroup_put(&root->cgrp); | 1914 | cgroup_put(&root->cgrp); |
| 1866 | else | 1915 | else |
| @@ -2641,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
| 2641 | loff_t off) | 2690 | loff_t off) |
| 2642 | { | 2691 | { |
| 2643 | unsigned int enable = 0, disable = 0; | 2692 | unsigned int enable = 0, disable = 0; |
| 2644 | unsigned int css_enable, css_disable, old_ctrl, new_ctrl; | 2693 | unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; |
| 2645 | struct cgroup *cgrp, *child; | 2694 | struct cgroup *cgrp, *child; |
| 2646 | struct cgroup_subsys *ss; | 2695 | struct cgroup_subsys *ss; |
| 2647 | char *tok; | 2696 | char *tok; |
| @@ -2693,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
| 2693 | ret = -ENOENT; | 2742 | ret = -ENOENT; |
| 2694 | goto out_unlock; | 2743 | goto out_unlock; |
| 2695 | } | 2744 | } |
| 2696 | |||
| 2697 | /* | ||
| 2698 | * @ss is already enabled through dependency and | ||
| 2699 | * we'll just make it visible. Skip draining. | ||
| 2700 | */ | ||
| 2701 | if (cgrp->child_subsys_mask & (1 << ssid)) | ||
| 2702 | continue; | ||
| 2703 | |||
| 2704 | /* | ||
| 2705 | * Because css offlining is asynchronous, userland | ||
| 2706 | * might try to re-enable the same controller while | ||
| 2707 | * the previous instance is still around. In such | ||
| 2708 | * cases, wait till it's gone using offline_waitq. | ||
| 2709 | */ | ||
| 2710 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2711 | DEFINE_WAIT(wait); | ||
| 2712 | |||
| 2713 | if (!cgroup_css(child, ss)) | ||
| 2714 | continue; | ||
| 2715 | |||
| 2716 | cgroup_get(child); | ||
| 2717 | prepare_to_wait(&child->offline_waitq, &wait, | ||
| 2718 | TASK_UNINTERRUPTIBLE); | ||
| 2719 | cgroup_kn_unlock(of->kn); | ||
| 2720 | schedule(); | ||
| 2721 | finish_wait(&child->offline_waitq, &wait); | ||
| 2722 | cgroup_put(child); | ||
| 2723 | |||
| 2724 | return restart_syscall(); | ||
| 2725 | } | ||
| 2726 | } else if (disable & (1 << ssid)) { | 2745 | } else if (disable & (1 << ssid)) { |
| 2727 | if (!(cgrp->subtree_control & (1 << ssid))) { | 2746 | if (!(cgrp->subtree_control & (1 << ssid))) { |
| 2728 | disable &= ~(1 << ssid); | 2747 | disable &= ~(1 << ssid); |
| @@ -2758,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
| 2758 | * subsystems than specified may need to be enabled or disabled | 2777 | * subsystems than specified may need to be enabled or disabled |
| 2759 | * depending on subsystem dependencies. | 2778 | * depending on subsystem dependencies. |
| 2760 | */ | 2779 | */ |
| 2761 | cgrp->subtree_control |= enable; | 2780 | old_sc = cgrp->subtree_control; |
| 2762 | cgrp->subtree_control &= ~disable; | 2781 | old_ss = cgrp->child_subsys_mask; |
| 2782 | new_sc = (old_sc | enable) & ~disable; | ||
| 2783 | new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); | ||
| 2763 | 2784 | ||
| 2764 | old_ctrl = cgrp->child_subsys_mask; | 2785 | css_enable = ~old_ss & new_ss; |
| 2765 | cgroup_refresh_child_subsys_mask(cgrp); | 2786 | css_disable = old_ss & ~new_ss; |
| 2766 | new_ctrl = cgrp->child_subsys_mask; | ||
| 2767 | |||
| 2768 | css_enable = ~old_ctrl & new_ctrl; | ||
| 2769 | css_disable = old_ctrl & ~new_ctrl; | ||
| 2770 | enable |= css_enable; | 2787 | enable |= css_enable; |
| 2771 | disable |= css_disable; | 2788 | disable |= css_disable; |
| 2772 | 2789 | ||
| 2773 | /* | 2790 | /* |
| 2791 | * Because css offlining is asynchronous, userland might try to | ||
| 2792 | * re-enable the same controller while the previous instance is | ||
| 2793 | * still around. In such cases, wait till it's gone using | ||
| 2794 | * offline_waitq. | ||
| 2795 | */ | ||
| 2796 | for_each_subsys(ss, ssid) { | ||
| 2797 | if (!(css_enable & (1 << ssid))) | ||
| 2798 | continue; | ||
| 2799 | |||
| 2800 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2801 | DEFINE_WAIT(wait); | ||
| 2802 | |||
| 2803 | if (!cgroup_css(child, ss)) | ||
| 2804 | continue; | ||
| 2805 | |||
| 2806 | cgroup_get(child); | ||
| 2807 | prepare_to_wait(&child->offline_waitq, &wait, | ||
| 2808 | TASK_UNINTERRUPTIBLE); | ||
| 2809 | cgroup_kn_unlock(of->kn); | ||
| 2810 | schedule(); | ||
| 2811 | finish_wait(&child->offline_waitq, &wait); | ||
| 2812 | cgroup_put(child); | ||
| 2813 | |||
| 2814 | return restart_syscall(); | ||
| 2815 | } | ||
| 2816 | } | ||
| 2817 | |||
| 2818 | cgrp->subtree_control = new_sc; | ||
| 2819 | cgrp->child_subsys_mask = new_ss; | ||
| 2820 | |||
| 2821 | /* | ||
| 2774 | * Create new csses or make the existing ones visible. A css is | 2822 | * Create new csses or make the existing ones visible. A css is |
| 2775 | * created invisible if it's being implicitly enabled through | 2823 | * created invisible if it's being implicitly enabled through |
| 2776 | * dependency. An invisible css is made visible when the userland | 2824 | * dependency. An invisible css is made visible when the userland |
| @@ -2825,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
| 2825 | } | 2873 | } |
| 2826 | } | 2874 | } |
| 2827 | 2875 | ||
| 2876 | /* | ||
| 2877 | * The effective csses of all the descendants (excluding @cgrp) may | ||
| 2878 | * have changed. Subsystems can optionally subscribe to this event | ||
| 2879 | * by implementing ->css_e_css_changed() which is invoked if any of | ||
| 2880 | * the effective csses seen from the css's cgroup may have changed. | ||
| 2881 | */ | ||
| 2882 | for_each_subsys(ss, ssid) { | ||
| 2883 | struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); | ||
| 2884 | struct cgroup_subsys_state *css; | ||
| 2885 | |||
| 2886 | if (!ss->css_e_css_changed || !this_css) | ||
| 2887 | continue; | ||
| 2888 | |||
| 2889 | css_for_each_descendant_pre(css, this_css) | ||
| 2890 | if (css != this_css) | ||
| 2891 | ss->css_e_css_changed(css); | ||
| 2892 | } | ||
| 2893 | |||
| 2828 | kernfs_activate(cgrp->kn); | 2894 | kernfs_activate(cgrp->kn); |
| 2829 | ret = 0; | 2895 | ret = 0; |
| 2830 | out_unlock: | 2896 | out_unlock: |
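To show how the notification above is meant to be consumed, a controller could hook it as follows; the callback body is a hypothetical sketch, not code from this series:

	/* Hedged sketch of a ->css_e_css_changed() subscriber. */
	static void example_css_e_css_changed(struct cgroup_subsys_state *css)
	{
		/*
		 * Some effective css visible from css->cgroup may have
		 * changed; re-resolve any cached association, for example
		 * with cgroup_get_e_css(), and update per-css state.
		 */
	}

	/* registered in the controller's cgroup_subsys definition:
	 *	.css_e_css_changed = example_css_e_css_changed,
	 */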
| @@ -2832,9 +2898,8 @@ out_unlock: | |||
| 2832 | return ret ?: nbytes; | 2898 | return ret ?: nbytes; |
| 2833 | 2899 | ||
| 2834 | err_undo_css: | 2900 | err_undo_css: |
| 2835 | cgrp->subtree_control &= ~enable; | 2901 | cgrp->subtree_control = old_sc; |
| 2836 | cgrp->subtree_control |= disable; | 2902 | cgrp->child_subsys_mask = old_ss; |
| 2837 | cgroup_refresh_child_subsys_mask(cgrp); | ||
| 2838 | 2903 | ||
| 2839 | for_each_subsys(ss, ssid) { | 2904 | for_each_subsys(ss, ssid) { |
| 2840 | if (!(enable & (1 << ssid))) | 2905 | if (!(enable & (1 << ssid))) |
| @@ -4370,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work) | |||
| 4370 | if (ss) { | 4435 | if (ss) { |
| 4371 | /* css release path */ | 4436 | /* css release path */ |
| 4372 | cgroup_idr_remove(&ss->css_idr, css->id); | 4437 | cgroup_idr_remove(&ss->css_idr, css->id); |
| 4438 | if (ss->css_released) | ||
| 4439 | ss->css_released(css); | ||
| 4373 | } else { | 4440 | } else { |
| 4374 | /* cgroup release path */ | 4441 | /* cgroup release path */ |
| 4375 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | 4442 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 90a3d017b90c..5d220234b3ca 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -86,6 +86,16 @@ static struct { | |||
| 86 | #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) | 86 | #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) |
| 87 | #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) | 87 | #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) |
| 88 | 88 | ||
| 89 | static void apply_puts_pending(int max) | ||
| 90 | { | ||
| 91 | int delta; | ||
| 92 | |||
| 93 | if (atomic_read(&cpu_hotplug.puts_pending) >= max) { | ||
| 94 | delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); | ||
| 95 | cpu_hotplug.refcount -= delta; | ||
| 96 | } | ||
| 97 | } | ||
| 98 | |||
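For orientation, the counter folded back here is fed by the put path; a simplified illustration of that side (a sketch of existing behaviour, not code from this hunk):

	/*
	 * Simplified sketch: a reader that cannot take cpu_hotplug.lock
	 * defers its decrement into puts_pending; apply_puts_pending()
	 * later folds the batch into refcount under the lock, with a high
	 * threshold on the reader fast paths and a threshold of 1 in
	 * cpu_hotplug_begin() so refcount can drain to zero.
	 */
	static void put_online_cpus_deferred_sketch(void)
	{
		if (!mutex_trylock(&cpu_hotplug.lock)) {
			atomic_inc(&cpu_hotplug.puts_pending);
			return;
		}
		cpu_hotplug.refcount--;
		mutex_unlock(&cpu_hotplug.lock);
	}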
| 89 | void get_online_cpus(void) | 99 | void get_online_cpus(void) |
| 90 | { | 100 | { |
| 91 | might_sleep(); | 101 | might_sleep(); |
| @@ -93,6 +103,7 @@ void get_online_cpus(void) | |||
| 93 | return; | 103 | return; |
| 94 | cpuhp_lock_acquire_read(); | 104 | cpuhp_lock_acquire_read(); |
| 95 | mutex_lock(&cpu_hotplug.lock); | 105 | mutex_lock(&cpu_hotplug.lock); |
| 106 | apply_puts_pending(65536); | ||
| 96 | cpu_hotplug.refcount++; | 107 | cpu_hotplug.refcount++; |
| 97 | mutex_unlock(&cpu_hotplug.lock); | 108 | mutex_unlock(&cpu_hotplug.lock); |
| 98 | } | 109 | } |
| @@ -105,6 +116,7 @@ bool try_get_online_cpus(void) | |||
| 105 | if (!mutex_trylock(&cpu_hotplug.lock)) | 116 | if (!mutex_trylock(&cpu_hotplug.lock)) |
| 106 | return false; | 117 | return false; |
| 107 | cpuhp_lock_acquire_tryread(); | 118 | cpuhp_lock_acquire_tryread(); |
| 119 | apply_puts_pending(65536); | ||
| 108 | cpu_hotplug.refcount++; | 120 | cpu_hotplug.refcount++; |
| 109 | mutex_unlock(&cpu_hotplug.lock); | 121 | mutex_unlock(&cpu_hotplug.lock); |
| 110 | return true; | 122 | return true; |
| @@ -161,12 +173,7 @@ void cpu_hotplug_begin(void) | |||
| 161 | cpuhp_lock_acquire(); | 173 | cpuhp_lock_acquire(); |
| 162 | for (;;) { | 174 | for (;;) { |
| 163 | mutex_lock(&cpu_hotplug.lock); | 175 | mutex_lock(&cpu_hotplug.lock); |
| 164 | if (atomic_read(&cpu_hotplug.puts_pending)) { | 176 | apply_puts_pending(1); |
| 165 | int delta; | ||
| 166 | |||
| 167 | delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); | ||
| 168 | cpu_hotplug.refcount -= delta; | ||
| 169 | } | ||
| 170 | if (likely(!cpu_hotplug.refcount)) | 177 | if (likely(!cpu_hotplug.refcount)) |
| 171 | break; | 178 | break; |
| 172 | __set_current_state(TASK_UNINTERRUPTIBLE); | 179 | __set_current_state(TASK_UNINTERRUPTIBLE); |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1f107c74087b..64b257f6bca2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -248,34 +248,34 @@ static struct cpuset top_cpuset = { | |||
| 248 | if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) | 248 | if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) |
| 249 | 249 | ||
| 250 | /* | 250 | /* |
| 251 | * There are two global mutexes guarding cpuset structures - cpuset_mutex | 251 | * There are two global locks guarding cpuset structures - cpuset_mutex and |
| 252 | * and callback_mutex. The latter may nest inside the former. We also | 252 | * callback_lock. We also require taking task_lock() when dereferencing a |
| 253 | * require taking task_lock() when dereferencing a task's cpuset pointer. | 253 | * task's cpuset pointer. See "The task_lock() exception", at the end of this |
| 254 | * See "The task_lock() exception", at the end of this comment. | 254 | * comment. |
| 255 | * | 255 | * |
| 256 | * A task must hold both mutexes to modify cpusets. If a task holds | 256 | * A task must hold both locks to modify cpusets. If a task holds |
| 257 | * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it | 257 | * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it |
| 258 | * is the only task able to also acquire callback_mutex and be able to | 258 | * is the only task able to also acquire callback_lock and be able to |
| 259 | * modify cpusets. It can perform various checks on the cpuset structure | 259 | * modify cpusets. It can perform various checks on the cpuset structure |
| 260 | * first, knowing nothing will change. It can also allocate memory while | 260 | * first, knowing nothing will change. It can also allocate memory while |
| 261 | * just holding cpuset_mutex. While it is performing these checks, various | 261 | * just holding cpuset_mutex. While it is performing these checks, various |
| 262 | * callback routines can briefly acquire callback_mutex to query cpusets. | 262 | * callback routines can briefly acquire callback_lock to query cpusets. |
| 263 | * Once it is ready to make the changes, it takes callback_mutex, blocking | 263 | * Once it is ready to make the changes, it takes callback_lock, blocking |
| 264 | * everyone else. | 264 | * everyone else. |
| 265 | * | 265 | * |
| 266 | * Calls to the kernel memory allocator can not be made while holding | 266 | * Calls to the kernel memory allocator can not be made while holding |
| 267 | * callback_mutex, as that would risk double tripping on callback_mutex | 267 | * callback_lock, as that would risk double tripping on callback_lock |
| 268 | * from one of the callbacks into the cpuset code from within | 268 | * from one of the callbacks into the cpuset code from within |
| 269 | * __alloc_pages(). | 269 | * __alloc_pages(). |
| 270 | * | 270 | * |
| 271 | * If a task is only holding callback_mutex, then it has read-only | 271 | * If a task is only holding callback_lock, then it has read-only |
| 272 | * access to cpusets. | 272 | * access to cpusets. |
| 273 | * | 273 | * |
| 274 | * Now, the task_struct fields mems_allowed and mempolicy may be changed | 274 | * Now, the task_struct fields mems_allowed and mempolicy may be changed |
| 275 | * by another task, so we use alloc_lock in the task_struct fields to protect | 275 | * by another task, so we use alloc_lock in the task_struct fields to protect |
| 276 | * them. | 276 | * them. |
| 277 | * | 277 | * |
| 278 | * The cpuset_common_file_read() handlers only hold callback_mutex across | 278 | * The cpuset_common_file_read() handlers only hold callback_lock across |
| 279 | * small pieces of code, such as when reading out possibly multi-word | 279 | * small pieces of code, such as when reading out possibly multi-word |
| 280 | * cpumasks and nodemasks. | 280 | * cpumasks and nodemasks. |
| 281 | * | 281 | * |
| @@ -284,7 +284,7 @@ static struct cpuset top_cpuset = { | |||
| 284 | */ | 284 | */ |
| 285 | 285 | ||
| 286 | static DEFINE_MUTEX(cpuset_mutex); | 286 | static DEFINE_MUTEX(cpuset_mutex); |
| 287 | static DEFINE_MUTEX(callback_mutex); | 287 | static DEFINE_SPINLOCK(callback_lock); |
| 288 | 288 | ||
| 289 | /* | 289 | /* |
| 290 | * CPU / memory hotplug is handled asynchronously. | 290 | * CPU / memory hotplug is handled asynchronously. |
| @@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = { | |||
| 329 | * One way or another, we guarantee to return some non-empty subset | 329 | * One way or another, we guarantee to return some non-empty subset |
| 330 | * of cpu_online_mask. | 330 | * of cpu_online_mask. |
| 331 | * | 331 | * |
| 332 | * Call with callback_mutex held. | 332 | * Call with callback_lock or cpuset_mutex held. |
| 333 | */ | 333 | */ |
| 334 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) | 334 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) |
| 335 | { | 335 | { |
| @@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) | |||
| 347 | * One way or another, we guarantee to return some non-empty subset | 347 | * One way or another, we guarantee to return some non-empty subset |
| 348 | * of node_states[N_MEMORY]. | 348 | * of node_states[N_MEMORY]. |
| 349 | * | 349 | * |
| 350 | * Call with callback_mutex held. | 350 | * Call with callback_lock or cpuset_mutex held. |
| 351 | */ | 351 | */ |
| 352 | static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) | 352 | static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) |
| 353 | { | 353 | { |
| @@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) | |||
| 359 | /* | 359 | /* |
| 360 | * update task's spread flag if cpuset's page/slab spread flag is set | 360 | * update task's spread flag if cpuset's page/slab spread flag is set |
| 361 | * | 361 | * |
| 362 | * Called with callback_mutex/cpuset_mutex held | 362 | * Call with callback_lock or cpuset_mutex held. |
| 363 | */ | 363 | */ |
| 364 | static void cpuset_update_task_spread_flag(struct cpuset *cs, | 364 | static void cpuset_update_task_spread_flag(struct cpuset *cs, |
| 365 | struct task_struct *tsk) | 365 | struct task_struct *tsk) |
| @@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) | |||
| 506 | goto out; | 506 | goto out; |
| 507 | } | 507 | } |
| 508 | 508 | ||
| 509 | /* | ||
| 510 | * We can't shrink if we won't have enough room for SCHED_DEADLINE | ||
| 511 | * tasks. | ||
| 512 | */ | ||
| 513 | ret = -EBUSY; | ||
| 514 | if (is_cpu_exclusive(cur) && | ||
| 515 | !cpuset_cpumask_can_shrink(cur->cpus_allowed, | ||
| 516 | trial->cpus_allowed)) | ||
| 517 | goto out; | ||
| 518 | |||
| 509 | ret = 0; | 519 | ret = 0; |
| 510 | out: | 520 | out: |
| 511 | rcu_read_unlock(); | 521 | rcu_read_unlock(); |
| @@ -876,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) | |||
| 876 | continue; | 886 | continue; |
| 877 | rcu_read_unlock(); | 887 | rcu_read_unlock(); |
| 878 | 888 | ||
| 879 | mutex_lock(&callback_mutex); | 889 | spin_lock_irq(&callback_lock); |
| 880 | cpumask_copy(cp->effective_cpus, new_cpus); | 890 | cpumask_copy(cp->effective_cpus, new_cpus); |
| 881 | mutex_unlock(&callback_mutex); | 891 | spin_unlock_irq(&callback_lock); |
| 882 | 892 | ||
| 883 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && | 893 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && |
| 884 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); | 894 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); |
| @@ -943,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
| 943 | if (retval < 0) | 953 | if (retval < 0) |
| 944 | return retval; | 954 | return retval; |
| 945 | 955 | ||
| 946 | mutex_lock(&callback_mutex); | 956 | spin_lock_irq(&callback_lock); |
| 947 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); | 957 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); |
| 948 | mutex_unlock(&callback_mutex); | 958 | spin_unlock_irq(&callback_lock); |
| 949 | 959 | ||
| 950 | /* use trialcs->cpus_allowed as a temp variable */ | 960 | /* use trialcs->cpus_allowed as a temp variable */ |
| 951 | update_cpumasks_hier(cs, trialcs->cpus_allowed); | 961 | update_cpumasks_hier(cs, trialcs->cpus_allowed); |
| @@ -1132,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) | |||
| 1132 | continue; | 1142 | continue; |
| 1133 | rcu_read_unlock(); | 1143 | rcu_read_unlock(); |
| 1134 | 1144 | ||
| 1135 | mutex_lock(&callback_mutex); | 1145 | spin_lock_irq(&callback_lock); |
| 1136 | cp->effective_mems = *new_mems; | 1146 | cp->effective_mems = *new_mems; |
| 1137 | mutex_unlock(&callback_mutex); | 1147 | spin_unlock_irq(&callback_lock); |
| 1138 | 1148 | ||
| 1139 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && | 1149 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && |
| 1140 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); | 1150 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); |
| @@ -1155,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) | |||
| 1155 | * mempolicies and if the cpuset is marked 'memory_migrate', | 1165 | * mempolicies and if the cpuset is marked 'memory_migrate', |
| 1156 | * migrate the tasks pages to the new memory. | 1166 | * migrate the tasks pages to the new memory. |
| 1157 | * | 1167 | * |
| 1158 | * Call with cpuset_mutex held. May take callback_mutex during call. | 1168 | * Call with cpuset_mutex held. May take callback_lock during call. |
| 1159 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 1169 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
| 1160 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | 1170 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind |
| 1161 | * their mempolicies to the cpusets new mems_allowed. | 1171 | * their mempolicies to the cpusets new mems_allowed. |
| @@ -1202,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
| 1202 | if (retval < 0) | 1212 | if (retval < 0) |
| 1203 | goto done; | 1213 | goto done; |
| 1204 | 1214 | ||
| 1205 | mutex_lock(&callback_mutex); | 1215 | spin_lock_irq(&callback_lock); |
| 1206 | cs->mems_allowed = trialcs->mems_allowed; | 1216 | cs->mems_allowed = trialcs->mems_allowed; |
| 1207 | mutex_unlock(&callback_mutex); | 1217 | spin_unlock_irq(&callback_lock); |
| 1208 | 1218 | ||
| 1209 | /* use trialcs->mems_allowed as a temp variable */ | 1219 | /* use trialcs->mems_allowed as a temp variable */ |
| 1210 | update_nodemasks_hier(cs, &cs->mems_allowed); | 1220 | update_nodemasks_hier(cs, &cs->mems_allowed); |
| @@ -1295,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
| 1295 | spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) | 1305 | spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) |
| 1296 | || (is_spread_page(cs) != is_spread_page(trialcs))); | 1306 | || (is_spread_page(cs) != is_spread_page(trialcs))); |
| 1297 | 1307 | ||
| 1298 | mutex_lock(&callback_mutex); | 1308 | spin_lock_irq(&callback_lock); |
| 1299 | cs->flags = trialcs->flags; | 1309 | cs->flags = trialcs->flags; |
| 1300 | mutex_unlock(&callback_mutex); | 1310 | spin_unlock_irq(&callback_lock); |
| 1301 | 1311 | ||
| 1302 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) | 1312 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) |
| 1303 | rebuild_sched_domains_locked(); | 1313 | rebuild_sched_domains_locked(); |
| @@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, | |||
| 1429 | goto out_unlock; | 1439 | goto out_unlock; |
| 1430 | 1440 | ||
| 1431 | cgroup_taskset_for_each(task, tset) { | 1441 | cgroup_taskset_for_each(task, tset) { |
| 1432 | /* | 1442 | ret = task_can_attach(task, cs->cpus_allowed); |
| 1433 | * Kthreads which disallow setaffinity shouldn't be moved | 1443 | if (ret) |
| 1434 | * to a new cpuset; we don't want to change their cpu | ||
| 1435 | * affinity and isolating such threads by their set of | ||
| 1436 | * allowed nodes is unnecessary. Thus, cpusets are not | ||
| 1437 | * applicable for such threads. This prevents checking for | ||
| 1438 | * success of set_cpus_allowed_ptr() on all attached tasks | ||
| 1439 | * before cpus_allowed may be changed. | ||
| 1440 | */ | ||
| 1441 | ret = -EINVAL; | ||
| 1442 | if (task->flags & PF_NO_SETAFFINITY) | ||
| 1443 | goto out_unlock; | 1444 | goto out_unlock; |
| 1444 | ret = security_task_setscheduler(task); | 1445 | ret = security_task_setscheduler(task); |
| 1445 | if (ret) | 1446 | if (ret) |
| @@ -1713,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) | |||
| 1713 | count = seq_get_buf(sf, &buf); | 1714 | count = seq_get_buf(sf, &buf); |
| 1714 | s = buf; | 1715 | s = buf; |
| 1715 | 1716 | ||
| 1716 | mutex_lock(&callback_mutex); | 1717 | spin_lock_irq(&callback_lock); |
| 1717 | 1718 | ||
| 1718 | switch (type) { | 1719 | switch (type) { |
| 1719 | case FILE_CPULIST: | 1720 | case FILE_CPULIST: |
| @@ -1740,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) | |||
| 1740 | seq_commit(sf, -1); | 1741 | seq_commit(sf, -1); |
| 1741 | } | 1742 | } |
| 1742 | out_unlock: | 1743 | out_unlock: |
| 1743 | mutex_unlock(&callback_mutex); | 1744 | spin_unlock_irq(&callback_lock); |
| 1744 | return ret; | 1745 | return ret; |
| 1745 | } | 1746 | } |
| 1746 | 1747 | ||
| @@ -1957,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) | |||
| 1957 | 1958 | ||
| 1958 | cpuset_inc(); | 1959 | cpuset_inc(); |
| 1959 | 1960 | ||
| 1960 | mutex_lock(&callback_mutex); | 1961 | spin_lock_irq(&callback_lock); |
| 1961 | if (cgroup_on_dfl(cs->css.cgroup)) { | 1962 | if (cgroup_on_dfl(cs->css.cgroup)) { |
| 1962 | cpumask_copy(cs->effective_cpus, parent->effective_cpus); | 1963 | cpumask_copy(cs->effective_cpus, parent->effective_cpus); |
| 1963 | cs->effective_mems = parent->effective_mems; | 1964 | cs->effective_mems = parent->effective_mems; |
| 1964 | } | 1965 | } |
| 1965 | mutex_unlock(&callback_mutex); | 1966 | spin_unlock_irq(&callback_lock); |
| 1966 | 1967 | ||
| 1967 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) | 1968 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
| 1968 | goto out_unlock; | 1969 | goto out_unlock; |
| @@ -1989,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) | |||
| 1989 | } | 1990 | } |
| 1990 | rcu_read_unlock(); | 1991 | rcu_read_unlock(); |
| 1991 | 1992 | ||
| 1992 | mutex_lock(&callback_mutex); | 1993 | spin_lock_irq(&callback_lock); |
| 1993 | cs->mems_allowed = parent->mems_allowed; | 1994 | cs->mems_allowed = parent->mems_allowed; |
| 1994 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); | 1995 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); |
| 1995 | mutex_unlock(&callback_mutex); | 1996 | spin_unlock_irq(&callback_lock); |
| 1996 | out_unlock: | 1997 | out_unlock: |
| 1997 | mutex_unlock(&cpuset_mutex); | 1998 | mutex_unlock(&cpuset_mutex); |
| 1998 | return 0; | 1999 | return 0; |
| @@ -2031,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) | |||
| 2031 | static void cpuset_bind(struct cgroup_subsys_state *root_css) | 2032 | static void cpuset_bind(struct cgroup_subsys_state *root_css) |
| 2032 | { | 2033 | { |
| 2033 | mutex_lock(&cpuset_mutex); | 2034 | mutex_lock(&cpuset_mutex); |
| 2034 | mutex_lock(&callback_mutex); | 2035 | spin_lock_irq(&callback_lock); |
| 2035 | 2036 | ||
| 2036 | if (cgroup_on_dfl(root_css->cgroup)) { | 2037 | if (cgroup_on_dfl(root_css->cgroup)) { |
| 2037 | cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); | 2038 | cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); |
| @@ -2042,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) | |||
| 2042 | top_cpuset.mems_allowed = top_cpuset.effective_mems; | 2043 | top_cpuset.mems_allowed = top_cpuset.effective_mems; |
| 2043 | } | 2044 | } |
| 2044 | 2045 | ||
| 2045 | mutex_unlock(&callback_mutex); | 2046 | spin_unlock_irq(&callback_lock); |
| 2046 | mutex_unlock(&cpuset_mutex); | 2047 | mutex_unlock(&cpuset_mutex); |
| 2047 | } | 2048 | } |
| 2048 | 2049 | ||
| @@ -2127,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs, | |||
| 2127 | { | 2128 | { |
| 2128 | bool is_empty; | 2129 | bool is_empty; |
| 2129 | 2130 | ||
| 2130 | mutex_lock(&callback_mutex); | 2131 | spin_lock_irq(&callback_lock); |
| 2131 | cpumask_copy(cs->cpus_allowed, new_cpus); | 2132 | cpumask_copy(cs->cpus_allowed, new_cpus); |
| 2132 | cpumask_copy(cs->effective_cpus, new_cpus); | 2133 | cpumask_copy(cs->effective_cpus, new_cpus); |
| 2133 | cs->mems_allowed = *new_mems; | 2134 | cs->mems_allowed = *new_mems; |
| 2134 | cs->effective_mems = *new_mems; | 2135 | cs->effective_mems = *new_mems; |
| 2135 | mutex_unlock(&callback_mutex); | 2136 | spin_unlock_irq(&callback_lock); |
| 2136 | 2137 | ||
| 2137 | /* | 2138 | /* |
| 2138 | * Don't call update_tasks_cpumask() if the cpuset becomes empty, | 2139 | * Don't call update_tasks_cpumask() if the cpuset becomes empty, |
| @@ -2169,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs, | |||
| 2169 | if (nodes_empty(*new_mems)) | 2170 | if (nodes_empty(*new_mems)) |
| 2170 | *new_mems = parent_cs(cs)->effective_mems; | 2171 | *new_mems = parent_cs(cs)->effective_mems; |
| 2171 | 2172 | ||
| 2172 | mutex_lock(&callback_mutex); | 2173 | spin_lock_irq(&callback_lock); |
| 2173 | cpumask_copy(cs->effective_cpus, new_cpus); | 2174 | cpumask_copy(cs->effective_cpus, new_cpus); |
| 2174 | cs->effective_mems = *new_mems; | 2175 | cs->effective_mems = *new_mems; |
| 2175 | mutex_unlock(&callback_mutex); | 2176 | spin_unlock_irq(&callback_lock); |
| 2176 | 2177 | ||
| 2177 | if (cpus_updated) | 2178 | if (cpus_updated) |
| 2178 | update_tasks_cpumask(cs); | 2179 | update_tasks_cpumask(cs); |
| @@ -2258,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
| 2258 | 2259 | ||
| 2259 | /* synchronize cpus_allowed to cpu_active_mask */ | 2260 | /* synchronize cpus_allowed to cpu_active_mask */ |
| 2260 | if (cpus_updated) { | 2261 | if (cpus_updated) { |
| 2261 | mutex_lock(&callback_mutex); | 2262 | spin_lock_irq(&callback_lock); |
| 2262 | if (!on_dfl) | 2263 | if (!on_dfl) |
| 2263 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); | 2264 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); |
| 2264 | cpumask_copy(top_cpuset.effective_cpus, &new_cpus); | 2265 | cpumask_copy(top_cpuset.effective_cpus, &new_cpus); |
| 2265 | mutex_unlock(&callback_mutex); | 2266 | spin_unlock_irq(&callback_lock); |
| 2266 | /* we don't mess with cpumasks of tasks in top_cpuset */ | 2267 | /* we don't mess with cpumasks of tasks in top_cpuset */ |
| 2267 | } | 2268 | } |
| 2268 | 2269 | ||
| 2269 | /* synchronize mems_allowed to N_MEMORY */ | 2270 | /* synchronize mems_allowed to N_MEMORY */ |
| 2270 | if (mems_updated) { | 2271 | if (mems_updated) { |
| 2271 | mutex_lock(&callback_mutex); | 2272 | spin_lock_irq(&callback_lock); |
| 2272 | if (!on_dfl) | 2273 | if (!on_dfl) |
| 2273 | top_cpuset.mems_allowed = new_mems; | 2274 | top_cpuset.mems_allowed = new_mems; |
| 2274 | top_cpuset.effective_mems = new_mems; | 2275 | top_cpuset.effective_mems = new_mems; |
| 2275 | mutex_unlock(&callback_mutex); | 2276 | spin_unlock_irq(&callback_lock); |
| 2276 | update_tasks_nodemask(&top_cpuset); | 2277 | update_tasks_nodemask(&top_cpuset); |
| 2277 | } | 2278 | } |
| 2278 | 2279 | ||
| @@ -2365,11 +2366,13 @@ void __init cpuset_init_smp(void) | |||
| 2365 | 2366 | ||
| 2366 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | 2367 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) |
| 2367 | { | 2368 | { |
| 2368 | mutex_lock(&callback_mutex); | 2369 | unsigned long flags; |
| 2370 | |||
| 2371 | spin_lock_irqsave(&callback_lock, flags); | ||
| 2369 | rcu_read_lock(); | 2372 | rcu_read_lock(); |
| 2370 | guarantee_online_cpus(task_cs(tsk), pmask); | 2373 | guarantee_online_cpus(task_cs(tsk), pmask); |
| 2371 | rcu_read_unlock(); | 2374 | rcu_read_unlock(); |
| 2372 | mutex_unlock(&callback_mutex); | 2375 | spin_unlock_irqrestore(&callback_lock, flags); |
| 2373 | } | 2376 | } |
| 2374 | 2377 | ||
| 2375 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 2378 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
| @@ -2415,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void) | |||
| 2415 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) | 2418 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) |
| 2416 | { | 2419 | { |
| 2417 | nodemask_t mask; | 2420 | nodemask_t mask; |
| 2421 | unsigned long flags; | ||
| 2418 | 2422 | ||
| 2419 | mutex_lock(&callback_mutex); | 2423 | spin_lock_irqsave(&callback_lock, flags); |
| 2420 | rcu_read_lock(); | 2424 | rcu_read_lock(); |
| 2421 | guarantee_online_mems(task_cs(tsk), &mask); | 2425 | guarantee_online_mems(task_cs(tsk), &mask); |
| 2422 | rcu_read_unlock(); | 2426 | rcu_read_unlock(); |
| 2423 | mutex_unlock(&callback_mutex); | 2427 | spin_unlock_irqrestore(&callback_lock, flags); |
| 2424 | 2428 | ||
| 2425 | return mask; | 2429 | return mask; |
| 2426 | } | 2430 | } |
| @@ -2439,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) | |||
| 2439 | /* | 2443 | /* |
| 2440 | * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or | 2444 | * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or |
| 2441 | * mem_hardwall ancestor to the specified cpuset. Call holding | 2445 | * mem_hardwall ancestor to the specified cpuset. Call holding |
| 2442 | * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall | 2446 | * callback_lock. If no ancestor is mem_exclusive or mem_hardwall |
| 2443 | * (an unusual configuration), then returns the root cpuset. | 2447 | * (an unusual configuration), then returns the root cpuset. |
| 2444 | */ | 2448 | */ |
| 2445 | static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | 2449 | static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) |
| @@ -2450,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
| 2450 | } | 2454 | } |
| 2451 | 2455 | ||
| 2452 | /** | 2456 | /** |
| 2453 | * cpuset_node_allowed_softwall - Can we allocate on a memory node? | 2457 | * cpuset_node_allowed - Can we allocate on a memory node? |
| 2454 | * @node: is this an allowed node? | 2458 | * @node: is this an allowed node? |
| 2455 | * @gfp_mask: memory allocation flags | 2459 | * @gfp_mask: memory allocation flags |
| 2456 | * | 2460 | * |
| @@ -2462,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
| 2462 | * flag, yes. | 2466 | * flag, yes. |
| 2463 | * Otherwise, no. | 2467 | * Otherwise, no. |
| 2464 | * | 2468 | * |
| 2465 | * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to | ||
| 2466 | * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() | ||
| 2467 | * might sleep, and might allow a node from an enclosing cpuset. | ||
| 2468 | * | ||
| 2469 | * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall | ||
| 2470 | * cpusets, and never sleeps. | ||
| 2471 | * | ||
| 2472 | * The __GFP_THISNODE placement logic is really handled elsewhere, | 2469 | * The __GFP_THISNODE placement logic is really handled elsewhere, |
| 2473 | * by forcibly using a zonelist starting at a specified node, and by | 2470 | * by forcibly using a zonelist starting at a specified node, and by |
| 2474 | * (in get_page_from_freelist()) refusing to consider the zones for | 2471 | * (in get_page_from_freelist()) refusing to consider the zones for |
| @@ -2481,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
| 2481 | * GFP_KERNEL allocations are not so marked, so can escape to the | 2478 | * GFP_KERNEL allocations are not so marked, so can escape to the |
| 2482 | * nearest enclosing hardwalled ancestor cpuset. | 2479 | * nearest enclosing hardwalled ancestor cpuset. |
| 2483 | * | 2480 | * |
| 2484 | * Scanning up parent cpusets requires callback_mutex. The | 2481 | * Scanning up parent cpusets requires callback_lock. The |
| 2485 | * __alloc_pages() routine only calls here with __GFP_HARDWALL bit | 2482 | * __alloc_pages() routine only calls here with __GFP_HARDWALL bit |
| 2486 | * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the | 2483 | * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the |
| 2487 | * current tasks mems_allowed came up empty on the first pass over | 2484 | * current tasks mems_allowed came up empty on the first pass over |
| 2488 | * the zonelist. So only GFP_KERNEL allocations, if all nodes in the | 2485 | * the zonelist. So only GFP_KERNEL allocations, if all nodes in the |
| 2489 | * cpuset are short of memory, might require taking the callback_mutex | 2486 | * cpuset are short of memory, might require taking the callback_lock. |
| 2490 | * mutex. | ||
| 2491 | * | 2487 | * |
| 2492 | * The first call here from mm/page_alloc:get_page_from_freelist() | 2488 | * The first call here from mm/page_alloc:get_page_from_freelist() |
| 2493 | * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, | 2489 | * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, |
| @@ -2504,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
| 2504 | * TIF_MEMDIE - any node ok | 2500 | * TIF_MEMDIE - any node ok |
| 2505 | * GFP_KERNEL - any node in enclosing hardwalled cpuset ok | 2501 | * GFP_KERNEL - any node in enclosing hardwalled cpuset ok |
| 2506 | * GFP_USER - only nodes in current tasks mems allowed ok. | 2502 | * GFP_USER - only nodes in current tasks mems allowed ok. |
| 2507 | * | ||
| 2508 | * Rule: | ||
| 2509 | * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you | ||
| 2510 | * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables | ||
| 2511 | * the code that might scan up ancestor cpusets and sleep. | ||
| 2512 | */ | 2503 | */ |
| 2513 | int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) | 2504 | int __cpuset_node_allowed(int node, gfp_t gfp_mask) |
| 2514 | { | 2505 | { |
| 2515 | struct cpuset *cs; /* current cpuset ancestors */ | 2506 | struct cpuset *cs; /* current cpuset ancestors */ |
| 2516 | int allowed; /* is allocation in zone z allowed? */ | 2507 | int allowed; /* is allocation in zone z allowed? */ |
| 2508 | unsigned long flags; | ||
| 2517 | 2509 | ||
| 2518 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2510 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) |
| 2519 | return 1; | 2511 | return 1; |
| 2520 | might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); | ||
| 2521 | if (node_isset(node, current->mems_allowed)) | 2512 | if (node_isset(node, current->mems_allowed)) |
| 2522 | return 1; | 2513 | return 1; |
| 2523 | /* | 2514 | /* |
| @@ -2533,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) | |||
| 2533 | return 1; | 2524 | return 1; |
| 2534 | 2525 | ||
| 2535 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 2526 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
| 2536 | mutex_lock(&callback_mutex); | 2527 | spin_lock_irqsave(&callback_lock, flags); |
| 2537 | 2528 | ||
| 2538 | rcu_read_lock(); | 2529 | rcu_read_lock(); |
| 2539 | cs = nearest_hardwall_ancestor(task_cs(current)); | 2530 | cs = nearest_hardwall_ancestor(task_cs(current)); |
| 2540 | allowed = node_isset(node, cs->mems_allowed); | 2531 | allowed = node_isset(node, cs->mems_allowed); |
| 2541 | rcu_read_unlock(); | 2532 | rcu_read_unlock(); |
| 2542 | 2533 | ||
| 2543 | mutex_unlock(&callback_mutex); | 2534 | spin_unlock_irqrestore(&callback_lock, flags); |
| 2544 | return allowed; | 2535 | return allowed; |
| 2545 | } | 2536 | } |
| 2546 | 2537 | ||
| 2547 | /* | ||
| 2548 | * cpuset_node_allowed_hardwall - Can we allocate on a memory node? | ||
| 2549 | * @node: is this an allowed node? | ||
| 2550 | * @gfp_mask: memory allocation flags | ||
| 2551 | * | ||
| 2552 | * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is | ||
| 2553 | * set, yes, we can always allocate. If node is in our task's mems_allowed, | ||
| 2554 | * yes. If the task has been OOM killed and has access to memory reserves as | ||
| 2555 | * specified by the TIF_MEMDIE flag, yes. | ||
| 2556 | * Otherwise, no. | ||
| 2557 | * | ||
| 2558 | * The __GFP_THISNODE placement logic is really handled elsewhere, | ||
| 2559 | * by forcibly using a zonelist starting at a specified node, and by | ||
| 2560 | * (in get_page_from_freelist()) refusing to consider the zones for | ||
| 2561 | * any node on the zonelist except the first. By the time any such | ||
| 2562 | * calls get to this routine, we should just shut up and say 'yes'. | ||
| 2563 | * | ||
| 2564 | * Unlike the cpuset_node_allowed_softwall() variant, above, | ||
| 2565 | * this variant requires that the node be in the current task's | ||
| 2566 | * mems_allowed or that we're in interrupt. It does not scan up the | ||
| 2567 | * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. | ||
| 2568 | * It never sleeps. | ||
| 2569 | */ | ||
| 2570 | int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) | ||
| 2571 | { | ||
| 2572 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | ||
| 2573 | return 1; | ||
| 2574 | if (node_isset(node, current->mems_allowed)) | ||
| 2575 | return 1; | ||
| 2576 | /* | ||
| 2577 | * Allow tasks that have access to memory reserves because they have | ||
| 2578 | * been OOM killed to get memory anywhere. | ||
| 2579 | */ | ||
| 2580 | if (unlikely(test_thread_flag(TIF_MEMDIE))) | ||
| 2581 | return 1; | ||
| 2582 | return 0; | ||
| 2583 | } | ||
| 2584 | |||
| 2585 | /** | 2538 | /** |
| 2586 | * cpuset_mem_spread_node() - On which node to begin search for a file page | 2539 | * cpuset_mem_spread_node() - On which node to begin search for a file page |
| 2587 | * cpuset_slab_spread_node() - On which node to begin search for a slab page | 2540 | * cpuset_slab_spread_node() - On which node to begin search for a slab page |
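With the hardwall/softwall split gone, a single helper serves all callers, including atomic ones, now that callback_lock is an IRQ-safe spinlock. A minimal caller sketch (the wrapper name and the zone-based lookup are illustrative assumptions, not part of this patch):

	#include <linux/cpuset.h>
	#include <linux/mmzone.h>

	/* Hedged sketch: may the current task allocate on the node
	 * backing @zone?  Safe even where sleeping is not allowed.
	 */
	static bool example_zone_allowed(struct zone *zone, gfp_t gfp_mask)
	{
		return __cpuset_node_allowed(zone_to_nid(zone), gfp_mask);
	}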
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1adf62b39b96..07ce18ca71e0 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
| @@ -27,6 +27,9 @@ | |||
| 27 | * version 2. This program is licensed "as is" without any warranty of any | 27 | * version 2. This program is licensed "as is" without any warranty of any |
| 28 | * kind, whether express or implied. | 28 | * kind, whether express or implied. |
| 29 | */ | 29 | */ |
| 30 | |||
| 31 | #define pr_fmt(fmt) "KGDB: " fmt | ||
| 32 | |||
| 30 | #include <linux/pid_namespace.h> | 33 | #include <linux/pid_namespace.h> |
| 31 | #include <linux/clocksource.h> | 34 | #include <linux/clocksource.h> |
| 32 | #include <linux/serial_core.h> | 35 | #include <linux/serial_core.h> |
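With pr_fmt() defined before the printk helpers are pulled in, every pr_*() call in this file picks up the "KGDB: " prefix automatically, which is why the literal prefixes are dropped from the messages below. A minimal sketch of the effect (function name illustrative):

	/* pr_fmt() must be defined before <linux/printk.h> is included. */
	#define pr_fmt(fmt) "KGDB: " fmt
	#include <linux/printk.h>

	static void example_report(unsigned long addr)
	{
		/* logged as: "KGDB: BP install failed: <addr>" */
		pr_err("BP install failed: %lx\n", addr);
	}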
| @@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr) | |||
| 196 | return err; | 199 | return err; |
| 197 | err = kgdb_arch_remove_breakpoint(&tmp); | 200 | err = kgdb_arch_remove_breakpoint(&tmp); |
| 198 | if (err) | 201 | if (err) |
| 199 | printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " | 202 | pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n", |
| 200 | "memory destroyed at: %lx", addr); | 203 | addr); |
| 201 | return err; | 204 | return err; |
| 202 | } | 205 | } |
| 203 | 206 | ||
| @@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void) | |||
| 256 | error = kgdb_arch_set_breakpoint(&kgdb_break[i]); | 259 | error = kgdb_arch_set_breakpoint(&kgdb_break[i]); |
| 257 | if (error) { | 260 | if (error) { |
| 258 | ret = error; | 261 | ret = error; |
| 259 | printk(KERN_INFO "KGDB: BP install failed: %lx", | 262 | pr_info("BP install failed: %lx\n", |
| 260 | kgdb_break[i].bpt_addr); | 263 | kgdb_break[i].bpt_addr); |
| 261 | continue; | 264 | continue; |
| 262 | } | 265 | } |
| 263 | 266 | ||
| @@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void) | |||
| 319 | continue; | 322 | continue; |
| 320 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); | 323 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); |
| 321 | if (error) { | 324 | if (error) { |
| 322 | printk(KERN_INFO "KGDB: BP remove failed: %lx\n", | 325 | pr_info("BP remove failed: %lx\n", |
| 323 | kgdb_break[i].bpt_addr); | 326 | kgdb_break[i].bpt_addr); |
| 324 | ret = error; | 327 | ret = error; |
| 325 | } | 328 | } |
| 326 | 329 | ||
| @@ -367,7 +370,7 @@ int dbg_remove_all_break(void) | |||
| 367 | goto setundefined; | 370 | goto setundefined; |
| 368 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); | 371 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); |
| 369 | if (error) | 372 | if (error) |
| 370 | printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", | 373 | pr_err("breakpoint remove failed: %lx\n", |
| 371 | kgdb_break[i].bpt_addr); | 374 | kgdb_break[i].bpt_addr); |
| 372 | setundefined: | 375 | setundefined: |
| 373 | kgdb_break[i].state = BP_UNDEFINED; | 376 | kgdb_break[i].state = BP_UNDEFINED; |
| @@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait) | |||
| 400 | if (print_wait) { | 403 | if (print_wait) { |
| 401 | #ifdef CONFIG_KGDB_KDB | 404 | #ifdef CONFIG_KGDB_KDB |
| 402 | if (!dbg_kdb_mode) | 405 | if (!dbg_kdb_mode) |
| 403 | printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); | 406 | pr_crit("waiting... or $3#33 for KDB\n"); |
| 404 | #else | 407 | #else |
| 405 | printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); | 408 | pr_crit("Waiting for remote debugger\n"); |
| 406 | #endif | 409 | #endif |
| 407 | } | 410 | } |
| 408 | return 1; | 411 | return 1; |
| @@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) | |||
| 430 | exception_level = 0; | 433 | exception_level = 0; |
| 431 | kgdb_skipexception(ks->ex_vector, ks->linux_regs); | 434 | kgdb_skipexception(ks->ex_vector, ks->linux_regs); |
| 432 | dbg_activate_sw_breakpoints(); | 435 | dbg_activate_sw_breakpoints(); |
| 433 | printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", | 436 | pr_crit("re-enter error: breakpoint removed %lx\n", addr); |
| 434 | addr); | ||
| 435 | WARN_ON_ONCE(1); | 437 | WARN_ON_ONCE(1); |
| 436 | 438 | ||
| 437 | return 1; | 439 | return 1; |
| @@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) | |||
| 444 | panic("Recursive entry to debugger"); | 446 | panic("Recursive entry to debugger"); |
| 445 | } | 447 | } |
| 446 | 448 | ||
| 447 | printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); | 449 | pr_crit("re-enter exception: ALL breakpoints killed\n"); |
| 448 | #ifdef CONFIG_KGDB_KDB | 450 | #ifdef CONFIG_KGDB_KDB |
| 449 | /* Allow kdb to debug itself one level */ | 451 | /* Allow kdb to debug itself one level */ |
| 450 | return 0; | 452 | return 0; |
| @@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, | |||
| 471 | int cpu; | 473 | int cpu; |
| 472 | int trace_on = 0; | 474 | int trace_on = 0; |
| 473 | int online_cpus = num_online_cpus(); | 475 | int online_cpus = num_online_cpus(); |
| 476 | u64 time_left; | ||
| 474 | 477 | ||
| 475 | kgdb_info[ks->cpu].enter_kgdb++; | 478 | kgdb_info[ks->cpu].enter_kgdb++; |
| 476 | kgdb_info[ks->cpu].exception_state |= exception_state; | 479 | kgdb_info[ks->cpu].exception_state |= exception_state; |
| @@ -595,9 +598,13 @@ return_normal: | |||
| 595 | /* | 598 | /* |
| 596 | * Wait for the other CPUs to be notified and be waiting for us: | 599 | * Wait for the other CPUs to be notified and be waiting for us: |
| 597 | */ | 600 | */ |
| 598 | while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + | 601 | time_left = loops_per_jiffy * HZ; |
| 599 | atomic_read(&slaves_in_kgdb)) != online_cpus) | 602 | while (kgdb_do_roundup && --time_left && |
| 603 | (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != | ||
| 604 | online_cpus) | ||
| 600 | cpu_relax(); | 605 | cpu_relax(); |
| 606 | if (!time_left) | ||
| 607 | pr_crit("KGDB: Timed out waiting for secondary CPUs.\n"); | ||
| 601 | 608 | ||
| 602 | /* | 609 | /* |
| 603 | * At this point the primary processor is completely | 610 | * At this point the primary processor is completely |
| @@ -795,15 +802,15 @@ static struct console kgdbcons = { | |||
| 795 | static void sysrq_handle_dbg(int key) | 802 | static void sysrq_handle_dbg(int key) |
| 796 | { | 803 | { |
| 797 | if (!dbg_io_ops) { | 804 | if (!dbg_io_ops) { |
| 798 | printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); | 805 | pr_crit("ERROR: No KGDB I/O module available\n"); |
| 799 | return; | 806 | return; |
| 800 | } | 807 | } |
| 801 | if (!kgdb_connected) { | 808 | if (!kgdb_connected) { |
| 802 | #ifdef CONFIG_KGDB_KDB | 809 | #ifdef CONFIG_KGDB_KDB |
| 803 | if (!dbg_kdb_mode) | 810 | if (!dbg_kdb_mode) |
| 804 | printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); | 811 | pr_crit("KGDB or $3#33 for KDB\n"); |
| 805 | #else | 812 | #else |
| 806 | printk(KERN_CRIT "Entering KGDB\n"); | 813 | pr_crit("Entering KGDB\n"); |
| 807 | #endif | 814 | #endif |
| 808 | } | 815 | } |
| 809 | 816 | ||
| @@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void) | |||
| 945 | { | 952 | { |
| 946 | kgdb_break_asap = 0; | 953 | kgdb_break_asap = 0; |
| 947 | 954 | ||
| 948 | printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); | 955 | pr_crit("Waiting for connection from remote gdb...\n"); |
| 949 | kgdb_breakpoint(); | 956 | kgdb_breakpoint(); |
| 950 | } | 957 | } |
| 951 | 958 | ||
| @@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) | |||
| 964 | if (dbg_io_ops) { | 971 | if (dbg_io_ops) { |
| 965 | spin_unlock(&kgdb_registration_lock); | 972 | spin_unlock(&kgdb_registration_lock); |
| 966 | 973 | ||
| 967 | printk(KERN_ERR "kgdb: Another I/O driver is already " | 974 | pr_err("Another I/O driver is already registered with KGDB\n"); |
| 968 | "registered with KGDB.\n"); | ||
| 969 | return -EBUSY; | 975 | return -EBUSY; |
| 970 | } | 976 | } |
| 971 | 977 | ||
| @@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) | |||
| 981 | 987 | ||
| 982 | spin_unlock(&kgdb_registration_lock); | 988 | spin_unlock(&kgdb_registration_lock); |
| 983 | 989 | ||
| 984 | printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", | 990 | pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); |
| 985 | new_dbg_io_ops->name); | ||
| 986 | 991 | ||
| 987 | /* Arm KGDB now. */ | 992 | /* Arm KGDB now. */ |
| 988 | kgdb_register_callbacks(); | 993 | kgdb_register_callbacks(); |
| @@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) | |||
| 1017 | 1022 | ||
| 1018 | spin_unlock(&kgdb_registration_lock); | 1023 | spin_unlock(&kgdb_registration_lock); |
| 1019 | 1024 | ||
| 1020 | printk(KERN_INFO | 1025 | pr_info("Unregistered I/O driver %s, debugger disabled\n", |
| 1021 | "kgdb: Unregistered I/O driver %s, debugger disabled.\n", | ||
| 1022 | old_dbg_io_ops->name); | 1026 | old_dbg_io_ops->name); |
| 1023 | } | 1027 | } |
| 1024 | EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); | 1028 | EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); |
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index b20d544f20c2..e1dbf4a2c69e 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c | |||
| @@ -531,22 +531,29 @@ void __init kdb_initbptab(void) | |||
| 531 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) | 531 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) |
| 532 | bp->bp_free = 1; | 532 | bp->bp_free = 1; |
| 533 | 533 | ||
| 534 | kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", | 534 | kdb_register_flags("bp", kdb_bp, "[<vaddr>]", |
| 535 | "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); | 535 | "Set/Display breakpoints", 0, |
| 536 | kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", | 536 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); |
| 537 | "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); | 537 | kdb_register_flags("bl", kdb_bp, "[<vaddr>]", |
| 538 | "Display breakpoints", 0, | ||
| 539 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); | ||
| 538 | if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) | 540 | if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) |
| 539 | kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", | 541 | kdb_register_flags("bph", kdb_bp, "[<vaddr>]", |
| 540 | "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); | 542 | "[datar [length]|dataw [length]] Set hw brk", 0, |
| 541 | kdb_register_repeat("bc", kdb_bc, "<bpnum>", | 543 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); |
| 542 | "Clear Breakpoint", 0, KDB_REPEAT_NONE); | 544 | kdb_register_flags("bc", kdb_bc, "<bpnum>", |
| 543 | kdb_register_repeat("be", kdb_bc, "<bpnum>", | 545 | "Clear Breakpoint", 0, |
| 544 | "Enable Breakpoint", 0, KDB_REPEAT_NONE); | 546 | KDB_ENABLE_FLOW_CTRL); |
| 545 | kdb_register_repeat("bd", kdb_bc, "<bpnum>", | 547 | kdb_register_flags("be", kdb_bc, "<bpnum>", |
| 546 | "Disable Breakpoint", 0, KDB_REPEAT_NONE); | 548 | "Enable Breakpoint", 0, |
| 547 | 549 | KDB_ENABLE_FLOW_CTRL); | |
| 548 | kdb_register_repeat("ss", kdb_ss, "", | 550 | kdb_register_flags("bd", kdb_bc, "<bpnum>", |
| 549 | "Single Step", 1, KDB_REPEAT_NO_ARGS); | 551 | "Disable Breakpoint", 0, |
| 552 | KDB_ENABLE_FLOW_CTRL); | ||
| 553 | |||
| 554 | kdb_register_flags("ss", kdb_ss, "", | ||
| 555 | "Single Step", 1, | ||
| 556 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); | ||
| 550 | /* | 557 | /* |
| 551 | * Architecture dependent initialization. | 558 | * Architecture dependent initialization. |
| 552 | */ | 559 | */ |
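
For reference, a minimal sketch of how a module might use the kdb_register_flags() signature introduced above, assuming the declarations from <linux/kdb.h>; the "hello" command, its handler and the choice of KDB_ENABLE_ALWAYS_SAFE are invented for illustration and are not part of this patch.

#include <linux/module.h>
#include <linux/kdb.h>
#include <linux/errno.h>

/* Trivial command handler: kdb hands us argc/argv like a tiny shell. */
static int kdb_hello(int argc, const char **argv)
{
	kdb_printf("hello from kdb (%d argument%s)\n",
		   argc, argc == 1 ? "" : "s");
	return 0;
}

static int __init hello_kdb_init(void)
{
	/* A command that only prints can be marked always-safe; anything
	 * touching memory, registers or flow control should pick the
	 * matching KDB_ENABLE_* category instead. */
	if (kdb_register_flags("hello", kdb_hello, "",
			       "Print a greeting", 0,
			       KDB_ENABLE_ALWAYS_SAFE))
		return -EBUSY;
	return 0;
}

static void __exit hello_kdb_exit(void)
{
	kdb_unregister("hello");
}

module_init(hello_kdb_init);
module_exit(hello_kdb_exit);
MODULE_LICENSE("GPL");
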
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8859ca34dcfe..15e1a7af5dd0 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c | |||
| @@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks) | |||
| 129 | ks->pass_exception = 1; | 129 | ks->pass_exception = 1; |
| 130 | KDB_FLAG_SET(CATASTROPHIC); | 130 | KDB_FLAG_SET(CATASTROPHIC); |
| 131 | } | 131 | } |
| 132 | /* set CATASTROPHIC if the system contains unresponsive processors */ | ||
| 133 | for_each_online_cpu(i) | ||
| 134 | if (!kgdb_info[i].enter_kgdb) | ||
| 135 | KDB_FLAG_SET(CATASTROPHIC); | ||
| 132 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { | 136 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { |
| 133 | KDB_STATE_CLEAR(SSBPT); | 137 | KDB_STATE_CLEAR(SSBPT); |
| 134 | KDB_STATE_CLEAR(DOING_SS); | 138 | KDB_STATE_CLEAR(DOING_SS); |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 379650b984f8..7b40c5f07dce 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| 14 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
| 15 | #include <linux/types.h> | ||
| 15 | #include <linux/string.h> | 16 | #include <linux/string.h> |
| 16 | #include <linux/kernel.h> | 17 | #include <linux/kernel.h> |
| 17 | #include <linux/kmsg_dump.h> | 18 | #include <linux/kmsg_dump.h> |
| @@ -23,6 +24,7 @@ | |||
| 23 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
| 24 | #include <linux/atomic.h> | 25 | #include <linux/atomic.h> |
| 25 | #include <linux/module.h> | 26 | #include <linux/module.h> |
| 27 | #include <linux/moduleparam.h> | ||
| 26 | #include <linux/mm.h> | 28 | #include <linux/mm.h> |
| 27 | #include <linux/init.h> | 29 | #include <linux/init.h> |
| 28 | #include <linux/kallsyms.h> | 30 | #include <linux/kallsyms.h> |
| @@ -42,6 +44,12 @@ | |||
| 42 | #include <linux/slab.h> | 44 | #include <linux/slab.h> |
| 43 | #include "kdb_private.h" | 45 | #include "kdb_private.h" |
| 44 | 46 | ||
| 47 | #undef MODULE_PARAM_PREFIX | ||
| 48 | #define MODULE_PARAM_PREFIX "kdb." | ||
| 49 | |||
| 50 | static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE; | ||
| 51 | module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600); | ||
| 52 | |||
| 45 | #define GREP_LEN 256 | 53 | #define GREP_LEN 256 |
| 46 | char kdb_grep_string[GREP_LEN]; | 54 | char kdb_grep_string[GREP_LEN]; |
| 47 | int kdb_grepping_flag; | 55 | int kdb_grepping_flag; |
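
With MODULE_PARAM_PREFIX redefined to "kdb.", the new knob should be reachable as kdb.cmd_enable=<mask> on the kernel command line and, given the 0600 mode, as a root-writable /sys/module/kdb/parameters/cmd_enable at run time; its default comes from the CONFIG_KDB_DEFAULT_ENABLE Kconfig symbol referenced above.
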
| @@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = { | |||
| 121 | KDBMSG(BADLENGTH, "Invalid length field"), | 129 | KDBMSG(BADLENGTH, "Invalid length field"), |
| 122 | KDBMSG(NOBP, "No Breakpoint exists"), | 130 | KDBMSG(NOBP, "No Breakpoint exists"), |
| 123 | KDBMSG(BADADDR, "Invalid address"), | 131 | KDBMSG(BADADDR, "Invalid address"), |
| 132 | KDBMSG(NOPERM, "Permission denied"), | ||
| 124 | }; | 133 | }; |
| 125 | #undef KDBMSG | 134 | #undef KDBMSG |
| 126 | 135 | ||
| @@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu) | |||
| 188 | } | 197 | } |
| 189 | 198 | ||
| 190 | /* | 199 | /* |
| 200 | * Check whether the flags of the current command and the permissions | ||
| 201 | * of the kdb console allow a command to be run. | ||
| 202 | */ | ||
| 203 | static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, | ||
| 204 | bool no_args) | ||
| 205 | { | ||
| 206 | /* permissions comes from userspace so needs massaging slightly */ | ||
| 207 | permissions &= KDB_ENABLE_MASK; | ||
| 208 | permissions |= KDB_ENABLE_ALWAYS_SAFE; | ||
| 209 | |||
| 210 | /* some commands change group when launched with no arguments */ | ||
| 211 | if (no_args) | ||
| 212 | permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT; | ||
| 213 | |||
| 214 | flags |= KDB_ENABLE_ALL; | ||
| 215 | |||
| 216 | return permissions & flags; | ||
| 217 | } | ||
| 218 | |||
| 219 | /* | ||
| 191 | * kdbgetenv - This function will return the character string value of | 220 | * kdbgetenv - This function will return the character string value of |
| 192 | * an environment variable. | 221 | * an environment variable. |
| 193 | * Parameters: | 222 | * Parameters: |
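
The helper above treats a command's flags as the set of permission categories it belongs to and the sysadmin-supplied mask as the set of categories currently enabled; a command may run if the two sets intersect, with ALWAYS_SAFE forced into the mask and ALL forced into every command. A small user-space rendition follows; the bit positions are assumptions chosen for illustration, since the real definitions live in include/linux/kdb.h and are not part of this hunk.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative bit assignments, not the kernel's actual values. */
#define ENABLE_ALL		(1 << 0)	/* sysadmin enabled everything */
#define ENABLE_MEM_READ		(1 << 1)
#define ENABLE_FLOW_CTRL	(1 << 6)
#define ENABLE_ALWAYS_SAFE	(1 << 9)
#define ENABLE_NO_ARGS_SHIFT	10
#define ENABLE_MASK		((1 << ENABLE_NO_ARGS_SHIFT) - 1)

/* Same shape as kdb_check_flags() above: a command runs if any category
 * it is flagged with has been enabled in the permission mask. */
static bool check_flags(int cmd_flags, int permissions, bool no_args)
{
	permissions &= ENABLE_MASK;
	permissions |= ENABLE_ALWAYS_SAFE;	/* safe commands always run */

	if (no_args)
		permissions |= permissions << ENABLE_NO_ARGS_SHIFT;

	cmd_flags |= ENABLE_ALL;	/* "all" permission matches any command */

	return permissions & cmd_flags;
}

int main(void)
{
	int perm = ENABLE_MEM_READ;		/* only memory reads enabled */

	printf("md   allowed: %d\n", check_flags(ENABLE_MEM_READ, perm, false));	/* 1 */
	printf("go   allowed: %d\n", check_flags(ENABLE_FLOW_CTRL, perm, false));	/* 0 */
	printf("help allowed: %d\n", check_flags(ENABLE_ALWAYS_SAFE, perm, false));	/* 1 */
	return 0;
}
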
| @@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg, | |||
| 476 | kdb_symtab_t symtab; | 505 | kdb_symtab_t symtab; |
| 477 | 506 | ||
| 478 | /* | 507 | /* |
| 508 | * If the enable flags prohibit both arbitrary memory access | ||
| 509 | * and flow control then there are no reasonable grounds to | ||
| 510 | * provide symbol lookup. | ||
| 511 | */ | ||
| 512 | if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL, | ||
| 513 | kdb_cmd_enabled, false)) | ||
| 514 | return KDB_NOPERM; | ||
| 515 | |||
| 516 | /* | ||
| 479 | * Process arguments which follow the following syntax: | 517 | * Process arguments which follow the following syntax: |
| 480 | * | 518 | * |
| 481 | * symbol | numeric-address [+/- numeric-offset] | 519 | * symbol | numeric-address [+/- numeric-offset] |
| @@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) | |||
| 641 | if (!s->count) | 679 | if (!s->count) |
| 642 | s->usable = 0; | 680 | s->usable = 0; |
| 643 | if (s->usable) | 681 | if (s->usable) |
| 644 | kdb_register(s->name, kdb_exec_defcmd, | 682 | /* macros are always safe because when executed each |
| 645 | s->usage, s->help, 0); | 683 | * internal command re-enters kdb_parse() and is |
| 684 | * safety checked individually. | ||
| 685 | */ | ||
| 686 | kdb_register_flags(s->name, kdb_exec_defcmd, s->usage, | ||
| 687 | s->help, 0, | ||
| 688 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 646 | return 0; | 689 | return 0; |
| 647 | } | 690 | } |
| 648 | if (!s->usable) | 691 | if (!s->usable) |
| @@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr) | |||
| 1003 | 1046 | ||
| 1004 | if (i < kdb_max_commands) { | 1047 | if (i < kdb_max_commands) { |
| 1005 | int result; | 1048 | int result; |
| 1049 | |||
| 1050 | if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1)) | ||
| 1051 | return KDB_NOPERM; | ||
| 1052 | |||
| 1006 | KDB_STATE_SET(CMD); | 1053 | KDB_STATE_SET(CMD); |
| 1007 | result = (*tp->cmd_func)(argc-1, (const char **)argv); | 1054 | result = (*tp->cmd_func)(argc-1, (const char **)argv); |
| 1008 | if (result && ignore_errors && result > KDB_CMD_GO) | 1055 | if (result && ignore_errors && result > KDB_CMD_GO) |
| 1009 | result = 0; | 1056 | result = 0; |
| 1010 | KDB_STATE_CLEAR(CMD); | 1057 | KDB_STATE_CLEAR(CMD); |
| 1011 | switch (tp->cmd_repeat) { | 1058 | |
| 1012 | case KDB_REPEAT_NONE: | 1059 | if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS) |
| 1013 | argc = 0; | 1060 | return result; |
| 1014 | if (argv[0]) | 1061 | |
| 1015 | *(argv[0]) = '\0'; | 1062 | argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0; |
| 1016 | break; | 1063 | if (argv[argc]) |
| 1017 | case KDB_REPEAT_NO_ARGS: | 1064 | *(argv[argc]) = '\0'; |
| 1018 | argc = 1; | ||
| 1019 | if (argv[1]) | ||
| 1020 | *(argv[1]) = '\0'; | ||
| 1021 | break; | ||
| 1022 | case KDB_REPEAT_WITH_ARGS: | ||
| 1023 | break; | ||
| 1024 | } | ||
| 1025 | return result; | 1065 | return result; |
| 1026 | } | 1066 | } |
| 1027 | 1067 | ||
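
The rewritten tail of kdb_parse() folds the old three-way repeat enum into flag bits: KDB_REPEAT_WITH_ARGS keeps the whole line for the next bare Enter, KDB_REPEAT_NO_ARGS blanks everything after argv[0], and no repeat flag blanks argv[0] itself. A standalone sketch of the same truncation trick, with invented flag values and a fake argv:

#include <stdio.h>

#define REPEAT_NO_ARGS   0x1	/* invented values for illustration */
#define REPEAT_WITH_ARGS 0x2

/* Decide how much of the parsed line survives for "press Enter to
 * repeat": everything, just the command name, or nothing at all. */
static void truncate_for_repeat(char **argv, int cmd_flags)
{
	int argc;

	if (cmd_flags & REPEAT_WITH_ARGS)
		return;				/* keep command and arguments */

	argc = (cmd_flags & REPEAT_NO_ARGS) ? 1 : 0;
	if (argv[argc])
		*(argv[argc]) = '\0';		/* blank out the rest */
}

int main(void)
{
	char cmd[] = "md";
	char arg[] = "0xc0000000";
	char *argv[] = { cmd, arg, NULL };

	truncate_for_repeat(argv, REPEAT_NO_ARGS);
	printf("repeats as: \"%s %s\"\n", argv[0], argv[1]);	/* -> "md " */
	return 0;
}
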
| @@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv) | |||
| 1921 | */ | 1961 | */ |
| 1922 | static int kdb_sr(int argc, const char **argv) | 1962 | static int kdb_sr(int argc, const char **argv) |
| 1923 | { | 1963 | { |
| 1964 | bool check_mask = | ||
| 1965 | !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false); | ||
| 1966 | |||
| 1924 | if (argc != 1) | 1967 | if (argc != 1) |
| 1925 | return KDB_ARGCOUNT; | 1968 | return KDB_ARGCOUNT; |
| 1969 | |||
| 1926 | kdb_trap_printk++; | 1970 | kdb_trap_printk++; |
| 1927 | __handle_sysrq(*argv[1], false); | 1971 | __handle_sysrq(*argv[1], check_mask); |
| 1928 | kdb_trap_printk--; | 1972 | kdb_trap_printk--; |
| 1929 | 1973 | ||
| 1930 | return 0; | 1974 | return 0; |
| @@ -1979,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv) | |||
| 1979 | kdb_printf("%-20s%8u 0x%p ", mod->name, | 2023 | kdb_printf("%-20s%8u 0x%p ", mod->name, |
| 1980 | mod->core_size, (void *)mod); | 2024 | mod->core_size, (void *)mod); |
| 1981 | #ifdef CONFIG_MODULE_UNLOAD | 2025 | #ifdef CONFIG_MODULE_UNLOAD |
| 1982 | kdb_printf("%4ld ", module_refcount(mod)); | 2026 | kdb_printf("%4d ", module_refcount(mod)); |
| 1983 | #endif | 2027 | #endif |
| 1984 | if (mod->state == MODULE_STATE_GOING) | 2028 | if (mod->state == MODULE_STATE_GOING) |
| 1985 | kdb_printf(" (Unloading)"); | 2029 | kdb_printf(" (Unloading)"); |
| @@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void) | |||
| 2157 | for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { | 2201 | for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { |
| 2158 | if (!cpu_online(i)) { | 2202 | if (!cpu_online(i)) { |
| 2159 | state = 'F'; /* cpu is offline */ | 2203 | state = 'F'; /* cpu is offline */ |
| 2204 | } else if (!kgdb_info[i].enter_kgdb) { | ||
| 2205 | state = 'D'; /* cpu is online but unresponsive */ | ||
| 2160 | } else { | 2206 | } else { |
| 2161 | state = ' '; /* cpu is responding to kdb */ | 2207 | state = ' '; /* cpu is responding to kdb */ |
| 2162 | if (kdb_task_state_char(KDB_TSK(i)) == 'I') | 2208 | if (kdb_task_state_char(KDB_TSK(i)) == 'I') |
| @@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv) | |||
| 2210 | /* | 2256 | /* |
| 2211 | * Validate cpunum | 2257 | * Validate cpunum |
| 2212 | */ | 2258 | */ |
| 2213 | if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) | 2259 | if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) |
| 2214 | return KDB_BADCPUNUM; | 2260 | return KDB_BADCPUNUM; |
| 2215 | 2261 | ||
| 2216 | dbg_switch_cpu = cpunum; | 2262 | dbg_switch_cpu = cpunum; |
| @@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv) | |||
| 2375 | return 0; | 2421 | return 0; |
| 2376 | if (!kt->cmd_name) | 2422 | if (!kt->cmd_name) |
| 2377 | continue; | 2423 | continue; |
| 2424 | if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true)) | ||
| 2425 | continue; | ||
| 2378 | if (strlen(kt->cmd_usage) > 20) | 2426 | if (strlen(kt->cmd_usage) > 20) |
| 2379 | space = "\n "; | 2427 | space = "\n "; |
| 2380 | kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, | 2428 | kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, |
| @@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv) | |||
| 2629 | } | 2677 | } |
| 2630 | 2678 | ||
| 2631 | /* | 2679 | /* |
| 2632 | * kdb_register_repeat - This function is used to register a kernel | 2680 | * kdb_register_flags - This function is used to register a kernel |
| 2633 | * debugger command. | 2681 | * debugger command. |
| 2634 | * Inputs: | 2682 | * Inputs: |
| 2635 | * cmd Command name | 2683 | * cmd Command name |
| @@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv) | |||
| 2641 | * zero for success, one if a duplicate command. | 2689 | * zero for success, one if a duplicate command. |
| 2642 | */ | 2690 | */ |
| 2643 | #define kdb_command_extend 50 /* arbitrary */ | 2691 | #define kdb_command_extend 50 /* arbitrary */ |
| 2644 | int kdb_register_repeat(char *cmd, | 2692 | int kdb_register_flags(char *cmd, |
| 2645 | kdb_func_t func, | 2693 | kdb_func_t func, |
| 2646 | char *usage, | 2694 | char *usage, |
| 2647 | char *help, | 2695 | char *help, |
| 2648 | short minlen, | 2696 | short minlen, |
| 2649 | kdb_repeat_t repeat) | 2697 | kdb_cmdflags_t flags) |
| 2650 | { | 2698 | { |
| 2651 | int i; | 2699 | int i; |
| 2652 | kdbtab_t *kp; | 2700 | kdbtab_t *kp; |
| @@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd, | |||
| 2694 | kp->cmd_func = func; | 2742 | kp->cmd_func = func; |
| 2695 | kp->cmd_usage = usage; | 2743 | kp->cmd_usage = usage; |
| 2696 | kp->cmd_help = help; | 2744 | kp->cmd_help = help; |
| 2697 | kp->cmd_flags = 0; | ||
| 2698 | kp->cmd_minlen = minlen; | 2745 | kp->cmd_minlen = minlen; |
| 2699 | kp->cmd_repeat = repeat; | 2746 | kp->cmd_flags = flags; |
| 2700 | 2747 | ||
| 2701 | return 0; | 2748 | return 0; |
| 2702 | } | 2749 | } |
| 2703 | EXPORT_SYMBOL_GPL(kdb_register_repeat); | 2750 | EXPORT_SYMBOL_GPL(kdb_register_flags); |
| 2704 | 2751 | ||
| 2705 | 2752 | ||
| 2706 | /* | 2753 | /* |
| 2707 | * kdb_register - Compatibility register function for commands that do | 2754 | * kdb_register - Compatibility register function for commands that do |
| 2708 | * not need to specify a repeat state. Equivalent to | 2755 | * not need to specify a repeat state. Equivalent to |
| 2709 | * kdb_register_repeat with KDB_REPEAT_NONE. | 2756 | * kdb_register_flags with flags set to 0. |
| 2710 | * Inputs: | 2757 | * Inputs: |
| 2711 | * cmd Command name | 2758 | * cmd Command name |
| 2712 | * func Function to execute the command | 2759 | * func Function to execute the command |
| @@ -2721,8 +2768,7 @@ int kdb_register(char *cmd, | |||
| 2721 | char *help, | 2768 | char *help, |
| 2722 | short minlen) | 2769 | short minlen) |
| 2723 | { | 2770 | { |
| 2724 | return kdb_register_repeat(cmd, func, usage, help, minlen, | 2771 | return kdb_register_flags(cmd, func, usage, help, minlen, 0); |
| 2725 | KDB_REPEAT_NONE); | ||
| 2726 | } | 2772 | } |
| 2727 | EXPORT_SYMBOL_GPL(kdb_register); | 2773 | EXPORT_SYMBOL_GPL(kdb_register); |
| 2728 | 2774 | ||
| @@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void) | |||
| 2764 | for_each_kdbcmd(kp, i) | 2810 | for_each_kdbcmd(kp, i) |
| 2765 | kp->cmd_name = NULL; | 2811 | kp->cmd_name = NULL; |
| 2766 | 2812 | ||
| 2767 | kdb_register_repeat("md", kdb_md, "<vaddr>", | 2813 | kdb_register_flags("md", kdb_md, "<vaddr>", |
| 2768 | "Display Memory Contents, also mdWcN, e.g. md8c1", 1, | 2814 | "Display Memory Contents, also mdWcN, e.g. md8c1", 1, |
| 2769 | KDB_REPEAT_NO_ARGS); | 2815 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2770 | kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", | 2816 | kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>", |
| 2771 | "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); | 2817 | "Display Raw Memory", 0, |
| 2772 | kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", | 2818 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2773 | "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); | 2819 | kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>", |
| 2774 | kdb_register_repeat("mds", kdb_md, "<vaddr>", | 2820 | "Display Physical Memory", 0, |
| 2775 | "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); | 2821 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2776 | kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", | 2822 | kdb_register_flags("mds", kdb_md, "<vaddr>", |
| 2777 | "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); | 2823 | "Display Memory Symbolically", 0, |
| 2778 | kdb_register_repeat("go", kdb_go, "[<vaddr>]", | 2824 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2779 | "Continue Execution", 1, KDB_REPEAT_NONE); | 2825 | kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>", |
| 2780 | kdb_register_repeat("rd", kdb_rd, "", | 2826 | "Modify Memory Contents", 0, |
| 2781 | "Display Registers", 0, KDB_REPEAT_NONE); | 2827 | KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS); |
| 2782 | kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", | 2828 | kdb_register_flags("go", kdb_go, "[<vaddr>]", |
| 2783 | "Modify Registers", 0, KDB_REPEAT_NONE); | 2829 | "Continue Execution", 1, |
| 2784 | kdb_register_repeat("ef", kdb_ef, "<vaddr>", | 2830 | KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); |
| 2785 | "Display exception frame", 0, KDB_REPEAT_NONE); | 2831 | kdb_register_flags("rd", kdb_rd, "", |
| 2786 | kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", | 2832 | "Display Registers", 0, |
| 2787 | "Stack traceback", 1, KDB_REPEAT_NONE); | 2833 | KDB_ENABLE_REG_READ); |
| 2788 | kdb_register_repeat("btp", kdb_bt, "<pid>", | 2834 | kdb_register_flags("rm", kdb_rm, "<reg> <contents>", |
| 2789 | "Display stack for process <pid>", 0, KDB_REPEAT_NONE); | 2835 | "Modify Registers", 0, |
| 2790 | kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", | 2836 | KDB_ENABLE_REG_WRITE); |
| 2791 | "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); | 2837 | kdb_register_flags("ef", kdb_ef, "<vaddr>", |
| 2792 | kdb_register_repeat("btc", kdb_bt, "", | 2838 | "Display exception frame", 0, |
| 2793 | "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); | 2839 | KDB_ENABLE_MEM_READ); |
| 2794 | kdb_register_repeat("btt", kdb_bt, "<vaddr>", | 2840 | kdb_register_flags("bt", kdb_bt, "[<vaddr>]", |
| 2841 | "Stack traceback", 1, | ||
| 2842 | KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); | ||
| 2843 | kdb_register_flags("btp", kdb_bt, "<pid>", | ||
| 2844 | "Display stack for process <pid>", 0, | ||
| 2845 | KDB_ENABLE_INSPECT); | ||
| 2846 | kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", | ||
| 2847 | "Backtrace all processes matching state flag", 0, | ||
| 2848 | KDB_ENABLE_INSPECT); | ||
| 2849 | kdb_register_flags("btc", kdb_bt, "", | ||
| 2850 | "Backtrace current process on each cpu", 0, | ||
| 2851 | KDB_ENABLE_INSPECT); | ||
| 2852 | kdb_register_flags("btt", kdb_bt, "<vaddr>", | ||
| 2795 | "Backtrace process given its struct task address", 0, | 2853 | "Backtrace process given its struct task address", 0, |
| 2796 | KDB_REPEAT_NONE); | 2854 | KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); |
| 2797 | kdb_register_repeat("env", kdb_env, "", | 2855 | kdb_register_flags("env", kdb_env, "", |
| 2798 | "Show environment variables", 0, KDB_REPEAT_NONE); | 2856 | "Show environment variables", 0, |
| 2799 | kdb_register_repeat("set", kdb_set, "", | 2857 | KDB_ENABLE_ALWAYS_SAFE); |
| 2800 | "Set environment variables", 0, KDB_REPEAT_NONE); | 2858 | kdb_register_flags("set", kdb_set, "", |
| 2801 | kdb_register_repeat("help", kdb_help, "", | 2859 | "Set environment variables", 0, |
| 2802 | "Display Help Message", 1, KDB_REPEAT_NONE); | 2860 | KDB_ENABLE_ALWAYS_SAFE); |
| 2803 | kdb_register_repeat("?", kdb_help, "", | 2861 | kdb_register_flags("help", kdb_help, "", |
| 2804 | "Display Help Message", 0, KDB_REPEAT_NONE); | 2862 | "Display Help Message", 1, |
| 2805 | kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", | 2863 | KDB_ENABLE_ALWAYS_SAFE); |
| 2806 | "Switch to new cpu", 0, KDB_REPEAT_NONE); | 2864 | kdb_register_flags("?", kdb_help, "", |
| 2807 | kdb_register_repeat("kgdb", kdb_kgdb, "", | 2865 | "Display Help Message", 0, |
| 2808 | "Enter kgdb mode", 0, KDB_REPEAT_NONE); | 2866 | KDB_ENABLE_ALWAYS_SAFE); |
| 2809 | kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", | 2867 | kdb_register_flags("cpu", kdb_cpu, "<cpunum>", |
| 2810 | "Display active task list", 0, KDB_REPEAT_NONE); | 2868 | "Switch to new cpu", 0, |
| 2811 | kdb_register_repeat("pid", kdb_pid, "<pidnum>", | 2869 | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); |
| 2812 | "Switch to another task", 0, KDB_REPEAT_NONE); | 2870 | kdb_register_flags("kgdb", kdb_kgdb, "", |
| 2813 | kdb_register_repeat("reboot", kdb_reboot, "", | 2871 | "Enter kgdb mode", 0, 0); |
| 2814 | "Reboot the machine immediately", 0, KDB_REPEAT_NONE); | 2872 | kdb_register_flags("ps", kdb_ps, "[<flags>|A]", |
| 2873 | "Display active task list", 0, | ||
| 2874 | KDB_ENABLE_INSPECT); | ||
| 2875 | kdb_register_flags("pid", kdb_pid, "<pidnum>", | ||
| 2876 | "Switch to another task", 0, | ||
| 2877 | KDB_ENABLE_INSPECT); | ||
| 2878 | kdb_register_flags("reboot", kdb_reboot, "", | ||
| 2879 | "Reboot the machine immediately", 0, | ||
| 2880 | KDB_ENABLE_REBOOT); | ||
| 2815 | #if defined(CONFIG_MODULES) | 2881 | #if defined(CONFIG_MODULES) |
| 2816 | kdb_register_repeat("lsmod", kdb_lsmod, "", | 2882 | kdb_register_flags("lsmod", kdb_lsmod, "", |
| 2817 | "List loaded kernel modules", 0, KDB_REPEAT_NONE); | 2883 | "List loaded kernel modules", 0, |
| 2884 | KDB_ENABLE_INSPECT); | ||
| 2818 | #endif | 2885 | #endif |
| 2819 | #if defined(CONFIG_MAGIC_SYSRQ) | 2886 | #if defined(CONFIG_MAGIC_SYSRQ) |
| 2820 | kdb_register_repeat("sr", kdb_sr, "<key>", | 2887 | kdb_register_flags("sr", kdb_sr, "<key>", |
| 2821 | "Magic SysRq key", 0, KDB_REPEAT_NONE); | 2888 | "Magic SysRq key", 0, |
| 2889 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 2822 | #endif | 2890 | #endif |
| 2823 | #if defined(CONFIG_PRINTK) | 2891 | #if defined(CONFIG_PRINTK) |
| 2824 | kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", | 2892 | kdb_register_flags("dmesg", kdb_dmesg, "[lines]", |
| 2825 | "Display syslog buffer", 0, KDB_REPEAT_NONE); | 2893 | "Display syslog buffer", 0, |
| 2894 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 2826 | #endif | 2895 | #endif |
| 2827 | if (arch_kgdb_ops.enable_nmi) { | 2896 | if (arch_kgdb_ops.enable_nmi) { |
| 2828 | kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", | 2897 | kdb_register_flags("disable_nmi", kdb_disable_nmi, "", |
| 2829 | "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); | 2898 | "Disable NMI entry to KDB", 0, |
| 2830 | } | 2899 | KDB_ENABLE_ALWAYS_SAFE); |
| 2831 | kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", | 2900 | } |
| 2832 | "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); | 2901 | kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"", |
| 2833 | kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", | 2902 | "Define a set of commands, down to endefcmd", 0, |
| 2834 | "Send a signal to a process", 0, KDB_REPEAT_NONE); | 2903 | KDB_ENABLE_ALWAYS_SAFE); |
| 2835 | kdb_register_repeat("summary", kdb_summary, "", | 2904 | kdb_register_flags("kill", kdb_kill, "<-signal> <pid>", |
| 2836 | "Summarize the system", 4, KDB_REPEAT_NONE); | 2905 | "Send a signal to a process", 0, |
| 2837 | kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", | 2906 | KDB_ENABLE_SIGNAL); |
| 2838 | "Display per_cpu variables", 3, KDB_REPEAT_NONE); | 2907 | kdb_register_flags("summary", kdb_summary, "", |
| 2839 | kdb_register_repeat("grephelp", kdb_grep_help, "", | 2908 | "Summarize the system", 4, |
| 2840 | "Display help on | grep", 0, KDB_REPEAT_NONE); | 2909 | KDB_ENABLE_ALWAYS_SAFE); |
| 2910 | kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", | ||
| 2911 | "Display per_cpu variables", 3, | ||
| 2912 | KDB_ENABLE_MEM_READ); | ||
| 2913 | kdb_register_flags("grephelp", kdb_grep_help, "", | ||
| 2914 | "Display help on | grep", 0, | ||
| 2915 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 2841 | } | 2916 | } |
| 2842 | 2917 | ||
| 2843 | /* Execute any commands defined in kdb_cmds. */ | 2918 | /* Execute any commands defined in kdb_cmds. */ |
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 7afd3c8c41d5..eaacd1693954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
| @@ -172,10 +172,9 @@ typedef struct _kdbtab { | |||
| 172 | kdb_func_t cmd_func; /* Function to execute command */ | 172 | kdb_func_t cmd_func; /* Function to execute command */ |
| 173 | char *cmd_usage; /* Usage String for this command */ | 173 | char *cmd_usage; /* Usage String for this command */ |
| 174 | char *cmd_help; /* Help message for this command */ | 174 | char *cmd_help; /* Help message for this command */ |
| 175 | short cmd_flags; /* Parsing flags */ | ||
| 176 | short cmd_minlen; /* Minimum legal # command | 175 | short cmd_minlen; /* Minimum legal # command |
| 177 | * chars required */ | 176 | * chars required */ |
| 178 | kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ | 177 | kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ |
| 179 | } kdbtab_t; | 178 | } kdbtab_t; |
| 180 | 179 | ||
| 181 | extern int kdb_bt(int, const char **); /* KDB display back trace */ | 180 | extern int kdb_bt(int, const char **); /* KDB display back trace */ |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e19d3ebc29c..b4a696c4dc76 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -614,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
| 614 | if (!f.file) | 614 | if (!f.file) |
| 615 | return -EBADF; | 615 | return -EBADF; |
| 616 | 616 | ||
| 617 | css = css_tryget_online_from_dir(f.file->f_dentry, | 617 | css = css_tryget_online_from_dir(f.file->f_path.dentry, |
| 618 | &perf_event_cgrp_subsys); | 618 | &perf_event_cgrp_subsys); |
| 619 | if (IS_ERR(css)) { | 619 | if (IS_ERR(css)) { |
| 620 | ret = PTR_ERR(css); | 620 | ret = PTR_ERR(css); |
| @@ -4461,18 +4461,14 @@ perf_output_sample_regs(struct perf_output_handle *handle, | |||
| 4461 | } | 4461 | } |
| 4462 | 4462 | ||
| 4463 | static void perf_sample_regs_user(struct perf_regs *regs_user, | 4463 | static void perf_sample_regs_user(struct perf_regs *regs_user, |
| 4464 | struct pt_regs *regs) | 4464 | struct pt_regs *regs, |
| 4465 | struct pt_regs *regs_user_copy) | ||
| 4465 | { | 4466 | { |
| 4466 | if (!user_mode(regs)) { | 4467 | if (user_mode(regs)) { |
| 4467 | if (current->mm) | 4468 | regs_user->abi = perf_reg_abi(current); |
| 4468 | regs = task_pt_regs(current); | ||
| 4469 | else | ||
| 4470 | regs = NULL; | ||
| 4471 | } | ||
| 4472 | |||
| 4473 | if (regs) { | ||
| 4474 | regs_user->abi = perf_reg_abi(current); | ||
| 4475 | regs_user->regs = regs; | 4469 | regs_user->regs = regs; |
| 4470 | } else if (current->mm) { | ||
| 4471 | perf_get_regs_user(regs_user, regs, regs_user_copy); | ||
| 4476 | } else { | 4472 | } else { |
| 4477 | regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; | 4473 | regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; |
| 4478 | regs_user->regs = NULL; | 4474 | regs_user->regs = NULL; |
| @@ -4951,7 +4947,8 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
| 4951 | } | 4947 | } |
| 4952 | 4948 | ||
| 4953 | if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) | 4949 | if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) |
| 4954 | perf_sample_regs_user(&data->regs_user, regs); | 4950 | perf_sample_regs_user(&data->regs_user, regs, |
| 4951 | &data->regs_user_copy); | ||
| 4955 | 4952 | ||
| 4956 | if (sample_type & PERF_SAMPLE_REGS_USER) { | 4953 | if (sample_type & PERF_SAMPLE_REGS_USER) { |
| 4957 | /* regs dump ABI info */ | 4954 | /* regs dump ABI info */ |
| @@ -5892,6 +5889,8 @@ end: | |||
| 5892 | rcu_read_unlock(); | 5889 | rcu_read_unlock(); |
| 5893 | } | 5890 | } |
| 5894 | 5891 | ||
| 5892 | DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); | ||
| 5893 | |||
| 5895 | int perf_swevent_get_recursion_context(void) | 5894 | int perf_swevent_get_recursion_context(void) |
| 5896 | { | 5895 | { |
| 5897 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); | 5896 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
| @@ -5907,21 +5906,30 @@ inline void perf_swevent_put_recursion_context(int rctx) | |||
| 5907 | put_recursion_context(swhash->recursion, rctx); | 5906 | put_recursion_context(swhash->recursion, rctx); |
| 5908 | } | 5907 | } |
| 5909 | 5908 | ||
| 5910 | void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | 5909 | void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) |
| 5911 | { | 5910 | { |
| 5912 | struct perf_sample_data data; | 5911 | struct perf_sample_data data; |
| 5913 | int rctx; | ||
| 5914 | 5912 | ||
| 5915 | preempt_disable_notrace(); | 5913 | if (WARN_ON_ONCE(!regs)) |
| 5916 | rctx = perf_swevent_get_recursion_context(); | ||
| 5917 | if (rctx < 0) | ||
| 5918 | return; | 5914 | return; |
| 5919 | 5915 | ||
| 5920 | perf_sample_data_init(&data, addr, 0); | 5916 | perf_sample_data_init(&data, addr, 0); |
| 5921 | |||
| 5922 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); | 5917 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); |
| 5918 | } | ||
| 5919 | |||
| 5920 | void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | ||
| 5921 | { | ||
| 5922 | int rctx; | ||
| 5923 | |||
| 5924 | preempt_disable_notrace(); | ||
| 5925 | rctx = perf_swevent_get_recursion_context(); | ||
| 5926 | if (unlikely(rctx < 0)) | ||
| 5927 | goto fail; | ||
| 5928 | |||
| 5929 | ___perf_sw_event(event_id, nr, regs, addr); | ||
| 5923 | 5930 | ||
| 5924 | perf_swevent_put_recursion_context(rctx); | 5931 | perf_swevent_put_recursion_context(rctx); |
| 5932 | fail: | ||
| 5925 | preempt_enable_notrace(); | 5933 | preempt_enable_notrace(); |
| 5926 | } | 5934 | } |
| 5927 | 5935 | ||
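
Splitting __perf_sw_event() into an outer wrapper (preemption plus recursion context) and an inner ___perf_sw_event() lets callers that already hold a context skip the bookkeeping. The toy user-space sketch below mirrors that shape with a thread-local guard; all names and the guard mechanism are invented, whereas the real code uses the per-CPU swevent recursion array.

#include <stdio.h>

/* Invented stand-in for the per-CPU recursion bookkeeping. */
static _Thread_local int in_handler;

/* Inner helper: does the work and assumes the caller already holds the
 * recursion guard (the ___-prefixed kernel helper plays this role). */
static void do_emit_event(int id)
{
	printf("event %d\n", id);
}

/* Outer wrapper: grabs the guard, silently drops the event if we are
 * already inside a handler, then delegates to the inner helper. */
static void emit_event(int id)
{
	if (in_handler)
		return;
	in_handler = 1;
	do_emit_event(id);
	in_handler = 0;
}

int main(void)
{
	emit_event(1);		/* prints "event 1" */
	emit_event(2);		/* prints "event 2" */
	return 0;
}
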
| @@ -6779,7 +6787,6 @@ skip_type: | |||
| 6779 | __perf_event_init_context(&cpuctx->ctx); | 6787 | __perf_event_init_context(&cpuctx->ctx); |
| 6780 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); | 6788 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); |
| 6781 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); | 6789 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); |
| 6782 | cpuctx->ctx.type = cpu_context; | ||
| 6783 | cpuctx->ctx.pmu = pmu; | 6790 | cpuctx->ctx.pmu = pmu; |
| 6784 | 6791 | ||
| 6785 | __perf_cpu_hrtimer_init(cpuctx, cpu); | 6792 | __perf_cpu_hrtimer_init(cpuctx, cpu); |
| @@ -7423,7 +7430,19 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7423 | * task or CPU context: | 7430 | * task or CPU context: |
| 7424 | */ | 7431 | */ |
| 7425 | if (move_group) { | 7432 | if (move_group) { |
| 7426 | if (group_leader->ctx->type != ctx->type) | 7433 | /* |
| 7434 | * Make sure we're both on the same task, or both | ||
| 7435 | * per-cpu events. | ||
| 7436 | */ | ||
| 7437 | if (group_leader->ctx->task != ctx->task) | ||
| 7438 | goto err_context; | ||
| 7439 | |||
| 7440 | /* | ||
| 7441 | * Make sure we're both events for the same CPU; | ||
| 7442 | * grouping events for different CPUs is broken; since | ||
| 7443 | * you can never concurrently schedule them anyhow. | ||
| 7444 | */ | ||
| 7445 | if (group_leader->cpu != event->cpu) | ||
| 7427 | goto err_context; | 7446 | goto err_context; |
| 7428 | } else { | 7447 | } else { |
| 7429 | if (group_leader->ctx != ctx) | 7448 | if (group_leader->ctx != ctx) |
| @@ -7477,11 +7496,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7477 | 7496 | ||
| 7478 | if (move_group) { | 7497 | if (move_group) { |
| 7479 | synchronize_rcu(); | 7498 | synchronize_rcu(); |
| 7480 | perf_install_in_context(ctx, group_leader, event->cpu); | 7499 | perf_install_in_context(ctx, group_leader, group_leader->cpu); |
| 7481 | get_ctx(ctx); | 7500 | get_ctx(ctx); |
| 7482 | list_for_each_entry(sibling, &group_leader->sibling_list, | 7501 | list_for_each_entry(sibling, &group_leader->sibling_list, |
| 7483 | group_entry) { | 7502 | group_entry) { |
| 7484 | perf_install_in_context(ctx, sibling, event->cpu); | 7503 | perf_install_in_context(ctx, sibling, sibling->cpu); |
| 7485 | get_ctx(ctx); | 7504 | get_ctx(ctx); |
| 7486 | } | 7505 | } |
| 7487 | } | 7506 | } |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ed8f2cde34c5..cb346f26a22d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 193 | } | 193 | } |
| 194 | 194 | ||
| 195 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 195 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
| 196 | ptep_clear_flush(vma, addr, ptep); | 196 | ptep_clear_flush_notify(vma, addr, ptep); |
| 197 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | 197 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
| 198 | 198 | ||
| 199 | page_remove_rmap(page); | 199 | page_remove_rmap(page); |
| @@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
| 724 | int more = 0; | 724 | int more = 0; |
| 725 | 725 | ||
| 726 | again: | 726 | again: |
| 727 | mutex_lock(&mapping->i_mmap_mutex); | 727 | i_mmap_lock_read(mapping); |
| 728 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 728 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
| 729 | if (!valid_vma(vma, is_register)) | 729 | if (!valid_vma(vma, is_register)) |
| 730 | continue; | 730 | continue; |
| 731 | 731 | ||
| 732 | if (!prev && !more) { | 732 | if (!prev && !more) { |
| 733 | /* | 733 | /* |
| 734 | * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through | 734 | * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through |
| 735 | * reclaim. This is optimistic, no harm done if it fails. | 735 | * reclaim. This is optimistic, no harm done if it fails. |
| 736 | */ | 736 | */ |
| 737 | prev = kmalloc(sizeof(struct map_info), | 737 | prev = kmalloc(sizeof(struct map_info), |
| @@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
| 755 | info->mm = vma->vm_mm; | 755 | info->mm = vma->vm_mm; |
| 756 | info->vaddr = offset_to_vaddr(vma, offset); | 756 | info->vaddr = offset_to_vaddr(vma, offset); |
| 757 | } | 757 | } |
| 758 | mutex_unlock(&mapping->i_mmap_mutex); | 758 | i_mmap_unlock_read(mapping); |
| 759 | 759 | ||
| 760 | if (!more) | 760 | if (!more) |
| 761 | goto out; | 761 | goto out; |
diff --git a/kernel/exit.c b/kernel/exit.c index 5d30019ff953..6806c55475ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk) | |||
| 118 | } | 118 | } |
| 119 | 119 | ||
| 120 | /* | 120 | /* |
| 121 | * Accumulate here the counters for all threads but the group leader | 121 | * Accumulate here the counters for all threads as they die. We could |
| 122 | * as they die, so they can be added into the process-wide totals | 122 | * skip the group leader because it is the last user of signal_struct, |
| 123 | * when those are taken. The group leader stays around as a zombie as | 123 | * but we want to avoid the race with thread_group_cputime() which can |
| 124 | * long as there are other threads. When it gets reaped, the exit.c | 124 | * see the empty ->thread_head list. |
| 125 | * code will add its counts into these totals. We won't ever get here | ||
| 126 | * for the group leader, since it will have been the last reference on | ||
| 127 | * the signal_struct. | ||
| 128 | */ | 125 | */ |
| 129 | task_cputime(tsk, &utime, &stime); | 126 | task_cputime(tsk, &utime, &stime); |
| 130 | write_seqlock(&sig->stats_lock); | 127 | write_seqlock(&sig->stats_lock); |
| @@ -215,27 +212,6 @@ repeat: | |||
| 215 | } | 212 | } |
| 216 | 213 | ||
| 217 | /* | 214 | /* |
| 218 | * This checks not only the pgrp, but falls back on the pid if no | ||
| 219 | * satisfactory pgrp is found. I dunno - gdb doesn't work correctly | ||
| 220 | * without this... | ||
| 221 | * | ||
| 222 | * The caller must hold rcu lock or the tasklist lock. | ||
| 223 | */ | ||
| 224 | struct pid *session_of_pgrp(struct pid *pgrp) | ||
| 225 | { | ||
| 226 | struct task_struct *p; | ||
| 227 | struct pid *sid = NULL; | ||
| 228 | |||
| 229 | p = pid_task(pgrp, PIDTYPE_PGID); | ||
| 230 | if (p == NULL) | ||
| 231 | p = pid_task(pgrp, PIDTYPE_PID); | ||
| 232 | if (p != NULL) | ||
| 233 | sid = task_session(p); | ||
| 234 | |||
| 235 | return sid; | ||
| 236 | } | ||
| 237 | |||
| 238 | /* | ||
| 239 | * Determine if a process group is "orphaned", according to the POSIX | 215 | * Determine if a process group is "orphaned", according to the POSIX |
| 240 | * definition in 2.2.2.52. Orphaned process groups are not to be affected | 216 | * definition in 2.2.2.52. Orphaned process groups are not to be affected |
| 241 | * by terminal-generated stop signals. Newly orphaned process groups are | 217 | * by terminal-generated stop signals. Newly orphaned process groups are |
| @@ -462,6 +438,44 @@ static void exit_mm(struct task_struct *tsk) | |||
| 462 | clear_thread_flag(TIF_MEMDIE); | 438 | clear_thread_flag(TIF_MEMDIE); |
| 463 | } | 439 | } |
| 464 | 440 | ||
| 441 | static struct task_struct *find_alive_thread(struct task_struct *p) | ||
| 442 | { | ||
| 443 | struct task_struct *t; | ||
| 444 | |||
| 445 | for_each_thread(p, t) { | ||
| 446 | if (!(t->flags & PF_EXITING)) | ||
| 447 | return t; | ||
| 448 | } | ||
| 449 | return NULL; | ||
| 450 | } | ||
| 451 | |||
| 452 | static struct task_struct *find_child_reaper(struct task_struct *father) | ||
| 453 | __releases(&tasklist_lock) | ||
| 454 | __acquires(&tasklist_lock) | ||
| 455 | { | ||
| 456 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | ||
| 457 | struct task_struct *reaper = pid_ns->child_reaper; | ||
| 458 | |||
| 459 | if (likely(reaper != father)) | ||
| 460 | return reaper; | ||
| 461 | |||
| 462 | reaper = find_alive_thread(father); | ||
| 463 | if (reaper) { | ||
| 464 | pid_ns->child_reaper = reaper; | ||
| 465 | return reaper; | ||
| 466 | } | ||
| 467 | |||
| 468 | write_unlock_irq(&tasklist_lock); | ||
| 469 | if (unlikely(pid_ns == &init_pid_ns)) { | ||
| 470 | panic("Attempted to kill init! exitcode=0x%08x\n", | ||
| 471 | father->signal->group_exit_code ?: father->exit_code); | ||
| 472 | } | ||
| 473 | zap_pid_ns_processes(pid_ns); | ||
| 474 | write_lock_irq(&tasklist_lock); | ||
| 475 | |||
| 476 | return father; | ||
| 477 | } | ||
| 478 | |||
| 465 | /* | 479 | /* |
| 466 | * When we die, we re-parent all our children, and try to: | 480 | * When we die, we re-parent all our children, and try to: |
| 467 | * 1. give them to another thread in our thread group, if such a member exists | 481 | * 1. give them to another thread in our thread group, if such a member exists |
| @@ -469,58 +483,36 @@ static void exit_mm(struct task_struct *tsk) | |||
| 469 | * child_subreaper for its children (like a service manager) | 483 | * child_subreaper for its children (like a service manager) |
| 470 | * 3. give it to the init process (PID 1) in our pid namespace | 484 | * 3. give it to the init process (PID 1) in our pid namespace |
| 471 | */ | 485 | */ |
| 472 | static struct task_struct *find_new_reaper(struct task_struct *father) | 486 | static struct task_struct *find_new_reaper(struct task_struct *father, |
| 473 | __releases(&tasklist_lock) | 487 | struct task_struct *child_reaper) |
| 474 | __acquires(&tasklist_lock) | ||
| 475 | { | 488 | { |
| 476 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | 489 | struct task_struct *thread, *reaper; |
| 477 | struct task_struct *thread; | ||
| 478 | 490 | ||
| 479 | thread = father; | 491 | thread = find_alive_thread(father); |
| 480 | while_each_thread(father, thread) { | 492 | if (thread) |
| 481 | if (thread->flags & PF_EXITING) | ||
| 482 | continue; | ||
| 483 | if (unlikely(pid_ns->child_reaper == father)) | ||
| 484 | pid_ns->child_reaper = thread; | ||
| 485 | return thread; | 493 | return thread; |
| 486 | } | ||
| 487 | |||
| 488 | if (unlikely(pid_ns->child_reaper == father)) { | ||
| 489 | write_unlock_irq(&tasklist_lock); | ||
| 490 | if (unlikely(pid_ns == &init_pid_ns)) { | ||
| 491 | panic("Attempted to kill init! exitcode=0x%08x\n", | ||
| 492 | father->signal->group_exit_code ?: | ||
| 493 | father->exit_code); | ||
| 494 | } | ||
| 495 | |||
| 496 | zap_pid_ns_processes(pid_ns); | ||
| 497 | write_lock_irq(&tasklist_lock); | ||
| 498 | } else if (father->signal->has_child_subreaper) { | ||
| 499 | struct task_struct *reaper; | ||
| 500 | 494 | ||
| 495 | if (father->signal->has_child_subreaper) { | ||
| 501 | /* | 496 | /* |
| 502 | * Find the first ancestor marked as child_subreaper. | 497 | * Find the first ->is_child_subreaper ancestor in our pid_ns. |
| 503 | * Note that the code below checks same_thread_group(reaper, | 498 | * We start from father to ensure we can not look into another |
| 504 | * pid_ns->child_reaper). This is what we need to DTRT in a | 499 | * namespace, this is safe because all its threads are dead. |
| 505 | * PID namespace. However we still need the check above, see | ||
| 506 | * http://marc.info/?l=linux-kernel&m=131385460420380 | ||
| 507 | */ | 500 | */ |
| 508 | for (reaper = father->real_parent; | 501 | for (reaper = father; |
| 509 | reaper != &init_task; | 502 | !same_thread_group(reaper, child_reaper); |
| 510 | reaper = reaper->real_parent) { | 503 | reaper = reaper->real_parent) { |
| 511 | if (same_thread_group(reaper, pid_ns->child_reaper)) | 504 | /* call_usermodehelper() descendants need this check */ |
| 505 | if (reaper == &init_task) | ||
| 512 | break; | 506 | break; |
| 513 | if (!reaper->signal->is_child_subreaper) | 507 | if (!reaper->signal->is_child_subreaper) |
| 514 | continue; | 508 | continue; |
| 515 | thread = reaper; | 509 | thread = find_alive_thread(reaper); |
| 516 | do { | 510 | if (thread) |
| 517 | if (!(thread->flags & PF_EXITING)) | 511 | return thread; |
| 518 | return reaper; | ||
| 519 | } while_each_thread(reaper, thread); | ||
| 520 | } | 512 | } |
| 521 | } | 513 | } |
| 522 | 514 | ||
| 523 | return pid_ns->child_reaper; | 515 | return child_reaper; |
| 524 | } | 516 | } |
| 525 | 517 | ||
| 526 | /* | 518 | /* |
| @@ -529,15 +521,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) | |||
| 529 | static void reparent_leader(struct task_struct *father, struct task_struct *p, | 521 | static void reparent_leader(struct task_struct *father, struct task_struct *p, |
| 530 | struct list_head *dead) | 522 | struct list_head *dead) |
| 531 | { | 523 | { |
| 532 | list_move_tail(&p->sibling, &p->real_parent->children); | 524 | if (unlikely(p->exit_state == EXIT_DEAD)) |
| 533 | |||
| 534 | if (p->exit_state == EXIT_DEAD) | ||
| 535 | return; | ||
| 536 | /* | ||
| 537 | * If this is a threaded reparent there is no need to | ||
| 538 | * notify anyone anything has happened. | ||
| 539 | */ | ||
| 540 | if (same_thread_group(p->real_parent, father)) | ||
| 541 | return; | 525 | return; |
| 542 | 526 | ||
| 543 | /* We don't want people slaying init. */ | 527 | /* We don't want people slaying init. */ |
| @@ -548,49 +532,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, | |||
| 548 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { | 532 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { |
| 549 | if (do_notify_parent(p, p->exit_signal)) { | 533 | if (do_notify_parent(p, p->exit_signal)) { |
| 550 | p->exit_state = EXIT_DEAD; | 534 | p->exit_state = EXIT_DEAD; |
| 551 | list_move_tail(&p->sibling, dead); | 535 | list_add(&p->ptrace_entry, dead); |
| 552 | } | 536 | } |
| 553 | } | 537 | } |
| 554 | 538 | ||
| 555 | kill_orphaned_pgrp(p, father); | 539 | kill_orphaned_pgrp(p, father); |
| 556 | } | 540 | } |
| 557 | 541 | ||
| 558 | static void forget_original_parent(struct task_struct *father) | 542 | /* |
| 543 | * This does two things: | ||
| 544 | * | ||
| 545 | * A. Make init inherit all the child processes | ||
| 546 | * B. Check to see if any process groups have become orphaned | ||
| 547 | * as a result of our exiting, and if they have any stopped | ||
| 548 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | ||
| 549 | */ | ||
| 550 | static void forget_original_parent(struct task_struct *father, | ||
| 551 | struct list_head *dead) | ||
| 559 | { | 552 | { |
| 560 | struct task_struct *p, *n, *reaper; | 553 | struct task_struct *p, *t, *reaper; |
| 561 | LIST_HEAD(dead_children); | ||
| 562 | 554 | ||
| 563 | write_lock_irq(&tasklist_lock); | 555 | if (unlikely(!list_empty(&father->ptraced))) |
| 564 | /* | 556 | exit_ptrace(father, dead); |
| 565 | * Note that exit_ptrace() and find_new_reaper() might | ||
| 566 | * drop tasklist_lock and reacquire it. | ||
| 567 | */ | ||
| 568 | exit_ptrace(father); | ||
| 569 | reaper = find_new_reaper(father); | ||
| 570 | 557 | ||
| 571 | list_for_each_entry_safe(p, n, &father->children, sibling) { | 558 | /* Can drop and reacquire tasklist_lock */ |
| 572 | struct task_struct *t = p; | 559 | reaper = find_child_reaper(father); |
| 560 | if (list_empty(&father->children)) | ||
| 561 | return; | ||
| 573 | 562 | ||
| 574 | do { | 563 | reaper = find_new_reaper(father, reaper); |
| 564 | list_for_each_entry(p, &father->children, sibling) { | ||
| 565 | for_each_thread(p, t) { | ||
| 575 | t->real_parent = reaper; | 566 | t->real_parent = reaper; |
| 576 | if (t->parent == father) { | 567 | BUG_ON((!t->ptrace) != (t->parent == father)); |
| 577 | BUG_ON(t->ptrace); | 568 | if (likely(!t->ptrace)) |
| 578 | t->parent = t->real_parent; | 569 | t->parent = t->real_parent; |
| 579 | } | ||
| 580 | if (t->pdeath_signal) | 570 | if (t->pdeath_signal) |
| 581 | group_send_sig_info(t->pdeath_signal, | 571 | group_send_sig_info(t->pdeath_signal, |
| 582 | SEND_SIG_NOINFO, t); | 572 | SEND_SIG_NOINFO, t); |
| 583 | } while_each_thread(p, t); | 573 | } |
| 584 | reparent_leader(father, p, &dead_children); | 574 | /* |
| 585 | } | 575 | * If this is a threaded reparent there is no need to |
| 586 | write_unlock_irq(&tasklist_lock); | 576 | * notify anyone anything has happened. |
| 587 | 577 | */ | |
| 588 | BUG_ON(!list_empty(&father->children)); | 578 | if (!same_thread_group(reaper, father)) |
| 589 | 579 | reparent_leader(father, p, dead); | |
| 590 | list_for_each_entry_safe(p, n, &dead_children, sibling) { | ||
| 591 | list_del_init(&p->sibling); | ||
| 592 | release_task(p); | ||
| 593 | } | 580 | } |
| 581 | list_splice_tail_init(&father->children, &reaper->children); | ||
| 594 | } | 582 | } |
| 595 | 583 | ||
| 596 | /* | 584 | /* |
| @@ -600,18 +588,12 @@ static void forget_original_parent(struct task_struct *father) | |||
| 600 | static void exit_notify(struct task_struct *tsk, int group_dead) | 588 | static void exit_notify(struct task_struct *tsk, int group_dead) |
| 601 | { | 589 | { |
| 602 | bool autoreap; | 590 | bool autoreap; |
| 603 | 591 | struct task_struct *p, *n; | |
| 604 | /* | 592 | LIST_HEAD(dead); |
| 605 | * This does two things: | ||
| 606 | * | ||
| 607 | * A. Make init inherit all the child processes | ||
| 608 | * B. Check to see if any process groups have become orphaned | ||
| 609 | * as a result of our exiting, and if they have any stopped | ||
| 610 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | ||
| 611 | */ | ||
| 612 | forget_original_parent(tsk); | ||
| 613 | 593 | ||
| 614 | write_lock_irq(&tasklist_lock); | 594 | write_lock_irq(&tasklist_lock); |
| 595 | forget_original_parent(tsk, &dead); | ||
| 596 | |||
| 615 | if (group_dead) | 597 | if (group_dead) |
| 616 | kill_orphaned_pgrp(tsk->group_leader, NULL); | 598 | kill_orphaned_pgrp(tsk->group_leader, NULL); |
| 617 | 599 | ||
| @@ -629,15 +611,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
| 629 | } | 611 | } |
| 630 | 612 | ||
| 631 | tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; | 613 | tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; |
| 614 | if (tsk->exit_state == EXIT_DEAD) | ||
| 615 | list_add(&tsk->ptrace_entry, &dead); | ||
| 632 | 616 | ||
| 633 | /* mt-exec, de_thread() is waiting for group leader */ | 617 | /* mt-exec, de_thread() is waiting for group leader */ |
| 634 | if (unlikely(tsk->signal->notify_count < 0)) | 618 | if (unlikely(tsk->signal->notify_count < 0)) |
| 635 | wake_up_process(tsk->signal->group_exit_task); | 619 | wake_up_process(tsk->signal->group_exit_task); |
| 636 | write_unlock_irq(&tasklist_lock); | 620 | write_unlock_irq(&tasklist_lock); |
| 637 | 621 | ||
| 638 | /* If the process is dead, release it - nobody will wait for it */ | 622 | list_for_each_entry_safe(p, n, &dead, ptrace_entry) { |
| 639 | if (autoreap) | 623 | list_del_init(&p->ptrace_entry); |
| 640 | release_task(tsk); | 624 | release_task(p); |
| 625 | } | ||
| 641 | } | 626 | } |
| 642 | 627 | ||
| 643 | #ifdef CONFIG_DEBUG_STACK_USAGE | 628 | #ifdef CONFIG_DEBUG_STACK_USAGE |
| @@ -982,8 +967,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, | |||
| 982 | */ | 967 | */ |
| 983 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | 968 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) |
| 984 | { | 969 | { |
| 985 | unsigned long state; | 970 | int state, retval, status; |
| 986 | int retval, status, traced; | ||
| 987 | pid_t pid = task_pid_vnr(p); | 971 | pid_t pid = task_pid_vnr(p); |
| 988 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); | 972 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
| 989 | struct siginfo __user *infop; | 973 | struct siginfo __user *infop; |
| @@ -997,6 +981,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 997 | 981 | ||
| 998 | get_task_struct(p); | 982 | get_task_struct(p); |
| 999 | read_unlock(&tasklist_lock); | 983 | read_unlock(&tasklist_lock); |
| 984 | sched_annotate_sleep(); | ||
| 985 | |||
| 1000 | if ((exit_code & 0x7f) == 0) { | 986 | if ((exit_code & 0x7f) == 0) { |
| 1001 | why = CLD_EXITED; | 987 | why = CLD_EXITED; |
| 1002 | status = exit_code >> 8; | 988 | status = exit_code >> 8; |
| @@ -1006,21 +992,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1006 | } | 992 | } |
| 1007 | return wait_noreap_copyout(wo, p, pid, uid, why, status); | 993 | return wait_noreap_copyout(wo, p, pid, uid, why, status); |
| 1008 | } | 994 | } |
| 1009 | |||
| 1010 | traced = ptrace_reparented(p); | ||
| 1011 | /* | 995 | /* |
| 1012 | * Move the task's state to DEAD/TRACE, only one thread can do this. | 996 | * Move the task's state to DEAD/TRACE, only one thread can do this. |
| 1013 | */ | 997 | */ |
| 1014 | state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; | 998 | state = (ptrace_reparented(p) && thread_group_leader(p)) ? |
| 999 | EXIT_TRACE : EXIT_DEAD; | ||
| 1015 | if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) | 1000 | if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) |
| 1016 | return 0; | 1001 | return 0; |
| 1017 | /* | 1002 | /* |
| 1018 | * It can be ptraced but not reparented, check | 1003 | * We own this thread, nobody else can reap it. |
| 1019 | * thread_group_leader() to filter out sub-threads. | 1004 | */ |
| 1005 | read_unlock(&tasklist_lock); | ||
| 1006 | sched_annotate_sleep(); | ||
| 1007 | |||
| 1008 | /* | ||
| 1009 | * Check thread_group_leader() to exclude the traced sub-threads. | ||
| 1020 | */ | 1010 | */ |
| 1021 | if (likely(!traced) && thread_group_leader(p)) { | 1011 | if (state == EXIT_DEAD && thread_group_leader(p)) { |
| 1022 | struct signal_struct *psig; | 1012 | struct signal_struct *sig = p->signal; |
| 1023 | struct signal_struct *sig; | 1013 | struct signal_struct *psig = current->signal; |
| 1024 | unsigned long maxrss; | 1014 | unsigned long maxrss; |
| 1025 | cputime_t tgutime, tgstime; | 1015 | cputime_t tgutime, tgstime; |
| 1026 | 1016 | ||
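
The cmpxchg() on ->exit_state above is what guarantees that only one waiter owns the zombie: whichever thread moves the state from EXIT_ZOMBIE to EXIT_DEAD/EXIT_TRACE wins, and everyone else sees the swap fail and backs off. A user-space C11 rendition of the same claim pattern (the enum values are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdio.h>

enum { EXIT_ZOMBIE, EXIT_DEAD, EXIT_TRACE };	/* illustrative values */

/* Shared state standing in for p->exit_state. */
static _Atomic int exit_state = EXIT_ZOMBIE;

/* Returns nonzero only for the single caller whose compare-and-swap
 * moves the state away from EXIT_ZOMBIE; later callers find the state
 * already claimed and give up, just like wait_task_zombie(). */
static int try_claim(int new_state)
{
	int expected = EXIT_ZOMBIE;

	return atomic_compare_exchange_strong(&exit_state, &expected,
					      new_state);
}

int main(void)
{
	printf("first claim:  %d\n", try_claim(EXIT_DEAD));	/* 1: we own it */
	printf("second claim: %d\n", try_claim(EXIT_TRACE));	/* 0: too late */
	return 0;
}
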
| @@ -1032,21 +1022,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1032 | * accumulate in the parent's signal_struct c* fields. | 1022 | * accumulate in the parent's signal_struct c* fields. |
| 1033 | * | 1023 | * |
| 1034 | * We don't bother to take a lock here to protect these | 1024 | * We don't bother to take a lock here to protect these |
| 1035 | * p->signal fields, because they are only touched by | 1025 | * p->signal fields because the whole thread group is dead |
| 1036 | * __exit_signal, which runs with tasklist_lock | 1026 | * and nobody can change them. |
| 1037 | * write-locked anyway, and so is excluded here. We do | 1027 | * |
| 1038 | * need to protect the access to parent->signal fields, | 1028 | * psig->stats_lock also protects us from our sub-theads |
| 1039 | * as other threads in the parent group can be right | 1029 | * which can reap other children at the same time. Until |
| 1040 | * here reaping other children at the same time. | 1030 | * we change k_getrusage()-like users to rely on this lock |
| 1031 | * we have to take ->siglock as well. | ||
| 1041 | * | 1032 | * |
| 1042 | * We use thread_group_cputime_adjusted() to get times for | 1033 | * We use thread_group_cputime_adjusted() to get times for |
| 1043 | * the thread group, which consolidates times for all threads | 1034 | * the thread group, which consolidates times for all threads |
| 1044 | * in the group including the group leader. | 1035 | * in the group including the group leader. |
| 1045 | */ | 1036 | */ |
| 1046 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); | 1037 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
| 1047 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1038 | spin_lock_irq(¤t->sighand->siglock); |
| 1048 | psig = p->real_parent->signal; | ||
| 1049 | sig = p->signal; | ||
| 1050 | write_seqlock(&psig->stats_lock); | 1039 | write_seqlock(&psig->stats_lock); |
| 1051 | psig->cutime += tgutime + sig->cutime; | 1040 | psig->cutime += tgutime + sig->cutime; |
| 1052 | psig->cstime += tgstime + sig->cstime; | 1041 | psig->cstime += tgstime + sig->cstime; |
| @@ -1071,15 +1060,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1071 | task_io_accounting_add(&psig->ioac, &p->ioac); | 1060 | task_io_accounting_add(&psig->ioac, &p->ioac); |
| 1072 | task_io_accounting_add(&psig->ioac, &sig->ioac); | 1061 | task_io_accounting_add(&psig->ioac, &sig->ioac); |
| 1073 | write_sequnlock(&psig->stats_lock); | 1062 | write_sequnlock(&psig->stats_lock); |
| 1074 | spin_unlock_irq(&p->real_parent->sighand->siglock); | 1063 | spin_unlock_irq(¤t->sighand->siglock); |
| 1075 | } | 1064 | } |
| 1076 | 1065 | ||
| 1077 | /* | ||
| 1078 | * Now we are sure this task is interesting, and no other | ||
| 1079 | * thread can reap it because we its state == DEAD/TRACE. | ||
| 1080 | */ | ||
| 1081 | read_unlock(&tasklist_lock); | ||
| 1082 | |||
| 1083 | retval = wo->wo_rusage | 1066 | retval = wo->wo_rusage |
| 1084 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1067 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
| 1085 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) | 1068 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) |
| @@ -1210,6 +1193,7 @@ unlock_sig: | |||
| 1210 | pid = task_pid_vnr(p); | 1193 | pid = task_pid_vnr(p); |
| 1211 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; | 1194 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; |
| 1212 | read_unlock(&tasklist_lock); | 1195 | read_unlock(&tasklist_lock); |
| 1196 | sched_annotate_sleep(); | ||
| 1213 | 1197 | ||
| 1214 | if (unlikely(wo->wo_flags & WNOWAIT)) | 1198 | if (unlikely(wo->wo_flags & WNOWAIT)) |
| 1215 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); | 1199 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); |
| @@ -1272,6 +1256,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
| 1272 | pid = task_pid_vnr(p); | 1256 | pid = task_pid_vnr(p); |
| 1273 | get_task_struct(p); | 1257 | get_task_struct(p); |
| 1274 | read_unlock(&tasklist_lock); | 1258 | read_unlock(&tasklist_lock); |
| 1259 | sched_annotate_sleep(); | ||
| 1275 | 1260 | ||
| 1276 | if (!wo->wo_info) { | 1261 | if (!wo->wo_info) { |
| 1277 | retval = wo->wo_rusage | 1262 | retval = wo->wo_rusage |
| @@ -1302,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
| 1302 | static int wait_consider_task(struct wait_opts *wo, int ptrace, | 1287 | static int wait_consider_task(struct wait_opts *wo, int ptrace, |
| 1303 | struct task_struct *p) | 1288 | struct task_struct *p) |
| 1304 | { | 1289 | { |
| 1290 | /* | ||
| 1291 | * We can race with wait_task_zombie() from another thread. | ||
| 1292 | * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition | ||
| 1293 | * can't confuse the checks below. | ||
| 1294 | */ | ||
| 1295 | int exit_state = ACCESS_ONCE(p->exit_state); | ||
| 1305 | int ret; | 1296 | int ret; |
| 1306 | 1297 | ||
| 1307 | if (unlikely(p->exit_state == EXIT_DEAD)) | 1298 | if (unlikely(exit_state == EXIT_DEAD)) |
| 1308 | return 0; | 1299 | return 0; |
| 1309 | 1300 | ||
| 1310 | ret = eligible_child(wo, p); | 1301 | ret = eligible_child(wo, p); |
| @@ -1325,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
| 1325 | return 0; | 1316 | return 0; |
| 1326 | } | 1317 | } |
| 1327 | 1318 | ||
| 1328 | if (unlikely(p->exit_state == EXIT_TRACE)) { | 1319 | if (unlikely(exit_state == EXIT_TRACE)) { |
| 1329 | /* | 1320 | /* |
| 1330 | * ptrace == 0 means we are the natural parent. In this case | 1321 | * ptrace == 0 means we are the natural parent. In this case |
| 1331 | * we should clear notask_error, debugger will notify us. | 1322 | * we should clear notask_error, debugger will notify us. |
| @@ -1352,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
| 1352 | } | 1343 | } |
| 1353 | 1344 | ||
| 1354 | /* slay zombie? */ | 1345 | /* slay zombie? */ |
| 1355 | if (p->exit_state == EXIT_ZOMBIE) { | 1346 | if (exit_state == EXIT_ZOMBIE) { |
| 1356 | /* we don't reap group leaders with subthreads */ | 1347 | /* we don't reap group leaders with subthreads */ |
| 1357 | if (!delay_group_leader(p)) { | 1348 | if (!delay_group_leader(p)) { |
| 1358 | /* | 1349 | /* |
diff --git a/kernel/extable.c b/kernel/extable.c index d8a6446adbcb..c98f926277a8 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/ftrace.h> | 18 | #include <linux/ftrace.h> |
| 19 | #include <linux/memory.h> | 19 | #include <linux/memory.h> |
| 20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
| 21 | #include <linux/ftrace.h> | ||
| 21 | #include <linux/mutex.h> | 22 | #include <linux/mutex.h> |
| 22 | #include <linux/init.h> | 23 | #include <linux/init.h> |
| 23 | 24 | ||
| @@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr) | |||
| 102 | return 1; | 103 | return 1; |
| 103 | if (is_module_text_address(addr)) | 104 | if (is_module_text_address(addr)) |
| 104 | return 1; | 105 | return 1; |
| 106 | if (is_ftrace_trampoline(addr)) | ||
| 107 | return 1; | ||
| 105 | /* | 108 | /* |
| 106 | * There might be init symbols in saved stacktraces. | 109 | * There might be init symbols in saved stacktraces. |
| 107 | * Give those symbols a chance to be printed in | 110 | * Give those symbols a chance to be printed in |
| @@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr) | |||
| 119 | { | 122 | { |
| 120 | if (core_kernel_text(addr)) | 123 | if (core_kernel_text(addr)) |
| 121 | return 1; | 124 | return 1; |
| 122 | return is_module_text_address(addr); | 125 | if (is_module_text_address(addr)) |
| 126 | return 1; | ||
| 127 | return is_ftrace_trampoline(addr); | ||
| 123 | } | 128 | } |
| 124 | 129 | ||
| 125 | /* | 130 | /* |
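With is_ftrace_trampoline() added to both checks, addresses inside dynamically allocated ftrace trampolines are now accepted as kernel text. A rough sketch of the kind of caller this helps (the helper below is made up; only kernel_text_address() comes from this file):

static void print_if_kernel_text(unsigned long addr)
{
	/* trampoline addresses no longer get filtered out of backtraces */
	if (kernel_text_address(addr))
		pr_info("  [<%p>] %pS\n", (void *)addr, (void *)addr);
}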
diff --git a/kernel/fork.c b/kernel/fork.c index 9b7d746d6d62..4dc2ddade9f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 433 | get_file(file); | 433 | get_file(file); |
| 434 | if (tmp->vm_flags & VM_DENYWRITE) | 434 | if (tmp->vm_flags & VM_DENYWRITE) |
| 435 | atomic_dec(&inode->i_writecount); | 435 | atomic_dec(&inode->i_writecount); |
| 436 | mutex_lock(&mapping->i_mmap_mutex); | 436 | i_mmap_lock_write(mapping); |
| 437 | if (tmp->vm_flags & VM_SHARED) | 437 | if (tmp->vm_flags & VM_SHARED) |
| 438 | atomic_inc(&mapping->i_mmap_writable); | 438 | atomic_inc(&mapping->i_mmap_writable); |
| 439 | flush_dcache_mmap_lock(mapping); | 439 | flush_dcache_mmap_lock(mapping); |
| @@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 445 | vma_interval_tree_insert_after(tmp, mpnt, | 445 | vma_interval_tree_insert_after(tmp, mpnt, |
| 446 | &mapping->i_mmap); | 446 | &mapping->i_mmap); |
| 447 | flush_dcache_mmap_unlock(mapping); | 447 | flush_dcache_mmap_unlock(mapping); |
| 448 | mutex_unlock(&mapping->i_mmap_mutex); | 448 | i_mmap_unlock_write(mapping); |
| 449 | } | 449 | } |
| 450 | 450 | ||
| 451 | /* | 451 | /* |
| @@ -1022,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) | |||
| 1022 | { | 1022 | { |
| 1023 | if (atomic_dec_and_test(&sighand->count)) { | 1023 | if (atomic_dec_and_test(&sighand->count)) { |
| 1024 | signalfd_cleanup(sighand); | 1024 | signalfd_cleanup(sighand); |
| 1025 | /* | ||
| 1026 | * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it | ||
| 1027 | * without an RCU grace period, see __lock_task_sighand(). | ||
| 1028 | */ | ||
| 1025 | kmem_cache_free(sighand_cachep, sighand); | 1029 | kmem_cache_free(sighand_cachep, sighand); |
| 1026 | } | 1030 | } |
| 1027 | } | 1031 | } |
| 1028 | 1032 | ||
| 1029 | |||
| 1030 | /* | 1033 | /* |
| 1031 | * Initialize POSIX timer handling for a thread group. | 1034 | * Initialize POSIX timer handling for a thread group. |
| 1032 | */ | 1035 | */ |
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3b7408759bdf..c92e44855ddd 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
| @@ -32,10 +32,13 @@ config GCOV_KERNEL | |||
| 32 | Note that the debugfs filesystem has to be mounted to access | 32 | Note that the debugfs filesystem has to be mounted to access |
| 33 | profiling data. | 33 | profiling data. |
| 34 | 34 | ||
| 35 | config ARCH_HAS_GCOV_PROFILE_ALL | ||
| 36 | def_bool n | ||
| 37 | |||
| 35 | config GCOV_PROFILE_ALL | 38 | config GCOV_PROFILE_ALL |
| 36 | bool "Profile entire Kernel" | 39 | bool "Profile entire Kernel" |
| 37 | depends on GCOV_KERNEL | 40 | depends on GCOV_KERNEL |
| 38 | depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 | 41 | depends on ARCH_HAS_GCOV_PROFILE_ALL |
| 39 | default n | 42 | default n |
| 40 | ---help--- | 43 | ---help--- |
| 41 | This option activates profiling for the entire kernel. | 44 | This option activates profiling for the entire kernel. |
diff --git a/kernel/groups.c b/kernel/groups.c index 451698f86cfa..664411f171b5 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/slab.h> | 6 | #include <linux/slab.h> |
| 7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
| 8 | #include <linux/syscalls.h> | 8 | #include <linux/syscalls.h> |
| 9 | #include <linux/user_namespace.h> | ||
| 9 | #include <asm/uaccess.h> | 10 | #include <asm/uaccess.h> |
| 10 | 11 | ||
| 11 | /* init to 2 - one for init_task, one to ensure it is never freed */ | 12 | /* init to 2 - one for init_task, one to ensure it is never freed */ |
| @@ -213,6 +214,14 @@ out: | |||
| 213 | return i; | 214 | return i; |
| 214 | } | 215 | } |
| 215 | 216 | ||
| 217 | bool may_setgroups(void) | ||
| 218 | { | ||
| 219 | struct user_namespace *user_ns = current_user_ns(); | ||
| 220 | |||
| 221 | return ns_capable(user_ns, CAP_SETGID) && | ||
| 222 | userns_may_setgroups(user_ns); | ||
| 223 | } | ||
| 224 | |||
| 216 | /* | 225 | /* |
| 217 | * SMP: Our groups are copy-on-write. We can set them safely | 226 | * SMP: Our groups are copy-on-write. We can set them safely |
| 218 | * without another task interfering. | 227 | * without another task interfering. |
| @@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) | |||
| 223 | struct group_info *group_info; | 232 | struct group_info *group_info; |
| 224 | int retval; | 233 | int retval; |
| 225 | 234 | ||
| 226 | if (!ns_capable(current_user_ns(), CAP_SETGID)) | 235 | if (!may_setgroups()) |
| 227 | return -EPERM; | 236 | return -EPERM; |
| 228 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 237 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
| 229 | return -EINVAL; | 238 | return -EINVAL; |
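may_setgroups() couples the capability check with a per-user-namespace policy. userns_may_setgroups() is defined elsewhere in this series; the sketch below is an assumption about the policy it enforces (the field names are assumptions too), not code from this diff:

static bool sketch_userns_may_setgroups(const struct user_namespace *ns)
{
	/*
	 * Assumed policy: setgroups must not have been disabled via
	 * /proc/<pid>/setgroups, and a gid mapping must already exist,
	 * so groups cannot be dropped to bypass rules keyed on them.
	 */
	return (ns->flags & USERNS_SETGROUPS_ALLOWED) &&
	       ns->gid_map.nr_extents != 0;
}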
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 225086b2652e..9a76e3beda54 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
| @@ -55,6 +55,21 @@ config GENERIC_IRQ_CHIP | |||
| 55 | config IRQ_DOMAIN | 55 | config IRQ_DOMAIN |
| 56 | bool | 56 | bool |
| 57 | 57 | ||
| 58 | # Support for hierarchical irq domains | ||
| 59 | config IRQ_DOMAIN_HIERARCHY | ||
| 60 | bool | ||
| 61 | select IRQ_DOMAIN | ||
| 62 | |||
| 63 | # Generic MSI interrupt support | ||
| 64 | config GENERIC_MSI_IRQ | ||
| 65 | bool | ||
| 66 | |||
| 67 | # Generic MSI hierarchical interrupt domain support | ||
| 68 | config GENERIC_MSI_IRQ_DOMAIN | ||
| 69 | bool | ||
| 70 | select IRQ_DOMAIN_HIERARCHY | ||
| 71 | select GENERIC_MSI_IRQ | ||
| 72 | |||
| 58 | config HANDLE_DOMAIN_IRQ | 73 | config HANDLE_DOMAIN_IRQ |
| 59 | bool | 74 | bool |
| 60 | 75 | ||
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index fff17381f0af..d12123526e2b 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
| @@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o | |||
| 6 | obj-$(CONFIG_PROC_FS) += proc.o | 6 | obj-$(CONFIG_PROC_FS) += proc.o |
| 7 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 7 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
| 8 | obj-$(CONFIG_PM_SLEEP) += pm.o | 8 | obj-$(CONFIG_PM_SLEEP) += pm.o |
| 9 | obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o | ||
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e5202f00cabc..6f1c7a566b95 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
| 16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
| 17 | #include <linux/kernel_stat.h> | 17 | #include <linux/kernel_stat.h> |
| 18 | #include <linux/irqdomain.h> | ||
| 18 | 19 | ||
| 19 | #include <trace/events/irq.h> | 20 | #include <trace/events/irq.h> |
| 20 | 21 | ||
| @@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend) | |||
| 178 | irq_state_clr_disabled(desc); | 179 | irq_state_clr_disabled(desc); |
| 179 | desc->depth = 0; | 180 | desc->depth = 0; |
| 180 | 181 | ||
| 182 | irq_domain_activate_irq(&desc->irq_data); | ||
| 181 | if (desc->irq_data.chip->irq_startup) { | 183 | if (desc->irq_data.chip->irq_startup) { |
| 182 | ret = desc->irq_data.chip->irq_startup(&desc->irq_data); | 184 | ret = desc->irq_data.chip->irq_startup(&desc->irq_data); |
| 183 | irq_state_clr_masked(desc); | 185 | irq_state_clr_masked(desc); |
| @@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc) | |||
| 199 | desc->irq_data.chip->irq_disable(&desc->irq_data); | 201 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
| 200 | else | 202 | else |
| 201 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 203 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
| 204 | irq_domain_deactivate_irq(&desc->irq_data); | ||
| 202 | irq_state_set_masked(desc); | 205 | irq_state_set_masked(desc); |
| 203 | } | 206 | } |
| 204 | 207 | ||
| @@ -728,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
| 728 | if (!handle) { | 731 | if (!handle) { |
| 729 | handle = handle_bad_irq; | 732 | handle = handle_bad_irq; |
| 730 | } else { | 733 | } else { |
| 731 | if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) | 734 | struct irq_data *irq_data = &desc->irq_data; |
| 735 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 736 | /* | ||
| 737 | * With hierarchical domains we might run into a | ||
| 738 | * situation where the outermost chip is not yet set | ||
| 739 | * up, but the inner chips are there. Instead of | ||
| 740 | * bailing we install the handler, but obviously we | ||
| 741 | * cannot enable/startup the interrupt at this point. | ||
| 742 | */ | ||
| 743 | while (irq_data) { | ||
| 744 | if (irq_data->chip != &no_irq_chip) | ||
| 745 | break; | ||
| 746 | /* | ||
| 747 | * Bail out if the outer chip is not set up | ||
| 748 | * and the interrupt is supposed to be started | ||
| 749 | * right away. | ||
| 750 | */ | ||
| 751 | if (WARN_ON(is_chained)) | ||
| 752 | goto out; | ||
| 753 | /* Try the parent */ | ||
| 754 | irq_data = irq_data->parent_data; | ||
| 755 | } | ||
| 756 | #endif | ||
| 757 | if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip)) | ||
| 732 | goto out; | 758 | goto out; |
| 733 | } | 759 | } |
| 734 | 760 | ||
| @@ -847,3 +873,105 @@ void irq_cpu_offline(void) | |||
| 847 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 873 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 848 | } | 874 | } |
| 849 | } | 875 | } |
| 876 | |||
| 877 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 878 | /** | ||
| 879 | * irq_chip_ack_parent - Acknowledge the parent interrupt | ||
| 880 | * @data: Pointer to interrupt specific data | ||
| 881 | */ | ||
| 882 | void irq_chip_ack_parent(struct irq_data *data) | ||
| 883 | { | ||
| 884 | data = data->parent_data; | ||
| 885 | data->chip->irq_ack(data); | ||
| 886 | } | ||
| 887 | |||
| 888 | /** | ||
| 889 | * irq_chip_mask_parent - Mask the parent interrupt | ||
| 890 | * @data: Pointer to interrupt specific data | ||
| 891 | */ | ||
| 892 | void irq_chip_mask_parent(struct irq_data *data) | ||
| 893 | { | ||
| 894 | data = data->parent_data; | ||
| 895 | data->chip->irq_mask(data); | ||
| 896 | } | ||
| 897 | |||
| 898 | /** | ||
| 899 | * irq_chip_unmask_parent - Unmask the parent interrupt | ||
| 900 | * @data: Pointer to interrupt specific data | ||
| 901 | */ | ||
| 902 | void irq_chip_unmask_parent(struct irq_data *data) | ||
| 903 | { | ||
| 904 | data = data->parent_data; | ||
| 905 | data->chip->irq_unmask(data); | ||
| 906 | } | ||
| 907 | |||
| 908 | /** | ||
| 909 | * irq_chip_eoi_parent - Invoke EOI on the parent interrupt | ||
| 910 | * @data: Pointer to interrupt specific data | ||
| 911 | */ | ||
| 912 | void irq_chip_eoi_parent(struct irq_data *data) | ||
| 913 | { | ||
| 914 | data = data->parent_data; | ||
| 915 | data->chip->irq_eoi(data); | ||
| 916 | } | ||
| 917 | |||
| 918 | /** | ||
| 919 | * irq_chip_set_affinity_parent - Set affinity on the parent interrupt | ||
| 920 | * @data: Pointer to interrupt specific data | ||
| 921 | * @dest: The affinity mask to set | ||
| 922 | * @force: Flag to enforce setting (disable online checks) | ||
| 923 | * | ||
| 924 | * Conditional, as the underlying parent chip might not implement it. | ||
| 925 | */ | ||
| 926 | int irq_chip_set_affinity_parent(struct irq_data *data, | ||
| 927 | const struct cpumask *dest, bool force) | ||
| 928 | { | ||
| 929 | data = data->parent_data; | ||
| 930 | if (data->chip->irq_set_affinity) | ||
| 931 | return data->chip->irq_set_affinity(data, dest, force); | ||
| 932 | |||
| 933 | return -ENOSYS; | ||
| 934 | } | ||
| 935 | |||
| 936 | /** | ||
| 937 | * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware | ||
| 938 | * @data: Pointer to interrupt specific data | ||
| 939 | * | ||
| 940 | * Iterate through the domain hierarchy of the interrupt and check | ||
| 941 | * whether a hw retrigger function exists. If yes, invoke it. | ||
| 942 | */ | ||
| 943 | int irq_chip_retrigger_hierarchy(struct irq_data *data) | ||
| 944 | { | ||
| 945 | for (data = data->parent_data; data; data = data->parent_data) | ||
| 946 | if (data->chip && data->chip->irq_retrigger) | ||
| 947 | return data->chip->irq_retrigger(data); | ||
| 948 | |||
| 949 | return -ENOSYS; | ||
| 950 | } | ||
| 951 | #endif | ||
| 952 | |||
| 953 | /** | ||
| 954 | * irq_chip_compose_msi_msg - Compose an MSI message for an irq chip | ||
| 955 | * @data: Pointer to interrupt specific data | ||
| 956 | * @msg: Pointer to the MSI message | ||
| 957 | * | ||
| 958 | * For hierarchical domains we find the first chip in the hierarchy | ||
| 959 | * which implements the irq_compose_msi_msg callback. For non | ||
| 960 | * hierarchical domains we use the top level chip. | ||
| 961 | */ | ||
| 962 | int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) | ||
| 963 | { | ||
| 964 | struct irq_data *pos = NULL; | ||
| 965 | |||
| 966 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 967 | for (; data; data = data->parent_data) | ||
| 968 | #endif | ||
| 969 | if (data->chip && data->chip->irq_compose_msi_msg) | ||
| 970 | pos = data; | ||
| 971 | if (!pos) | ||
| 972 | return -ENOSYS; | ||
| 973 | |||
| 974 | pos->chip->irq_compose_msi_msg(pos, msg); | ||
| 975 | |||
| 976 | return 0; | ||
| 977 | } | ||
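The *_parent helpers exist so that a stacked irq_chip in a hierarchical domain can forward the basic operations to its parent chip without boilerplate. A hypothetical chip wired up that way (the name is illustrative; only the helpers come from the hunk above):

static struct irq_chip sketch_stacked_chip = {
	.name			= "sketch-stacked",
	/* forward the basics to the parent chip in the hierarchy */
	.irq_ack		= irq_chip_ack_parent,
	.irq_mask		= irq_chip_mask_parent,
	.irq_unmask		= irq_chip_unmask_parent,
	.irq_eoi		= irq_chip_eoi_parent,
	.irq_set_affinity	= irq_chip_set_affinity_parent,
	.irq_retrigger		= irq_chip_retrigger_hierarchy,
};

Only operations the child chip really needs to intercept get their own implementation; everything else goes straight down the parent chain.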
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index cf80e7b0ddab..61024e8abdef 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
| @@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) | |||
| 39 | u32 mask = d->mask; | 39 | u32 mask = d->mask; |
| 40 | 40 | ||
| 41 | irq_gc_lock(gc); | 41 | irq_gc_lock(gc); |
| 42 | irq_reg_writel(mask, gc->reg_base + ct->regs.disable); | 42 | irq_reg_writel(gc, mask, ct->regs.disable); |
| 43 | *ct->mask_cache &= ~mask; | 43 | *ct->mask_cache &= ~mask; |
| 44 | irq_gc_unlock(gc); | 44 | irq_gc_unlock(gc); |
| 45 | } | 45 | } |
| @@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d) | |||
| 59 | 59 | ||
| 60 | irq_gc_lock(gc); | 60 | irq_gc_lock(gc); |
| 61 | *ct->mask_cache |= mask; | 61 | *ct->mask_cache |= mask; |
| 62 | irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); | 62 | irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); |
| 63 | irq_gc_unlock(gc); | 63 | irq_gc_unlock(gc); |
| 64 | } | 64 | } |
| 65 | EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); | 65 | EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); |
| @@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d) | |||
| 79 | 79 | ||
| 80 | irq_gc_lock(gc); | 80 | irq_gc_lock(gc); |
| 81 | *ct->mask_cache &= ~mask; | 81 | *ct->mask_cache &= ~mask; |
| 82 | irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); | 82 | irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); |
| 83 | irq_gc_unlock(gc); | 83 | irq_gc_unlock(gc); |
| 84 | } | 84 | } |
| 85 | EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); | 85 | EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); |
| @@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) | |||
| 98 | u32 mask = d->mask; | 98 | u32 mask = d->mask; |
| 99 | 99 | ||
| 100 | irq_gc_lock(gc); | 100 | irq_gc_lock(gc); |
| 101 | irq_reg_writel(mask, gc->reg_base + ct->regs.enable); | 101 | irq_reg_writel(gc, mask, ct->regs.enable); |
| 102 | *ct->mask_cache |= mask; | 102 | *ct->mask_cache |= mask; |
| 103 | irq_gc_unlock(gc); | 103 | irq_gc_unlock(gc); |
| 104 | } | 104 | } |
| @@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d) | |||
| 114 | u32 mask = d->mask; | 114 | u32 mask = d->mask; |
| 115 | 115 | ||
| 116 | irq_gc_lock(gc); | 116 | irq_gc_lock(gc); |
| 117 | irq_reg_writel(mask, gc->reg_base + ct->regs.ack); | 117 | irq_reg_writel(gc, mask, ct->regs.ack); |
| 118 | irq_gc_unlock(gc); | 118 | irq_gc_unlock(gc); |
| 119 | } | 119 | } |
| 120 | EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); | 120 | EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); |
| @@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d) | |||
| 130 | u32 mask = ~d->mask; | 130 | u32 mask = ~d->mask; |
| 131 | 131 | ||
| 132 | irq_gc_lock(gc); | 132 | irq_gc_lock(gc); |
| 133 | irq_reg_writel(mask, gc->reg_base + ct->regs.ack); | 133 | irq_reg_writel(gc, mask, ct->regs.ack); |
| 134 | irq_gc_unlock(gc); | 134 | irq_gc_unlock(gc); |
| 135 | } | 135 | } |
| 136 | 136 | ||
| @@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) | |||
| 145 | u32 mask = d->mask; | 145 | u32 mask = d->mask; |
| 146 | 146 | ||
| 147 | irq_gc_lock(gc); | 147 | irq_gc_lock(gc); |
| 148 | irq_reg_writel(mask, gc->reg_base + ct->regs.mask); | 148 | irq_reg_writel(gc, mask, ct->regs.mask); |
| 149 | irq_reg_writel(mask, gc->reg_base + ct->regs.ack); | 149 | irq_reg_writel(gc, mask, ct->regs.ack); |
| 150 | irq_gc_unlock(gc); | 150 | irq_gc_unlock(gc); |
| 151 | } | 151 | } |
| 152 | 152 | ||
| @@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d) | |||
| 161 | u32 mask = d->mask; | 161 | u32 mask = d->mask; |
| 162 | 162 | ||
| 163 | irq_gc_lock(gc); | 163 | irq_gc_lock(gc); |
| 164 | irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); | 164 | irq_reg_writel(gc, mask, ct->regs.eoi); |
| 165 | irq_gc_unlock(gc); | 165 | irq_gc_unlock(gc); |
| 166 | } | 166 | } |
| 167 | 167 | ||
| @@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) | |||
| 191 | return 0; | 191 | return 0; |
| 192 | } | 192 | } |
| 193 | 193 | ||
| 194 | static u32 irq_readl_be(void __iomem *addr) | ||
| 195 | { | ||
| 196 | return ioread32be(addr); | ||
| 197 | } | ||
| 198 | |||
| 199 | static void irq_writel_be(u32 val, void __iomem *addr) | ||
| 200 | { | ||
| 201 | iowrite32be(val, addr); | ||
| 202 | } | ||
| 203 | |||
| 194 | static void | 204 | static void |
| 195 | irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, | 205 | irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, |
| 196 | int num_ct, unsigned int irq_base, | 206 | int num_ct, unsigned int irq_base, |
| @@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) | |||
| 245 | } | 255 | } |
| 246 | ct[i].mask_cache = mskptr; | 256 | ct[i].mask_cache = mskptr; |
| 247 | if (flags & IRQ_GC_INIT_MASK_CACHE) | 257 | if (flags & IRQ_GC_INIT_MASK_CACHE) |
| 248 | *mskptr = irq_reg_readl(gc->reg_base + mskreg); | 258 | *mskptr = irq_reg_readl(gc, mskreg); |
| 249 | } | 259 | } |
| 250 | } | 260 | } |
| 251 | 261 | ||
| @@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | |||
| 300 | dgc->gc[i] = gc = tmp; | 310 | dgc->gc[i] = gc = tmp; |
| 301 | irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, | 311 | irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, |
| 302 | NULL, handler); | 312 | NULL, handler); |
| 313 | |||
| 303 | gc->domain = d; | 314 | gc->domain = d; |
| 315 | if (gcflags & IRQ_GC_BE_IO) { | ||
| 316 | gc->reg_readl = &irq_readl_be; | ||
| 317 | gc->reg_writel = &irq_writel_be; | ||
| 318 | } | ||
| 319 | |||
| 304 | raw_spin_lock_irqsave(&gc_lock, flags); | 320 | raw_spin_lock_irqsave(&gc_lock, flags); |
| 305 | list_add_tail(&gc->list, &gc_list); | 321 | list_add_tail(&gc->list, &gc_list); |
| 306 | raw_spin_unlock_irqrestore(&gc_lock, flags); | 322 | raw_spin_unlock_irqrestore(&gc_lock, flags); |
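irq_reg_readl()/irq_reg_writel() now take the generic chip plus a register offset instead of a raw address, which lets the new IRQ_GC_BE_IO flag swap in the big-endian accessors above. A sketch of the accessor shape this implies (the real helpers live elsewhere in kernel/irq; treat this as illustrative):

static inline void sketch_irq_reg_writel(struct irq_chip_generic *gc,
					 u32 val, int reg_offset)
{
	if (gc->reg_writel)		/* e.g. irq_writel_be() */
		gc->reg_writel(val, gc->reg_base + reg_offset);
	else
		writel(val, gc->reg_base + reg_offset);
}

A driver opts in by passing IRQ_GC_BE_IO in gcflags to irq_alloc_domain_generic_chips(), which installs the big-endian reg_readl/reg_writel callbacks as shown in the hunk above.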
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4332d766619d..df553b0af936 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc); | |||
| 78 | 78 | ||
| 79 | #ifdef CONFIG_SPARSE_IRQ | 79 | #ifdef CONFIG_SPARSE_IRQ |
| 80 | static inline void irq_mark_irq(unsigned int irq) { } | 80 | static inline void irq_mark_irq(unsigned int irq) { } |
| 81 | extern void irq_lock_sparse(void); | ||
| 82 | extern void irq_unlock_sparse(void); | ||
| 81 | #else | 83 | #else |
| 82 | extern void irq_mark_irq(unsigned int irq); | 84 | extern void irq_mark_irq(unsigned int irq); |
| 85 | static inline void irq_lock_sparse(void) { } | ||
| 86 | static inline void irq_unlock_sparse(void) { } | ||
| 83 | #endif | 87 | #endif |
| 84 | 88 | ||
| 85 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 89 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a1782f88f0af..99793b9b6d23 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -132,6 +132,16 @@ static void free_masks(struct irq_desc *desc) | |||
| 132 | static inline void free_masks(struct irq_desc *desc) { } | 132 | static inline void free_masks(struct irq_desc *desc) { } |
| 133 | #endif | 133 | #endif |
| 134 | 134 | ||
| 135 | void irq_lock_sparse(void) | ||
| 136 | { | ||
| 137 | mutex_lock(&sparse_irq_lock); | ||
| 138 | } | ||
| 139 | |||
| 140 | void irq_unlock_sparse(void) | ||
| 141 | { | ||
| 142 | mutex_unlock(&sparse_irq_lock); | ||
| 143 | } | ||
| 144 | |||
| 135 | static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) | 145 | static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) |
| 136 | { | 146 | { |
| 137 | struct irq_desc *desc; | 147 | struct irq_desc *desc; |
| @@ -168,6 +178,12 @@ static void free_desc(unsigned int irq) | |||
| 168 | 178 | ||
| 169 | unregister_irq_proc(irq, desc); | 179 | unregister_irq_proc(irq, desc); |
| 170 | 180 | ||
| 181 | /* | ||
| 182 | * sparse_irq_lock also protects show_interrupts() and | ||
| 183 | * kstat_irqs_usr(). Once we have deleted the descriptor from the | ||
| 184 | * sparse tree we can free it. Access in proc will fail to | ||
| 185 | * lookup the descriptor. | ||
| 186 | */ | ||
| 171 | mutex_lock(&sparse_irq_lock); | 187 | mutex_lock(&sparse_irq_lock); |
| 172 | delete_irq_desc(irq); | 188 | delete_irq_desc(irq); |
| 173 | mutex_unlock(&sparse_irq_lock); | 189 | mutex_unlock(&sparse_irq_lock); |
| @@ -574,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq) | |||
| 574 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); | 590 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); |
| 575 | } | 591 | } |
| 576 | 592 | ||
| 593 | /** | ||
| 594 | * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu | ||
| 595 | * @irq: The interrupt number | ||
| 596 | * @cpu: The cpu number | ||
| 597 | * | ||
| 598 | * Returns the sum of interrupt counts on @cpu since boot for | ||
| 599 | * @irq. The caller must ensure that the interrupt is not removed | ||
| 600 | * concurrently. | ||
| 601 | */ | ||
| 577 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | 602 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) |
| 578 | { | 603 | { |
| 579 | struct irq_desc *desc = irq_to_desc(irq); | 604 | struct irq_desc *desc = irq_to_desc(irq); |
| @@ -582,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | |||
| 582 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; | 607 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; |
| 583 | } | 608 | } |
| 584 | 609 | ||
| 610 | /** | ||
| 611 | * kstat_irqs - Get the statistics for an interrupt | ||
| 612 | * @irq: The interrupt number | ||
| 613 | * | ||
| 614 | * Returns the sum of interrupt counts on all cpus since boot for | ||
| 615 | * @irq. The caller must ensure that the interrupt is not removed | ||
| 616 | * concurrently. | ||
| 617 | */ | ||
| 585 | unsigned int kstat_irqs(unsigned int irq) | 618 | unsigned int kstat_irqs(unsigned int irq) |
| 586 | { | 619 | { |
| 587 | struct irq_desc *desc = irq_to_desc(irq); | 620 | struct irq_desc *desc = irq_to_desc(irq); |
| @@ -594,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq) | |||
| 594 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); | 627 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); |
| 595 | return sum; | 628 | return sum; |
| 596 | } | 629 | } |
| 630 | |||
| 631 | /** | ||
| 632 | * kstat_irqs_usr - Get the statistics for an interrupt | ||
| 633 | * @irq: The interrupt number | ||
| 634 | * | ||
| 635 | * Returns the sum of interrupt counts on all cpus since boot for | ||
| 636 | * @irq. Contrary to kstat_irqs() this can be called from any | ||
| 637 | * preemptible context. It's protected against concurrent removal of | ||
| 638 | * an interrupt descriptor when sparse irqs are enabled. | ||
| 639 | */ | ||
| 640 | unsigned int kstat_irqs_usr(unsigned int irq) | ||
| 641 | { | ||
| 642 | int sum; | ||
| 643 | |||
| 644 | irq_lock_sparse(); | ||
| 645 | sum = kstat_irqs(irq); | ||
| 646 | irq_unlock_sparse(); | ||
| 647 | return sum; | ||
| 648 | } | ||
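kstat_irqs_usr() is the variant intended for /proc-style readers that run in preemptible context and must not race with descriptor teardown. An illustrative caller (the show function below is made up; only kstat_irqs_usr() comes from the patch):

static int sketch_irq_count_show(struct seq_file *m, void *v)
{
	unsigned int irq = (unsigned int)(unsigned long)m->private;

	/* safe without further locking: kstat_irqs_usr() takes
	 * sparse_irq_lock around the per-cpu summation */
	seq_printf(m, "%u: %u\n", irq, kstat_irqs_usr(irq));
	return 0;
}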
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6534ff6ce02e..7fac311057b8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
| @@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex); | |||
| 23 | static DEFINE_MUTEX(revmap_trees_mutex); | 23 | static DEFINE_MUTEX(revmap_trees_mutex); |
| 24 | static struct irq_domain *irq_default_domain; | 24 | static struct irq_domain *irq_default_domain; |
| 25 | 25 | ||
| 26 | static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, | ||
| 27 | irq_hw_number_t hwirq, int node); | ||
| 28 | static void irq_domain_check_hierarchy(struct irq_domain *domain); | ||
| 29 | |||
| 26 | /** | 30 | /** |
| 27 | * __irq_domain_add() - Allocate a new irq_domain data structure | 31 | * __irq_domain_add() - Allocate a new irq_domain data structure |
| 28 | * @of_node: optional device-tree node of the interrupt controller | 32 | * @of_node: optional device-tree node of the interrupt controller |
| @@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain; | |||
| 30 | * @hwirq_max: Maximum number of interrupts supported by controller | 34 | * @hwirq_max: Maximum number of interrupts supported by controller |
| 31 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no | 35 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no |
| 32 | * direct mapping | 36 | * direct mapping |
| 33 | * @ops: map/unmap domain callbacks | 37 | * @ops: domain callbacks |
| 34 | * @host_data: Controller private data pointer | 38 | * @host_data: Controller private data pointer |
| 35 | * | 39 | * |
| 36 | * Allocates and initializes an irq_domain structure. | 40 | * Allocates and initializes an irq_domain structure. |
| @@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, | |||
| 56 | domain->hwirq_max = hwirq_max; | 60 | domain->hwirq_max = hwirq_max; |
| 57 | domain->revmap_size = size; | 61 | domain->revmap_size = size; |
| 58 | domain->revmap_direct_max_irq = direct_max; | 62 | domain->revmap_direct_max_irq = direct_max; |
| 63 | irq_domain_check_hierarchy(domain); | ||
| 59 | 64 | ||
| 60 | mutex_lock(&irq_domain_mutex); | 65 | mutex_lock(&irq_domain_mutex); |
| 61 | list_add(&domain->link, &irq_domain_list); | 66 | list_add(&domain->link, &irq_domain_list); |
| @@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove); | |||
| 109 | * @first_irq: first number of irq block assigned to the domain, | 114 | * @first_irq: first number of irq block assigned to the domain, |
| 110 | * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then | 115 | * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then |
| 111 | * pre-map all of the irqs in the domain to virqs starting at first_irq. | 116 | * pre-map all of the irqs in the domain to virqs starting at first_irq. |
| 112 | * @ops: map/unmap domain callbacks | 117 | * @ops: domain callbacks |
| 113 | * @host_data: Controller private data pointer | 118 | * @host_data: Controller private data pointer |
| 114 | * | 119 | * |
| 115 | * Allocates an irq_domain, and optionally if first_irq is positive then also | 120 | * Allocates an irq_domain, and optionally if first_irq is positive then also |
| @@ -174,10 +179,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
| 174 | 179 | ||
| 175 | domain = __irq_domain_add(of_node, first_hwirq + size, | 180 | domain = __irq_domain_add(of_node, first_hwirq + size, |
| 176 | first_hwirq + size, 0, ops, host_data); | 181 | first_hwirq + size, 0, ops, host_data); |
| 177 | if (!domain) | 182 | if (domain) |
| 178 | return NULL; | 183 | irq_domain_associate_many(domain, first_irq, first_hwirq, size); |
| 179 | |||
| 180 | irq_domain_associate_many(domain, first_irq, first_hwirq, size); | ||
| 181 | 184 | ||
| 182 | return domain; | 185 | return domain; |
| 183 | } | 186 | } |
| @@ -388,7 +391,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); | |||
| 388 | unsigned int irq_create_mapping(struct irq_domain *domain, | 391 | unsigned int irq_create_mapping(struct irq_domain *domain, |
| 389 | irq_hw_number_t hwirq) | 392 | irq_hw_number_t hwirq) |
| 390 | { | 393 | { |
| 391 | unsigned int hint; | ||
| 392 | int virq; | 394 | int virq; |
| 393 | 395 | ||
| 394 | pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); | 396 | pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); |
| @@ -410,12 +412,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
| 410 | } | 412 | } |
| 411 | 413 | ||
| 412 | /* Allocate a virtual interrupt number */ | 414 | /* Allocate a virtual interrupt number */ |
| 413 | hint = hwirq % nr_irqs; | 415 | virq = irq_domain_alloc_descs(-1, 1, hwirq, |
| 414 | if (hint == 0) | 416 | of_node_to_nid(domain->of_node)); |
| 415 | hint++; | ||
| 416 | virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node)); | ||
| 417 | if (virq <= 0) | ||
| 418 | virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); | ||
| 419 | if (virq <= 0) { | 417 | if (virq <= 0) { |
| 420 | pr_debug("-> virq allocation failed\n"); | 418 | pr_debug("-> virq allocation failed\n"); |
| 421 | return 0; | 419 | return 0; |
| @@ -471,7 +469,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) | |||
| 471 | struct irq_domain *domain; | 469 | struct irq_domain *domain; |
| 472 | irq_hw_number_t hwirq; | 470 | irq_hw_number_t hwirq; |
| 473 | unsigned int type = IRQ_TYPE_NONE; | 471 | unsigned int type = IRQ_TYPE_NONE; |
| 474 | unsigned int virq; | 472 | int virq; |
| 475 | 473 | ||
| 476 | domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; | 474 | domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; |
| 477 | if (!domain) { | 475 | if (!domain) { |
| @@ -489,10 +487,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) | |||
| 489 | return 0; | 487 | return 0; |
| 490 | } | 488 | } |
| 491 | 489 | ||
| 492 | /* Create mapping */ | 490 | if (irq_domain_is_hierarchy(domain)) { |
| 493 | virq = irq_create_mapping(domain, hwirq); | 491 | /* |
| 494 | if (!virq) | 492 | * If we've already configured this interrupt, |
| 495 | return virq; | 493 | * don't do it again, or hell will break loose. |
| 494 | */ | ||
| 495 | virq = irq_find_mapping(domain, hwirq); | ||
| 496 | if (virq) | ||
| 497 | return virq; | ||
| 498 | |||
| 499 | virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); | ||
| 500 | if (virq <= 0) | ||
| 501 | return 0; | ||
| 502 | } else { | ||
| 503 | /* Create mapping */ | ||
| 504 | virq = irq_create_mapping(domain, hwirq); | ||
| 505 | if (!virq) | ||
| 506 | return virq; | ||
| 507 | } | ||
| 496 | 508 | ||
| 497 | /* Set type if specified and different than the current one */ | 509 | /* Set type if specified and different than the current one */ |
| 498 | if (type != IRQ_TYPE_NONE && | 510 | if (type != IRQ_TYPE_NONE && |
| @@ -540,8 +552,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain, | |||
| 540 | return 0; | 552 | return 0; |
| 541 | 553 | ||
| 542 | if (hwirq < domain->revmap_direct_max_irq) { | 554 | if (hwirq < domain->revmap_direct_max_irq) { |
| 543 | data = irq_get_irq_data(hwirq); | 555 | data = irq_domain_get_irq_data(domain, hwirq); |
| 544 | if (data && (data->domain == domain) && (data->hwirq == hwirq)) | 556 | if (data && data->hwirq == hwirq) |
| 545 | return hwirq; | 557 | return hwirq; |
| 546 | } | 558 | } |
| 547 | 559 | ||
| @@ -709,3 +721,518 @@ const struct irq_domain_ops irq_domain_simple_ops = { | |||
| 709 | .xlate = irq_domain_xlate_onetwocell, | 721 | .xlate = irq_domain_xlate_onetwocell, |
| 710 | }; | 722 | }; |
| 711 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | 723 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); |
| 724 | |||
| 725 | static int irq_domain_alloc_descs(int virq, unsigned int cnt, | ||
| 726 | irq_hw_number_t hwirq, int node) | ||
| 727 | { | ||
| 728 | unsigned int hint; | ||
| 729 | |||
| 730 | if (virq >= 0) { | ||
| 731 | virq = irq_alloc_descs(virq, virq, cnt, node); | ||
| 732 | } else { | ||
| 733 | hint = hwirq % nr_irqs; | ||
| 734 | if (hint == 0) | ||
| 735 | hint++; | ||
| 736 | virq = irq_alloc_descs_from(hint, cnt, node); | ||
| 737 | if (virq <= 0 && hint > 1) | ||
| 738 | virq = irq_alloc_descs_from(1, cnt, node); | ||
| 739 | } | ||
| 740 | |||
| 741 | return virq; | ||
| 742 | } | ||
| 743 | |||
| 744 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 745 | /** | ||
| 746 | * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy | ||
| 747 | * @parent: Parent irq domain to associate with the new domain | ||
| 748 | * @flags: Irq domain flags associated to the domain | ||
| 749 | * @size: Size of the domain. See below | ||
| 750 | * @node: Optional device-tree node of the interrupt controller | ||
| 751 | * @ops: Pointer to the interrupt domain callbacks | ||
| 752 | * @host_data: Controller private data pointer | ||
| 753 | * | ||
| 754 | * If @size is 0 a tree domain is created, otherwise a linear domain. | ||
| 755 | * | ||
| 756 | * If successful the parent is associated to the new domain and the | ||
| 757 | * domain flags are set. | ||
| 758 | * Returns pointer to IRQ domain, or NULL on failure. | ||
| 759 | */ | ||
| 760 | struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, | ||
| 761 | unsigned int flags, | ||
| 762 | unsigned int size, | ||
| 763 | struct device_node *node, | ||
| 764 | const struct irq_domain_ops *ops, | ||
| 765 | void *host_data) | ||
| 766 | { | ||
| 767 | struct irq_domain *domain; | ||
| 768 | |||
| 769 | if (size) | ||
| 770 | domain = irq_domain_add_linear(node, size, ops, host_data); | ||
| 771 | else | ||
| 772 | domain = irq_domain_add_tree(node, ops, host_data); | ||
| 773 | if (domain) { | ||
| 774 | domain->parent = parent; | ||
| 775 | domain->flags |= flags; | ||
| 776 | } | ||
| 777 | |||
| 778 | return domain; | ||
| 779 | } | ||
| 780 | |||
| 781 | static void irq_domain_insert_irq(int virq) | ||
| 782 | { | ||
| 783 | struct irq_data *data; | ||
| 784 | |||
| 785 | for (data = irq_get_irq_data(virq); data; data = data->parent_data) { | ||
| 786 | struct irq_domain *domain = data->domain; | ||
| 787 | irq_hw_number_t hwirq = data->hwirq; | ||
| 788 | |||
| 789 | if (hwirq < domain->revmap_size) { | ||
| 790 | domain->linear_revmap[hwirq] = virq; | ||
| 791 | } else { | ||
| 792 | mutex_lock(&revmap_trees_mutex); | ||
| 793 | radix_tree_insert(&domain->revmap_tree, hwirq, data); | ||
| 794 | mutex_unlock(&revmap_trees_mutex); | ||
| 795 | } | ||
| 796 | |||
| 797 | /* If not already assigned, give the domain the chip's name */ | ||
| 798 | if (!domain->name && data->chip) | ||
| 799 | domain->name = data->chip->name; | ||
| 800 | } | ||
| 801 | |||
| 802 | irq_clear_status_flags(virq, IRQ_NOREQUEST); | ||
| 803 | } | ||
| 804 | |||
| 805 | static void irq_domain_remove_irq(int virq) | ||
| 806 | { | ||
| 807 | struct irq_data *data; | ||
| 808 | |||
| 809 | irq_set_status_flags(virq, IRQ_NOREQUEST); | ||
| 810 | irq_set_chip_and_handler(virq, NULL, NULL); | ||
| 811 | synchronize_irq(virq); | ||
| 812 | smp_mb(); | ||
| 813 | |||
| 814 | for (data = irq_get_irq_data(virq); data; data = data->parent_data) { | ||
| 815 | struct irq_domain *domain = data->domain; | ||
| 816 | irq_hw_number_t hwirq = data->hwirq; | ||
| 817 | |||
| 818 | if (hwirq < domain->revmap_size) { | ||
| 819 | domain->linear_revmap[hwirq] = 0; | ||
| 820 | } else { | ||
| 821 | mutex_lock(&revmap_trees_mutex); | ||
| 822 | radix_tree_delete(&domain->revmap_tree, hwirq); | ||
| 823 | mutex_unlock(&revmap_trees_mutex); | ||
| 824 | } | ||
| 825 | } | ||
| 826 | } | ||
| 827 | |||
| 828 | static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, | ||
| 829 | struct irq_data *child) | ||
| 830 | { | ||
| 831 | struct irq_data *irq_data; | ||
| 832 | |||
| 833 | irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node); | ||
| 834 | if (irq_data) { | ||
| 835 | child->parent_data = irq_data; | ||
| 836 | irq_data->irq = child->irq; | ||
| 837 | irq_data->node = child->node; | ||
| 838 | irq_data->domain = domain; | ||
| 839 | } | ||
| 840 | |||
| 841 | return irq_data; | ||
| 842 | } | ||
| 843 | |||
| 844 | static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) | ||
| 845 | { | ||
| 846 | struct irq_data *irq_data, *tmp; | ||
| 847 | int i; | ||
| 848 | |||
| 849 | for (i = 0; i < nr_irqs; i++) { | ||
| 850 | irq_data = irq_get_irq_data(virq + i); | ||
| 851 | tmp = irq_data->parent_data; | ||
| 852 | irq_data->parent_data = NULL; | ||
| 853 | irq_data->domain = NULL; | ||
| 854 | |||
| 855 | while (tmp) { | ||
| 856 | irq_data = tmp; | ||
| 857 | tmp = tmp->parent_data; | ||
| 858 | kfree(irq_data); | ||
| 859 | } | ||
| 860 | } | ||
| 861 | } | ||
| 862 | |||
| 863 | static int irq_domain_alloc_irq_data(struct irq_domain *domain, | ||
| 864 | unsigned int virq, unsigned int nr_irqs) | ||
| 865 | { | ||
| 866 | struct irq_data *irq_data; | ||
| 867 | struct irq_domain *parent; | ||
| 868 | int i; | ||
| 869 | |||
| 870 | /* The outermost irq_data is embedded in struct irq_desc */ | ||
| 871 | for (i = 0; i < nr_irqs; i++) { | ||
| 872 | irq_data = irq_get_irq_data(virq + i); | ||
| 873 | irq_data->domain = domain; | ||
| 874 | |||
| 875 | for (parent = domain->parent; parent; parent = parent->parent) { | ||
| 876 | irq_data = irq_domain_insert_irq_data(parent, irq_data); | ||
| 877 | if (!irq_data) { | ||
| 878 | irq_domain_free_irq_data(virq, i + 1); | ||
| 879 | return -ENOMEM; | ||
| 880 | } | ||
| 881 | } | ||
| 882 | } | ||
| 883 | |||
| 884 | return 0; | ||
| 885 | } | ||
| 886 | |||
| 887 | /** | ||
| 888 | * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain | ||
| 889 | * @domain: domain to match | ||
| 890 | * @virq: IRQ number to get irq_data | ||
| 891 | */ | ||
| 892 | struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, | ||
| 893 | unsigned int virq) | ||
| 894 | { | ||
| 895 | struct irq_data *irq_data; | ||
| 896 | |||
| 897 | for (irq_data = irq_get_irq_data(virq); irq_data; | ||
| 898 | irq_data = irq_data->parent_data) | ||
| 899 | if (irq_data->domain == domain) | ||
| 900 | return irq_data; | ||
| 901 | |||
| 902 | return NULL; | ||
| 903 | } | ||
| 904 | |||
| 905 | /** | ||
| 906 | * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain | ||
| 907 | * @domain: Interrupt domain to match | ||
| 908 | * @virq: IRQ number | ||
| 909 | * @hwirq: The hwirq number | ||
| 910 | * @chip: The associated interrupt chip | ||
| 911 | * @chip_data: The associated chip data | ||
| 912 | */ | ||
| 913 | int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, | ||
| 914 | irq_hw_number_t hwirq, struct irq_chip *chip, | ||
| 915 | void *chip_data) | ||
| 916 | { | ||
| 917 | struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); | ||
| 918 | |||
| 919 | if (!irq_data) | ||
| 920 | return -ENOENT; | ||
| 921 | |||
| 922 | irq_data->hwirq = hwirq; | ||
| 923 | irq_data->chip = chip ? chip : &no_irq_chip; | ||
| 924 | irq_data->chip_data = chip_data; | ||
| 925 | |||
| 926 | return 0; | ||
| 927 | } | ||
| 928 | |||
| 929 | /** | ||
| 930 | * irq_domain_set_info - Set the complete data for a @virq in @domain | ||
| 931 | * @domain: Interrupt domain to match | ||
| 932 | * @virq: IRQ number | ||
| 933 | * @hwirq: The hardware interrupt number | ||
| 934 | * @chip: The associated interrupt chip | ||
| 935 | * @chip_data: The associated interrupt chip data | ||
| 936 | * @handler: The interrupt flow handler | ||
| 937 | * @handler_data: The interrupt flow handler data | ||
| 938 | * @handler_name: The interrupt handler name | ||
| 939 | */ | ||
| 940 | void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, | ||
| 941 | irq_hw_number_t hwirq, struct irq_chip *chip, | ||
| 942 | void *chip_data, irq_flow_handler_t handler, | ||
| 943 | void *handler_data, const char *handler_name) | ||
| 944 | { | ||
| 945 | irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data); | ||
| 946 | __irq_set_handler(virq, handler, 0, handler_name); | ||
| 947 | irq_set_handler_data(virq, handler_data); | ||
| 948 | } | ||
| 949 | |||
| 950 | /** | ||
| 951 | * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data | ||
| 952 | * @irq_data: The pointer to irq_data | ||
| 953 | */ | ||
| 954 | void irq_domain_reset_irq_data(struct irq_data *irq_data) | ||
| 955 | { | ||
| 956 | irq_data->hwirq = 0; | ||
| 957 | irq_data->chip = &no_irq_chip; | ||
| 958 | irq_data->chip_data = NULL; | ||
| 959 | } | ||
| 960 | |||
| 961 | /** | ||
| 962 | * irq_domain_free_irqs_common - Clear irq_data and free the parent | ||
| 963 | * @domain: Interrupt domain to match | ||
| 964 | * @virq: IRQ number to start with | ||
| 965 | * @nr_irqs: The number of irqs to free | ||
| 966 | */ | ||
| 967 | void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, | ||
| 968 | unsigned int nr_irqs) | ||
| 969 | { | ||
| 970 | struct irq_data *irq_data; | ||
| 971 | int i; | ||
| 972 | |||
| 973 | for (i = 0; i < nr_irqs; i++) { | ||
| 974 | irq_data = irq_domain_get_irq_data(domain, virq + i); | ||
| 975 | if (irq_data) | ||
| 976 | irq_domain_reset_irq_data(irq_data); | ||
| 977 | } | ||
| 978 | irq_domain_free_irqs_parent(domain, virq, nr_irqs); | ||
| 979 | } | ||
| 980 | |||
| 981 | /** | ||
| 982 | * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent | ||
| 983 | * @domain: Interrupt domain to match | ||
| 984 | * @virq: IRQ number to start with | ||
| 985 | * @nr_irqs: The number of irqs to free | ||
| 986 | */ | ||
| 987 | void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, | ||
| 988 | unsigned int nr_irqs) | ||
| 989 | { | ||
| 990 | int i; | ||
| 991 | |||
| 992 | for (i = 0; i < nr_irqs; i++) { | ||
| 993 | irq_set_handler_data(virq + i, NULL); | ||
| 994 | irq_set_handler(virq + i, NULL); | ||
| 995 | } | ||
| 996 | irq_domain_free_irqs_common(domain, virq, nr_irqs); | ||
| 997 | } | ||
| 998 | |||
| 999 | static bool irq_domain_is_auto_recursive(struct irq_domain *domain) | ||
| 1000 | { | ||
| 1001 | return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE; | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static void irq_domain_free_irqs_recursive(struct irq_domain *domain, | ||
| 1005 | unsigned int irq_base, | ||
| 1006 | unsigned int nr_irqs) | ||
| 1007 | { | ||
| 1008 | domain->ops->free(domain, irq_base, nr_irqs); | ||
| 1009 | if (irq_domain_is_auto_recursive(domain)) { | ||
| 1010 | BUG_ON(!domain->parent); | ||
| 1011 | irq_domain_free_irqs_recursive(domain->parent, irq_base, | ||
| 1012 | nr_irqs); | ||
| 1013 | } | ||
| 1014 | } | ||
| 1015 | |||
| 1016 | static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, | ||
| 1017 | unsigned int irq_base, | ||
| 1018 | unsigned int nr_irqs, void *arg) | ||
| 1019 | { | ||
| 1020 | int ret = 0; | ||
| 1021 | struct irq_domain *parent = domain->parent; | ||
| 1022 | bool recursive = irq_domain_is_auto_recursive(domain); | ||
| 1023 | |||
| 1024 | BUG_ON(recursive && !parent); | ||
| 1025 | if (recursive) | ||
| 1026 | ret = irq_domain_alloc_irqs_recursive(parent, irq_base, | ||
| 1027 | nr_irqs, arg); | ||
| 1028 | if (ret >= 0) | ||
| 1029 | ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); | ||
| 1030 | if (ret < 0 && recursive) | ||
| 1031 | irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); | ||
| 1032 | |||
| 1033 | return ret; | ||
| 1034 | } | ||
| 1035 | |||
| 1036 | /** | ||
| 1037 | * __irq_domain_alloc_irqs - Allocate IRQs from domain | ||
| 1038 | * @domain: domain to allocate from | ||
| 1039 | * @irq_base: allocate specified IRQ number if irq_base >= 0 | ||
| 1040 | * @nr_irqs: number of IRQs to allocate | ||
| 1041 | * @node: NUMA node id for memory allocation | ||
| 1042 | * @arg: domain specific argument | ||
| 1043 | * @realloc: IRQ descriptors have already been allocated if true | ||
| 1044 | * | ||
| 1045 | * Allocate IRQ numbers and initialize all data structures to support | ||
| 1046 | * hierarchical IRQ domains. | ||
| 1047 | * Parameter @realloc is mainly to support legacy IRQs. | ||
| 1048 | * Returns error code or allocated IRQ number | ||
| 1049 | * | ||
| 1050 | * The whole process to setup an IRQ has been split into two steps. | ||
| 1051 | * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ | ||
| 1052 | * descriptors and required hardware resources. The second step, | ||
| 1053 | * irq_domain_activate_irq(), is to program the hardware with preallocated | ||
| 1054 | * resources. In this way, it's easier to rollback when failing to | ||
| 1055 | * allocate resources. | ||
| 1056 | */ | ||
| 1057 | int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, | ||
| 1058 | unsigned int nr_irqs, int node, void *arg, | ||
| 1059 | bool realloc) | ||
| 1060 | { | ||
| 1061 | int i, ret, virq; | ||
| 1062 | |||
| 1063 | if (domain == NULL) { | ||
| 1064 | domain = irq_default_domain; | ||
| 1065 | if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) | ||
| 1066 | return -EINVAL; | ||
| 1067 | } | ||
| 1068 | |||
| 1069 | if (!domain->ops->alloc) { | ||
| 1070 | pr_debug("domain->ops->alloc() is NULL\n"); | ||
| 1071 | return -ENOSYS; | ||
| 1072 | } | ||
| 1073 | |||
| 1074 | if (realloc && irq_base >= 0) { | ||
| 1075 | virq = irq_base; | ||
| 1076 | } else { | ||
| 1077 | virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); | ||
| 1078 | if (virq < 0) { | ||
| 1079 | pr_debug("cannot allocate IRQ(base %d, count %d)\n", | ||
| 1080 | irq_base, nr_irqs); | ||
| 1081 | return virq; | ||
| 1082 | } | ||
| 1083 | } | ||
| 1084 | |||
| 1085 | if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) { | ||
| 1086 | pr_debug("cannot allocate memory for IRQ%d\n", virq); | ||
| 1087 | ret = -ENOMEM; | ||
| 1088 | goto out_free_desc; | ||
| 1089 | } | ||
| 1090 | |||
| 1091 | mutex_lock(&irq_domain_mutex); | ||
| 1092 | ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg); | ||
| 1093 | if (ret < 0) { | ||
| 1094 | mutex_unlock(&irq_domain_mutex); | ||
| 1095 | goto out_free_irq_data; | ||
| 1096 | } | ||
| 1097 | for (i = 0; i < nr_irqs; i++) | ||
| 1098 | irq_domain_insert_irq(virq + i); | ||
| 1099 | mutex_unlock(&irq_domain_mutex); | ||
| 1100 | |||
| 1101 | return virq; | ||
| 1102 | |||
| 1103 | out_free_irq_data: | ||
| 1104 | irq_domain_free_irq_data(virq, nr_irqs); | ||
| 1105 | out_free_desc: | ||
| 1106 | irq_free_descs(virq, nr_irqs); | ||
| 1107 | return ret; | ||
| 1108 | } | ||
| 1109 | |||
| 1110 | /** | ||
| 1111 | * irq_domain_free_irqs - Free IRQ number and associated data structures | ||
| 1112 | * @virq: base IRQ number | ||
| 1113 | * @nr_irqs: number of IRQs to free | ||
| 1114 | */ | ||
| 1115 | void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) | ||
| 1116 | { | ||
| 1117 | struct irq_data *data = irq_get_irq_data(virq); | ||
| 1118 | int i; | ||
| 1119 | |||
| 1120 | if (WARN(!data || !data->domain || !data->domain->ops->free, | ||
| 1121 | "NULL pointer, cannot free irq\n")) | ||
| 1122 | return; | ||
| 1123 | |||
| 1124 | mutex_lock(&irq_domain_mutex); | ||
| 1125 | for (i = 0; i < nr_irqs; i++) | ||
| 1126 | irq_domain_remove_irq(virq + i); | ||
| 1127 | irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs); | ||
| 1128 | mutex_unlock(&irq_domain_mutex); | ||
| 1129 | |||
| 1130 | irq_domain_free_irq_data(virq, nr_irqs); | ||
| 1131 | irq_free_descs(virq, nr_irqs); | ||
| 1132 | } | ||
| 1133 | |||
| 1134 | /** | ||
| 1135 | * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain | ||
| 1136 | * @irq_base: Base IRQ number | ||
| 1137 | * @nr_irqs: Number of IRQs to allocate | ||
| 1138 | * @arg: Allocation data (arch/domain specific) | ||
| 1139 | * | ||
| 1140 | * Check whether the domain has been set up as recursive. If not, allocate | ||
| 1141 | * through the parent domain. | ||
| 1142 | */ | ||
| 1143 | int irq_domain_alloc_irqs_parent(struct irq_domain *domain, | ||
| 1144 | unsigned int irq_base, unsigned int nr_irqs, | ||
| 1145 | void *arg) | ||
| 1146 | { | ||
| 1147 | /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */ | ||
| 1148 | if (irq_domain_is_auto_recursive(domain)) | ||
| 1149 | return 0; | ||
| 1150 | |||
| 1151 | domain = domain->parent; | ||
| 1152 | if (domain) | ||
| 1153 | return irq_domain_alloc_irqs_recursive(domain, irq_base, | ||
| 1154 | nr_irqs, arg); | ||
| 1155 | return -ENOSYS; | ||
| 1156 | } | ||
| 1157 | |||
| 1158 | /** | ||
| 1159 | * irq_domain_free_irqs_parent - Free interrupts from parent domain | ||
| 1160 | * @irq_base: Base IRQ number | ||
| 1161 | * @nr_irqs: Number of IRQs to free | ||
| 1162 | * | ||
| 1163 | * Check whether the domain has been set up as recursive. If not, free | ||
| 1164 | * through the parent domain. | ||
| 1165 | */ | ||
| 1166 | void irq_domain_free_irqs_parent(struct irq_domain *domain, | ||
| 1167 | unsigned int irq_base, unsigned int nr_irqs) | ||
| 1168 | { | ||
| 1169 | /* irq_domain_free_irqs_recursive() will call parent's free */ | ||
| 1170 | if (!irq_domain_is_auto_recursive(domain) && domain->parent) | ||
| 1171 | irq_domain_free_irqs_recursive(domain->parent, irq_base, | ||
| 1172 | nr_irqs); | ||
| 1173 | } | ||
| 1174 | |||
| 1175 | /** | ||
| 1176 | * irq_domain_activate_irq - Call domain_ops->activate recursively to activate | ||
| 1177 | * interrupt | ||
| 1178 | * @irq_data: outermost irq_data associated with interrupt | ||
| 1179 | * | ||
| 1180 | * This is the second step to call domain_ops->activate to program the | ||
| 1181 | * interrupt controllers, so the interrupt can actually be delivered. | ||
| 1182 | */ | ||
| 1183 | void irq_domain_activate_irq(struct irq_data *irq_data) | ||
| 1184 | { | ||
| 1185 | if (irq_data && irq_data->domain) { | ||
| 1186 | struct irq_domain *domain = irq_data->domain; | ||
| 1187 | |||
| 1188 | if (irq_data->parent_data) | ||
| 1189 | irq_domain_activate_irq(irq_data->parent_data); | ||
| 1190 | if (domain->ops->activate) | ||
| 1191 | domain->ops->activate(domain, irq_data); | ||
| 1192 | } | ||
| 1193 | } | ||
| 1194 | |||
| 1195 | /** | ||
| 1196 | * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to | ||
| 1197 | * deactivate interrupt | ||
| 1198 | * @irq_data: outermost irq_data associated with interrupt | ||
| 1199 | * | ||
| 1200 | * It calls domain_ops->deactivate to program interrupt controllers to disable | ||
| 1201 | * interrupt delivery. | ||
| 1202 | */ | ||
| 1203 | void irq_domain_deactivate_irq(struct irq_data *irq_data) | ||
| 1204 | { | ||
| 1205 | if (irq_data && irq_data->domain) { | ||
| 1206 | struct irq_domain *domain = irq_data->domain; | ||
| 1207 | |||
| 1208 | if (domain->ops->deactivate) | ||
| 1209 | domain->ops->deactivate(domain, irq_data); | ||
| 1210 | if (irq_data->parent_data) | ||
| 1211 | irq_domain_deactivate_irq(irq_data->parent_data); | ||
| 1212 | } | ||
| 1213 | } | ||
| 1214 | |||
| 1215 | static void irq_domain_check_hierarchy(struct irq_domain *domain) | ||
| 1216 | { | ||
| 1217 | /* Hierarchy irq_domains must implement callback alloc() */ | ||
| 1218 | if (domain->ops->alloc) | ||
| 1219 | domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; | ||
| 1220 | } | ||
| 1221 | #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ | ||
| 1222 | /** | ||
| 1223 | * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain | ||
| 1224 | * @domain: domain to match | ||
| 1225 | * @virq: IRQ number to get irq_data | ||
| 1226 | */ | ||
| 1227 | struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, | ||
| 1228 | unsigned int virq) | ||
| 1229 | { | ||
| 1230 | struct irq_data *irq_data = irq_get_irq_data(virq); | ||
| 1231 | |||
| 1232 | return (irq_data && irq_data->domain == domain) ? irq_data : NULL; | ||
| 1233 | } | ||
| 1234 | |||
| 1235 | static void irq_domain_check_hierarchy(struct irq_domain *domain) | ||
| 1236 | { | ||
| 1237 | } | ||
| 1238 | #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ | ||
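The hierarchy helpers added above (irq_domain_alloc_irqs_parent(), irq_domain_set_hwirq_and_chip(), irq_domain_activate_irq()) are intended to be called from a stacked domain's own callbacks. The following is only a rough sketch of such a callback, not part of the patch; the my_* names and the hwirq encoding in @arg are hypothetical assumptions.

    /* Sketch: a stacked domain's .alloc delegating to its parent first. */
    static struct irq_chip my_irq_chip;         /* hypothetical chip */

    static int my_domain_alloc(struct irq_domain *domain, unsigned int virq,
                               unsigned int nr_irqs, void *arg)
    {
            irq_hw_number_t hwirq = *(irq_hw_number_t *)arg; /* assumed encoding */
            int i, ret;

            /* Let the parent domain (e.g. the vector domain) allocate first. */
            ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
            if (ret < 0)
                    return ret;

            /* Then install this level's hwirq and chip for each interrupt. */
            for (i = 0; i < nr_irqs; i++)
                    irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
                                                  &my_irq_chip, NULL);
            return 0;
    }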
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a9104b4608b..80692373abd6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -183,6 +183,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | |||
| 183 | ret = chip->irq_set_affinity(data, mask, force); | 183 | ret = chip->irq_set_affinity(data, mask, force); |
| 184 | switch (ret) { | 184 | switch (ret) { |
| 185 | case IRQ_SET_MASK_OK: | 185 | case IRQ_SET_MASK_OK: |
| 186 | case IRQ_SET_MASK_OK_DONE: | ||
| 186 | cpumask_copy(data->affinity, mask); | 187 | cpumask_copy(data->affinity, mask); |
| 187 | case IRQ_SET_MASK_OK_NOCOPY: | 188 | case IRQ_SET_MASK_OK_NOCOPY: |
| 188 | irq_set_thread_affinity(desc); | 189 | irq_set_thread_affinity(desc); |
| @@ -600,6 +601,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
| 600 | 601 | ||
| 601 | switch (ret) { | 602 | switch (ret) { |
| 602 | case IRQ_SET_MASK_OK: | 603 | case IRQ_SET_MASK_OK: |
| 604 | case IRQ_SET_MASK_OK_DONE: | ||
| 603 | irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); | 605 | irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); |
| 604 | irqd_set(&desc->irq_data, flags); | 606 | irqd_set(&desc->irq_data, flags); |
| 605 | 607 | ||
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c new file mode 100644 index 000000000000..3e18163f336f --- /dev/null +++ b/kernel/irq/msi.c | |||
| @@ -0,0 +1,330 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/irq/msi.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2014 Intel Corp. | ||
| 5 | * Author: Jiang Liu <jiang.liu@linux.intel.com> | ||
| 6 | * | ||
| 7 | * This file is licensed under GPLv2. | ||
| 8 | * | ||
| 9 | * This file contains common code to support Message Signaled Interrupts (MSI) | ||
| 10 | * for both PCI-compatible and non-PCI-compatible devices. | ||
| 11 | */ | ||
| 12 | #include <linux/types.h> | ||
| 13 | #include <linux/device.h> | ||
| 14 | #include <linux/irq.h> | ||
| 15 | #include <linux/irqdomain.h> | ||
| 16 | #include <linux/msi.h> | ||
| 17 | |||
| 18 | /* Temporary solution for building, will be removed later */ | ||
| 19 | #include <linux/pci.h> | ||
| 20 | |||
| 21 | void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) | ||
| 22 | { | ||
| 23 | *msg = entry->msg; | ||
| 24 | } | ||
| 25 | |||
| 26 | void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) | ||
| 27 | { | ||
| 28 | struct msi_desc *entry = irq_get_msi_desc(irq); | ||
| 29 | |||
| 30 | __get_cached_msi_msg(entry, msg); | ||
| 31 | } | ||
| 32 | EXPORT_SYMBOL_GPL(get_cached_msi_msg); | ||
| 33 | |||
| 34 | #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN | ||
| 35 | static inline void irq_chip_write_msi_msg(struct irq_data *data, | ||
| 36 | struct msi_msg *msg) | ||
| 37 | { | ||
| 38 | data->chip->irq_write_msi_msg(data, msg); | ||
| 39 | } | ||
| 40 | |||
| 41 | /** | ||
| 42 | * msi_domain_set_affinity - Generic affinity setter function for MSI domains | ||
| 43 | * @irq_data: The irq data associated to the interrupt | ||
| 44 | * @mask: The affinity mask to set | ||
| 45 | * @force: Flag to enforce setting (disable online checks) | ||
| 46 | * | ||
| 47 | * Intended to be used by MSI interrupt controllers which are | ||
| 48 | * implemented with hierarchical domains. | ||
| 49 | */ | ||
| 50 | int msi_domain_set_affinity(struct irq_data *irq_data, | ||
| 51 | const struct cpumask *mask, bool force) | ||
| 52 | { | ||
| 53 | struct irq_data *parent = irq_data->parent_data; | ||
| 54 | struct msi_msg msg; | ||
| 55 | int ret; | ||
| 56 | |||
| 57 | ret = parent->chip->irq_set_affinity(parent, mask, force); | ||
| 58 | if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { | ||
| 59 | BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); | ||
| 60 | irq_chip_write_msi_msg(irq_data, &msg); | ||
| 61 | } | ||
| 62 | |||
| 63 | return ret; | ||
| 64 | } | ||
| 65 | |||
| 66 | static void msi_domain_activate(struct irq_domain *domain, | ||
| 67 | struct irq_data *irq_data) | ||
| 68 | { | ||
| 69 | struct msi_msg msg; | ||
| 70 | |||
| 71 | BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); | ||
| 72 | irq_chip_write_msi_msg(irq_data, &msg); | ||
| 73 | } | ||
| 74 | |||
| 75 | static void msi_domain_deactivate(struct irq_domain *domain, | ||
| 76 | struct irq_data *irq_data) | ||
| 77 | { | ||
| 78 | struct msi_msg msg; | ||
| 79 | |||
| 80 | memset(&msg, 0, sizeof(msg)); | ||
| 81 | irq_chip_write_msi_msg(irq_data, &msg); | ||
| 82 | } | ||
| 83 | |||
| 84 | static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, | ||
| 85 | unsigned int nr_irqs, void *arg) | ||
| 86 | { | ||
| 87 | struct msi_domain_info *info = domain->host_data; | ||
| 88 | struct msi_domain_ops *ops = info->ops; | ||
| 89 | irq_hw_number_t hwirq = ops->get_hwirq(info, arg); | ||
| 90 | int i, ret; | ||
| 91 | |||
| 92 | if (irq_find_mapping(domain, hwirq) > 0) | ||
| 93 | return -EEXIST; | ||
| 94 | |||
| 95 | ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); | ||
| 96 | if (ret < 0) | ||
| 97 | return ret; | ||
| 98 | |||
| 99 | for (i = 0; i < nr_irqs; i++) { | ||
| 100 | ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); | ||
| 101 | if (ret < 0) { | ||
| 102 | if (ops->msi_free) { | ||
| 103 | for (i--; i >= 0; i--) | ||
| 104 | ops->msi_free(domain, info, virq + i); | ||
| 105 | } | ||
| 106 | irq_domain_free_irqs_top(domain, virq, nr_irqs); | ||
| 107 | return ret; | ||
| 108 | } | ||
| 109 | } | ||
| 110 | |||
| 111 | return 0; | ||
| 112 | } | ||
| 113 | |||
| 114 | static void msi_domain_free(struct irq_domain *domain, unsigned int virq, | ||
| 115 | unsigned int nr_irqs) | ||
| 116 | { | ||
| 117 | struct msi_domain_info *info = domain->host_data; | ||
| 118 | int i; | ||
| 119 | |||
| 120 | if (info->ops->msi_free) { | ||
| 121 | for (i = 0; i < nr_irqs; i++) | ||
| 122 | info->ops->msi_free(domain, info, virq + i); | ||
| 123 | } | ||
| 124 | irq_domain_free_irqs_top(domain, virq, nr_irqs); | ||
| 125 | } | ||
| 126 | |||
| 127 | static struct irq_domain_ops msi_domain_ops = { | ||
| 128 | .alloc = msi_domain_alloc, | ||
| 129 | .free = msi_domain_free, | ||
| 130 | .activate = msi_domain_activate, | ||
| 131 | .deactivate = msi_domain_deactivate, | ||
| 132 | }; | ||
| 133 | |||
| 134 | #ifdef GENERIC_MSI_DOMAIN_OPS | ||
| 135 | static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, | ||
| 136 | msi_alloc_info_t *arg) | ||
| 137 | { | ||
| 138 | return arg->hwirq; | ||
| 139 | } | ||
| 140 | |||
| 141 | static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev, | ||
| 142 | int nvec, msi_alloc_info_t *arg) | ||
| 143 | { | ||
| 144 | memset(arg, 0, sizeof(*arg)); | ||
| 145 | return 0; | ||
| 146 | } | ||
| 147 | |||
| 148 | static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, | ||
| 149 | struct msi_desc *desc) | ||
| 150 | { | ||
| 151 | arg->desc = desc; | ||
| 152 | } | ||
| 153 | #else | ||
| 154 | #define msi_domain_ops_get_hwirq NULL | ||
| 155 | #define msi_domain_ops_prepare NULL | ||
| 156 | #define msi_domain_ops_set_desc NULL | ||
| 157 | #endif /* !GENERIC_MSI_DOMAIN_OPS */ | ||
| 158 | |||
| 159 | static int msi_domain_ops_init(struct irq_domain *domain, | ||
| 160 | struct msi_domain_info *info, | ||
| 161 | unsigned int virq, irq_hw_number_t hwirq, | ||
| 162 | msi_alloc_info_t *arg) | ||
| 163 | { | ||
| 164 | irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip, | ||
| 165 | info->chip_data); | ||
| 166 | if (info->handler && info->handler_name) { | ||
| 167 | __irq_set_handler(virq, info->handler, 0, info->handler_name); | ||
| 168 | if (info->handler_data) | ||
| 169 | irq_set_handler_data(virq, info->handler_data); | ||
| 170 | } | ||
| 171 | return 0; | ||
| 172 | } | ||
| 173 | |||
| 174 | static int msi_domain_ops_check(struct irq_domain *domain, | ||
| 175 | struct msi_domain_info *info, | ||
| 176 | struct device *dev) | ||
| 177 | { | ||
| 178 | return 0; | ||
| 179 | } | ||
| 180 | |||
| 181 | static struct msi_domain_ops msi_domain_ops_default = { | ||
| 182 | .get_hwirq = msi_domain_ops_get_hwirq, | ||
| 183 | .msi_init = msi_domain_ops_init, | ||
| 184 | .msi_check = msi_domain_ops_check, | ||
| 185 | .msi_prepare = msi_domain_ops_prepare, | ||
| 186 | .set_desc = msi_domain_ops_set_desc, | ||
| 187 | }; | ||
| 188 | |||
| 189 | static void msi_domain_update_dom_ops(struct msi_domain_info *info) | ||
| 190 | { | ||
| 191 | struct msi_domain_ops *ops = info->ops; | ||
| 192 | |||
| 193 | if (ops == NULL) { | ||
| 194 | info->ops = &msi_domain_ops_default; | ||
| 195 | return; | ||
| 196 | } | ||
| 197 | |||
| 198 | if (ops->get_hwirq == NULL) | ||
| 199 | ops->get_hwirq = msi_domain_ops_default.get_hwirq; | ||
| 200 | if (ops->msi_init == NULL) | ||
| 201 | ops->msi_init = msi_domain_ops_default.msi_init; | ||
| 202 | if (ops->msi_check == NULL) | ||
| 203 | ops->msi_check = msi_domain_ops_default.msi_check; | ||
| 204 | if (ops->msi_prepare == NULL) | ||
| 205 | ops->msi_prepare = msi_domain_ops_default.msi_prepare; | ||
| 206 | if (ops->set_desc == NULL) | ||
| 207 | ops->set_desc = msi_domain_ops_default.set_desc; | ||
| 208 | } | ||
| 209 | |||
| 210 | static void msi_domain_update_chip_ops(struct msi_domain_info *info) | ||
| 211 | { | ||
| 212 | struct irq_chip *chip = info->chip; | ||
| 213 | |||
| 214 | BUG_ON(!chip); | ||
| 215 | if (!chip->irq_mask) | ||
| 216 | chip->irq_mask = pci_msi_mask_irq; | ||
| 217 | if (!chip->irq_unmask) | ||
| 218 | chip->irq_unmask = pci_msi_unmask_irq; | ||
| 219 | if (!chip->irq_set_affinity) | ||
| 220 | chip->irq_set_affinity = msi_domain_set_affinity; | ||
| 221 | } | ||
| 222 | |||
| 223 | /** | ||
| 224 | * msi_create_irq_domain - Create a MSI interrupt domain | ||
| 225 | * @of_node: Optional device-tree node of the interrupt controller | ||
| 226 | * @info: MSI domain info | ||
| 227 | * @parent: Parent irq domain | ||
| 228 | */ | ||
| 229 | struct irq_domain *msi_create_irq_domain(struct device_node *node, | ||
| 230 | struct msi_domain_info *info, | ||
| 231 | struct irq_domain *parent) | ||
| 232 | { | ||
| 233 | if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) | ||
| 234 | msi_domain_update_dom_ops(info); | ||
| 235 | if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) | ||
| 236 | msi_domain_update_chip_ops(info); | ||
| 237 | |||
| 238 | return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops, | ||
| 239 | info); | ||
| 240 | } | ||
| 241 | |||
| 242 | /** | ||
| 243 | * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain | ||
| 244 | * @domain: The domain to allocate from | ||
| 245 | * @dev: Pointer to device struct of the device for which the interrupts | ||
| 246 | * are allocated | ||
| 247 | * @nvec: The number of interrupts to allocate | ||
| 248 | * | ||
| 249 | * Returns 0 on success or an error code. | ||
| 250 | */ | ||
| 251 | int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, | ||
| 252 | int nvec) | ||
| 253 | { | ||
| 254 | struct msi_domain_info *info = domain->host_data; | ||
| 255 | struct msi_domain_ops *ops = info->ops; | ||
| 256 | msi_alloc_info_t arg; | ||
| 257 | struct msi_desc *desc; | ||
| 258 | int i, ret, virq = -1; | ||
| 259 | |||
| 260 | ret = ops->msi_check(domain, info, dev); | ||
| 261 | if (ret == 0) | ||
| 262 | ret = ops->msi_prepare(domain, dev, nvec, &arg); | ||
| 263 | if (ret) | ||
| 264 | return ret; | ||
| 265 | |||
| 266 | for_each_msi_entry(desc, dev) { | ||
| 267 | ops->set_desc(&arg, desc); | ||
| 268 | if (info->flags & MSI_FLAG_IDENTITY_MAP) | ||
| 269 | virq = (int)ops->get_hwirq(info, &arg); | ||
| 270 | else | ||
| 271 | virq = -1; | ||
| 272 | |||
| 273 | virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, | ||
| 274 | dev_to_node(dev), &arg, false); | ||
| 275 | if (virq < 0) { | ||
| 276 | ret = -ENOSPC; | ||
| 277 | if (ops->handle_error) | ||
| 278 | ret = ops->handle_error(domain, desc, ret); | ||
| 279 | if (ops->msi_finish) | ||
| 280 | ops->msi_finish(&arg, ret); | ||
| 281 | return ret; | ||
| 282 | } | ||
| 283 | |||
| 284 | for (i = 0; i < desc->nvec_used; i++) | ||
| 285 | irq_set_msi_desc_off(virq, i, desc); | ||
| 286 | } | ||
| 287 | |||
| 288 | if (ops->msi_finish) | ||
| 289 | ops->msi_finish(&arg, 0); | ||
| 290 | |||
| 291 | for_each_msi_entry(desc, dev) { | ||
| 292 | if (desc->nvec_used == 1) | ||
| 293 | dev_dbg(dev, "irq %d for MSI\n", virq); | ||
| 294 | else | ||
| 295 | dev_dbg(dev, "irq [%d-%d] for MSI\n", | ||
| 296 | virq, virq + desc->nvec_used - 1); | ||
| 297 | } | ||
| 298 | |||
| 299 | return 0; | ||
| 300 | } | ||
| 301 | |||
| 302 | /** | ||
| 303 | * msi_domain_free_irqs - Free interrupts from an MSI interrupt @domain associated to @dev | ||
| 304 | * @domain: The domain managing the interrupts | ||
| 305 | * @dev: Pointer to device struct of the device for which the interrupts | ||
| 306 | * are freed | ||
| 307 | */ | ||
| 308 | void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) | ||
| 309 | { | ||
| 310 | struct msi_desc *desc; | ||
| 311 | |||
| 312 | for_each_msi_entry(desc, dev) { | ||
| 313 | irq_domain_free_irqs(desc->irq, desc->nvec_used); | ||
| 314 | desc->irq = 0; | ||
| 315 | } | ||
| 316 | } | ||
| 317 | |||
| 318 | /** | ||
| 319 | * msi_get_domain_info - Get the MSI interrupt domain info for @domain | ||
| 320 | * @domain: The interrupt domain to retrieve data from | ||
| 321 | * | ||
| 322 | * Returns the pointer to the msi_domain_info stored in | ||
| 323 | * @domain->host_data. | ||
| 324 | */ | ||
| 325 | struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) | ||
| 326 | { | ||
| 327 | return (struct msi_domain_info *)domain->host_data; | ||
| 328 | } | ||
| 329 | |||
| 330 | #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ | ||
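For context, here is a rough sketch of how an interrupt controller driver might sit on top of the new generic MSI layer above. It assumes msi_desc entries have already been attached to the device; the my_* names are hypothetical and not part of this patch.

    static struct irq_chip my_msi_chip = {
            .name = "my-msi",
            /* irq_mask/irq_unmask/irq_set_affinity are filled in by
             * msi_domain_update_chip_ops() via MSI_FLAG_USE_DEF_CHIP_OPS. */
    };

    static struct msi_domain_info my_msi_info = {
            .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
            .chip  = &my_msi_chip,
    };

    static int my_msi_init(struct device_node *node, struct irq_domain *parent,
                           struct device *dev, int nvec)
    {
            struct irq_domain *d;
            int ret;

            /* Merges msi_domain_ops_default into my_msi_info and stacks the
             * new MSI domain on top of @parent. */
            d = msi_create_irq_domain(node, &my_msi_info, parent);
            if (!d)
                    return -ENOMEM;

            /* Allocates one Linux irq per msi_desc already attached to @dev. */
            ret = msi_domain_alloc_irqs(d, dev, nvec);
            if (ret)
                    irq_domain_remove(d);
            return ret;
    }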
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index ac1ba2f11032..9dc9bfd8a678 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -15,6 +15,23 @@ | |||
| 15 | 15 | ||
| 16 | #include "internals.h" | 16 | #include "internals.h" |
| 17 | 17 | ||
| 18 | /* | ||
| 19 | * Access rules: | ||
| 20 | * | ||
| 21 | * procfs protects read/write of /proc/irq/N/ files against a | ||
| 22 | * concurrent free of the interrupt descriptor. remove_proc_entry() | ||
| 23 | * immediately prevents new reads/writes from happening and waits for | ||
| 24 | * already running read/write functions to complete. | ||
| 25 | * | ||
| 26 | * We remove the proc entries first and then delete the interrupt | ||
| 27 | * descriptor from the radix tree and free it. So it is guaranteed | ||
| 28 | * that irq_to_desc(N) is valid as long as the read/writes are | ||
| 29 | * permitted by procfs. | ||
| 30 | * | ||
| 31 | * The read from /proc/interrupts is a different problem because there | ||
| 32 | * is no protection. So the lookup and the access to irqdesc | ||
| 33 | * information must be protected by sparse_irq_lock. | ||
| 34 | */ | ||
| 18 | static struct proc_dir_entry *root_irq_dir; | 35 | static struct proc_dir_entry *root_irq_dir; |
| 19 | 36 | ||
| 20 | #ifdef CONFIG_SMP | 37 | #ifdef CONFIG_SMP |
| @@ -437,9 +454,10 @@ int show_interrupts(struct seq_file *p, void *v) | |||
| 437 | seq_putc(p, '\n'); | 454 | seq_putc(p, '\n'); |
| 438 | } | 455 | } |
| 439 | 456 | ||
| 457 | irq_lock_sparse(); | ||
| 440 | desc = irq_to_desc(i); | 458 | desc = irq_to_desc(i); |
| 441 | if (!desc) | 459 | if (!desc) |
| 442 | return 0; | 460 | goto outsparse; |
| 443 | 461 | ||
| 444 | raw_spin_lock_irqsave(&desc->lock, flags); | 462 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 445 | for_each_online_cpu(j) | 463 | for_each_online_cpu(j) |
| @@ -479,6 +497,8 @@ int show_interrupts(struct seq_file *p, void *v) | |||
| 479 | seq_putc(p, '\n'); | 497 | seq_putc(p, '\n'); |
| 480 | out: | 498 | out: |
| 481 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 499 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 500 | outsparse: | ||
| 501 | irq_unlock_sparse(); | ||
| 482 | return 0; | 502 | return 0; |
| 483 | } | 503 | } |
| 484 | #endif | 504 | #endif |
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 3ab9048483fa..cbf9fb899d92 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
| @@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(irq_work_run); | |||
| 175 | 175 | ||
| 176 | void irq_work_tick(void) | 176 | void irq_work_tick(void) |
| 177 | { | 177 | { |
| 178 | struct llist_head *raised = &__get_cpu_var(raised_list); | 178 | struct llist_head *raised = this_cpu_ptr(&raised_list); |
| 179 | 179 | ||
| 180 | if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) | 180 | if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) |
| 181 | irq_work_run_list(raised); | 181 | irq_work_run_list(raised); |
| 182 | irq_work_run_list(&__get_cpu_var(lazy_list)); | 182 | irq_work_run_list(this_cpu_ptr(&lazy_list)); |
| 183 | } | 183 | } |
| 184 | 184 | ||
| 185 | /* | 185 | /* |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 2abf9f6e9a61..9a8a01abbaed 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, | |||
| 600 | if (!kexec_on_panic) { | 600 | if (!kexec_on_panic) { |
| 601 | image->swap_page = kimage_alloc_control_pages(image, 0); | 601 | image->swap_page = kimage_alloc_control_pages(image, 0); |
| 602 | if (!image->swap_page) { | 602 | if (!image->swap_page) { |
| 603 | pr_err(KERN_ERR "Could not allocate swap buffer\n"); | 603 | pr_err("Could not allocate swap buffer\n"); |
| 604 | goto out_free_control_pages; | 604 | goto out_free_control_pages; |
| 605 | } | 605 | } |
| 606 | } | 606 | } |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 80f7a6d00519..2777f40a9c7b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -47,13 +47,6 @@ extern int max_threads; | |||
| 47 | 47 | ||
| 48 | static struct workqueue_struct *khelper_wq; | 48 | static struct workqueue_struct *khelper_wq; |
| 49 | 49 | ||
| 50 | /* | ||
| 51 | * kmod_thread_locker is used for deadlock avoidance. There is no explicit | ||
| 52 | * locking to protect this global - it is private to the singleton khelper | ||
| 53 | * thread and should only ever be modified by that thread. | ||
| 54 | */ | ||
| 55 | static const struct task_struct *kmod_thread_locker; | ||
| 56 | |||
| 57 | #define CAP_BSET (void *)1 | 50 | #define CAP_BSET (void *)1 |
| 58 | #define CAP_PI (void *)2 | 51 | #define CAP_PI (void *)2 |
| 59 | 52 | ||
| @@ -223,7 +216,6 @@ static void umh_complete(struct subprocess_info *sub_info) | |||
| 223 | static int ____call_usermodehelper(void *data) | 216 | static int ____call_usermodehelper(void *data) |
| 224 | { | 217 | { |
| 225 | struct subprocess_info *sub_info = data; | 218 | struct subprocess_info *sub_info = data; |
| 226 | int wait = sub_info->wait & ~UMH_KILLABLE; | ||
| 227 | struct cred *new; | 219 | struct cred *new; |
| 228 | int retval; | 220 | int retval; |
| 229 | 221 | ||
| @@ -267,20 +259,13 @@ static int ____call_usermodehelper(void *data) | |||
| 267 | out: | 259 | out: |
| 268 | sub_info->retval = retval; | 260 | sub_info->retval = retval; |
| 269 | /* wait_for_helper() will call umh_complete if UMH_WAIT_PROC. */ | 261 | /* wait_for_helper() will call umh_complete if UMH_WAIT_PROC. */ |
| 270 | if (wait != UMH_WAIT_PROC) | 262 | if (!(sub_info->wait & UMH_WAIT_PROC)) |
| 271 | umh_complete(sub_info); | 263 | umh_complete(sub_info); |
| 272 | if (!retval) | 264 | if (!retval) |
| 273 | return 0; | 265 | return 0; |
| 274 | do_exit(0); | 266 | do_exit(0); |
| 275 | } | 267 | } |
| 276 | 268 | ||
| 277 | static int call_helper(void *data) | ||
| 278 | { | ||
| 279 | /* Worker thread started blocking khelper thread. */ | ||
| 280 | kmod_thread_locker = current; | ||
| 281 | return ____call_usermodehelper(data); | ||
| 282 | } | ||
| 283 | |||
| 284 | /* Keventd can't block, but this (a child) can. */ | 269 | /* Keventd can't block, but this (a child) can. */ |
| 285 | static int wait_for_helper(void *data) | 270 | static int wait_for_helper(void *data) |
| 286 | { | 271 | { |
| @@ -323,21 +308,14 @@ static void __call_usermodehelper(struct work_struct *work) | |||
| 323 | { | 308 | { |
| 324 | struct subprocess_info *sub_info = | 309 | struct subprocess_info *sub_info = |
| 325 | container_of(work, struct subprocess_info, work); | 310 | container_of(work, struct subprocess_info, work); |
| 326 | int wait = sub_info->wait & ~UMH_KILLABLE; | ||
| 327 | pid_t pid; | 311 | pid_t pid; |
| 328 | 312 | ||
| 329 | /* CLONE_VFORK: wait until the usermode helper has execve'd | 313 | if (sub_info->wait & UMH_WAIT_PROC) |
| 330 | * successfully We need the data structures to stay around | ||
| 331 | * until that is done. */ | ||
| 332 | if (wait == UMH_WAIT_PROC) | ||
| 333 | pid = kernel_thread(wait_for_helper, sub_info, | 314 | pid = kernel_thread(wait_for_helper, sub_info, |
| 334 | CLONE_FS | CLONE_FILES | SIGCHLD); | 315 | CLONE_FS | CLONE_FILES | SIGCHLD); |
| 335 | else { | 316 | else |
| 336 | pid = kernel_thread(call_helper, sub_info, | 317 | pid = kernel_thread(____call_usermodehelper, sub_info, |
| 337 | CLONE_VFORK | SIGCHLD); | 318 | SIGCHLD); |
| 338 | /* Worker thread stopped blocking khelper thread. */ | ||
| 339 | kmod_thread_locker = NULL; | ||
| 340 | } | ||
| 341 | 319 | ||
| 342 | if (pid < 0) { | 320 | if (pid < 0) { |
| 343 | sub_info->retval = pid; | 321 | sub_info->retval = pid; |
| @@ -571,17 +549,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | |||
| 571 | goto out; | 549 | goto out; |
| 572 | } | 550 | } |
| 573 | /* | 551 | /* |
| 574 | * Worker thread must not wait for khelper thread at below | ||
| 575 | * wait_for_completion() if the thread was created with CLONE_VFORK | ||
| 576 | * flag, for khelper thread is already waiting for the thread at | ||
| 577 | * wait_for_completion() in do_fork(). | ||
| 578 | */ | ||
| 579 | if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { | ||
| 580 | retval = -EBUSY; | ||
| 581 | goto out; | ||
| 582 | } | ||
| 583 | |||
| 584 | /* | ||
| 585 | * Set the completion pointer only if there is a waiter. | 552 | * Set the completion pointer only if there is a waiter. |
| 586 | * This makes it possible to use umh_complete to free | 553 | * This makes it possible to use umh_complete to free |
| 587 | * the data structure in case of UMH_NO_WAIT. | 554 | * the data structure in case of UMH_NO_WAIT. |
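For reference, the wait-mode semantics touched in the hunks above are what callers of the usermode helper API rely on. A minimal, hypothetical caller might look like this (the path and arguments are made up):

    static int my_run_helper(void)
    {
            char *argv[] = { "/sbin/my-helper", "--oneshot", NULL };
            char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

            /*
             * UMH_WAIT_PROC: block until the helper process exits, which is
             * the case routed through wait_for_helper() above.  UMH_WAIT_EXEC
             * or UMH_NO_WAIT takes the direct ____call_usermodehelper() path.
             */
            return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
    }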
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3995f546d0f3..ee619929cf90 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -127,7 +127,7 @@ static void *alloc_insn_page(void) | |||
| 127 | 127 | ||
| 128 | static void free_insn_page(void *page) | 128 | static void free_insn_page(void *page) |
| 129 | { | 129 | { |
| 130 | module_free(NULL, page); | 130 | module_memfree(page); |
| 131 | } | 131 | } |
| 132 | 132 | ||
| 133 | struct kprobe_insn_cache kprobe_insn_slots = { | 133 | struct kprobe_insn_cache kprobe_insn_slots = { |
| @@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | |||
| 915 | #ifdef CONFIG_KPROBES_ON_FTRACE | 915 | #ifdef CONFIG_KPROBES_ON_FTRACE |
| 916 | static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { | 916 | static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { |
| 917 | .func = kprobe_ftrace_handler, | 917 | .func = kprobe_ftrace_handler, |
| 918 | .flags = FTRACE_OPS_FL_SAVE_REGS, | 918 | .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY, |
| 919 | }; | 919 | }; |
| 920 | static int kprobe_ftrace_enabled; | 920 | static int kprobe_ftrace_enabled; |
| 921 | 921 | ||
| @@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p) | |||
| 1410 | return ret; | 1410 | return ret; |
| 1411 | } | 1411 | } |
| 1412 | 1412 | ||
| 1413 | static int check_kprobe_address_safe(struct kprobe *p, | 1413 | int __weak arch_check_ftrace_location(struct kprobe *p) |
| 1414 | struct module **probed_mod) | ||
| 1415 | { | 1414 | { |
| 1416 | int ret = 0; | ||
| 1417 | unsigned long ftrace_addr; | 1415 | unsigned long ftrace_addr; |
| 1418 | 1416 | ||
| 1419 | /* | ||
| 1420 | * If the address is located on a ftrace nop, set the | ||
| 1421 | * breakpoint to the following instruction. | ||
| 1422 | */ | ||
| 1423 | ftrace_addr = ftrace_location((unsigned long)p->addr); | 1417 | ftrace_addr = ftrace_location((unsigned long)p->addr); |
| 1424 | if (ftrace_addr) { | 1418 | if (ftrace_addr) { |
| 1425 | #ifdef CONFIG_KPROBES_ON_FTRACE | 1419 | #ifdef CONFIG_KPROBES_ON_FTRACE |
| @@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p, | |||
| 1431 | return -EINVAL; | 1425 | return -EINVAL; |
| 1432 | #endif | 1426 | #endif |
| 1433 | } | 1427 | } |
| 1428 | return 0; | ||
| 1429 | } | ||
| 1434 | 1430 | ||
| 1431 | static int check_kprobe_address_safe(struct kprobe *p, | ||
| 1432 | struct module **probed_mod) | ||
| 1433 | { | ||
| 1434 | int ret; | ||
| 1435 | |||
| 1436 | ret = arch_check_ftrace_location(p); | ||
| 1437 | if (ret) | ||
| 1438 | return ret; | ||
| 1435 | jump_label_lock(); | 1439 | jump_label_lock(); |
| 1436 | preempt_disable(); | 1440 | preempt_disable(); |
| 1437 | 1441 | ||
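The new __weak arch_check_ftrace_location() gives architectures a hook to veto or adjust probes that land on ftrace call sites before the generic registration checks run. A hypothetical arch override could be as simple as the sketch below; the policy shown is purely illustrative.

    /* Sketch of an arch override of the weak hook introduced above. */
    int arch_check_ftrace_location(struct kprobe *p)
    {
            /* This arch refuses to probe inside ftrace call sites at all. */
            if (ftrace_location((unsigned long)p->addr))
                    return -EINVAL;
            return 0;
    }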
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 5cf6731b98e9..3ef3736002d8 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c | |||
| @@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock) | |||
| 80 | DEBUG_LOCKS_WARN_ON(lock->owner != current); | 80 | DEBUG_LOCKS_WARN_ON(lock->owner != current); |
| 81 | 81 | ||
| 82 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 82 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
| 83 | mutex_clear_owner(lock); | ||
| 84 | } | 83 | } |
| 85 | 84 | ||
| 86 | /* | 85 | /* |
| 87 | * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug | 86 | * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug |
| 88 | * mutexes so that we can do it here after we've verified state. | 87 | * mutexes so that we can do it here after we've verified state. |
| 89 | */ | 88 | */ |
| 89 | mutex_clear_owner(lock); | ||
| 90 | atomic_set(&lock->count, 1); | 90 | atomic_set(&lock->count, 1); |
| 91 | } | 91 | } |
| 92 | 92 | ||
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index dadbf88c22c4..454195194d4a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
| @@ -378,8 +378,14 @@ done: | |||
| 378 | * reschedule now, before we try-lock the mutex. This avoids getting | 378 | * reschedule now, before we try-lock the mutex. This avoids getting |
| 379 | * scheduled out right after we obtained the mutex. | 379 | * scheduled out right after we obtained the mutex. |
| 380 | */ | 380 | */ |
| 381 | if (need_resched()) | 381 | if (need_resched()) { |
| 382 | /* | ||
| 383 | * We _should_ have TASK_RUNNING here, but just in case | ||
| 384 | * we do not, make it so, otherwise we might get stuck. | ||
| 385 | */ | ||
| 386 | __set_current_state(TASK_RUNNING); | ||
| 382 | schedule_preempt_disabled(); | 387 | schedule_preempt_disabled(); |
| 388 | } | ||
| 383 | 389 | ||
| 384 | return false; | 390 | return false; |
| 385 | } | 391 | } |
diff --git a/kernel/module.c b/kernel/module.c index 88cec1ddb1e3..d856e96a3cce 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -42,7 +42,6 @@ | |||
| 42 | #include <linux/vermagic.h> | 42 | #include <linux/vermagic.h> |
| 43 | #include <linux/notifier.h> | 43 | #include <linux/notifier.h> |
| 44 | #include <linux/sched.h> | 44 | #include <linux/sched.h> |
| 45 | #include <linux/stop_machine.h> | ||
| 46 | #include <linux/device.h> | 45 | #include <linux/device.h> |
| 47 | #include <linux/string.h> | 46 | #include <linux/string.h> |
| 48 | #include <linux/mutex.h> | 47 | #include <linux/mutex.h> |
| @@ -98,7 +97,7 @@ | |||
| 98 | * 1) List of modules (also safely readable with preempt_disable), | 97 | * 1) List of modules (also safely readable with preempt_disable), |
| 99 | * 2) module_use links, | 98 | * 2) module_use links, |
| 100 | * 3) module_addr_min/module_addr_max. | 99 | * 3) module_addr_min/module_addr_max. |
| 101 | * (delete uses stop_machine/add uses RCU list operations). */ | 100 | * (delete and add uses RCU list operations). */ |
| 102 | DEFINE_MUTEX(module_mutex); | 101 | DEFINE_MUTEX(module_mutex); |
| 103 | EXPORT_SYMBOL_GPL(module_mutex); | 102 | EXPORT_SYMBOL_GPL(module_mutex); |
| 104 | static LIST_HEAD(modules); | 103 | static LIST_HEAD(modules); |
| @@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list); | |||
| 158 | * Protected by module_mutex. */ | 157 | * Protected by module_mutex. */ |
| 159 | static unsigned long module_addr_min = -1UL, module_addr_max = 0; | 158 | static unsigned long module_addr_min = -1UL, module_addr_max = 0; |
| 160 | 159 | ||
| 161 | int register_module_notifier(struct notifier_block * nb) | 160 | int register_module_notifier(struct notifier_block *nb) |
| 162 | { | 161 | { |
| 163 | return blocking_notifier_chain_register(&module_notify_list, nb); | 162 | return blocking_notifier_chain_register(&module_notify_list, nb); |
| 164 | } | 163 | } |
| 165 | EXPORT_SYMBOL(register_module_notifier); | 164 | EXPORT_SYMBOL(register_module_notifier); |
| 166 | 165 | ||
| 167 | int unregister_module_notifier(struct notifier_block * nb) | 166 | int unregister_module_notifier(struct notifier_block *nb) |
| 168 | { | 167 | { |
| 169 | return blocking_notifier_chain_unregister(&module_notify_list, nb); | 168 | return blocking_notifier_chain_unregister(&module_notify_list, nb); |
| 170 | } | 169 | } |
| @@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; | |||
| 628 | 627 | ||
| 629 | EXPORT_TRACEPOINT_SYMBOL(module_get); | 628 | EXPORT_TRACEPOINT_SYMBOL(module_get); |
| 630 | 629 | ||
| 630 | /* MODULE_REF_BASE is the base reference count by kmodule loader. */ | ||
| 631 | #define MODULE_REF_BASE 1 | ||
| 632 | |||
| 631 | /* Init the unload section of the module. */ | 633 | /* Init the unload section of the module. */ |
| 632 | static int module_unload_init(struct module *mod) | 634 | static int module_unload_init(struct module *mod) |
| 633 | { | 635 | { |
| 634 | mod->refptr = alloc_percpu(struct module_ref); | 636 | /* |
| 635 | if (!mod->refptr) | 637 | * Initialize reference counter to MODULE_REF_BASE. |
| 636 | return -ENOMEM; | 638 | * refcnt == 0 means module is going. |
| 639 | */ | ||
| 640 | atomic_set(&mod->refcnt, MODULE_REF_BASE); | ||
| 637 | 641 | ||
| 638 | INIT_LIST_HEAD(&mod->source_list); | 642 | INIT_LIST_HEAD(&mod->source_list); |
| 639 | INIT_LIST_HEAD(&mod->target_list); | 643 | INIT_LIST_HEAD(&mod->target_list); |
| 640 | 644 | ||
| 641 | /* Hold reference count during initialization. */ | 645 | /* Hold reference count during initialization. */ |
| 642 | raw_cpu_write(mod->refptr->incs, 1); | 646 | atomic_inc(&mod->refcnt); |
| 643 | 647 | ||
| 644 | return 0; | 648 | return 0; |
| 645 | } | 649 | } |
| @@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod) | |||
| 721 | kfree(use); | 725 | kfree(use); |
| 722 | } | 726 | } |
| 723 | mutex_unlock(&module_mutex); | 727 | mutex_unlock(&module_mutex); |
| 724 | |||
| 725 | free_percpu(mod->refptr); | ||
| 726 | } | 728 | } |
| 727 | 729 | ||
| 728 | #ifdef CONFIG_MODULE_FORCE_UNLOAD | 730 | #ifdef CONFIG_MODULE_FORCE_UNLOAD |
| @@ -740,60 +742,48 @@ static inline int try_force_unload(unsigned int flags) | |||
| 740 | } | 742 | } |
| 741 | #endif /* CONFIG_MODULE_FORCE_UNLOAD */ | 743 | #endif /* CONFIG_MODULE_FORCE_UNLOAD */ |
| 742 | 744 | ||
| 743 | struct stopref | 745 | /* Try to release refcount of module, 0 means success. */ |
| 746 | static int try_release_module_ref(struct module *mod) | ||
| 744 | { | 747 | { |
| 745 | struct module *mod; | 748 | int ret; |
| 746 | int flags; | ||
| 747 | int *forced; | ||
| 748 | }; | ||
| 749 | 749 | ||
| 750 | /* Whole machine is stopped with interrupts off when this runs. */ | 750 | /* Try to decrement refcnt which we set at loading */ |
| 751 | static int __try_stop_module(void *_sref) | 751 | ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt); |
| 752 | { | 752 | BUG_ON(ret < 0); |
| 753 | struct stopref *sref = _sref; | 753 | if (ret) |
| 754 | /* Someone else may put the last references right now; recover by re-checking */ | ||
| 755 | ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0); | ||
| 756 | |||
| 757 | return ret; | ||
| 758 | } | ||
| 754 | 759 | ||
| 760 | static int try_stop_module(struct module *mod, int flags, int *forced) | ||
| 761 | { | ||
| 755 | /* If it's not unused, quit unless we're forcing. */ | 762 | /* If it's not unused, quit unless we're forcing. */ |
| 756 | if (module_refcount(sref->mod) != 0) { | 763 | if (try_release_module_ref(mod) != 0) { |
| 757 | if (!(*sref->forced = try_force_unload(sref->flags))) | 764 | *forced = try_force_unload(flags); |
| 765 | if (!(*forced)) | ||
| 758 | return -EWOULDBLOCK; | 766 | return -EWOULDBLOCK; |
| 759 | } | 767 | } |
| 760 | 768 | ||
| 761 | /* Mark it as dying. */ | 769 | /* Mark it as dying. */ |
| 762 | sref->mod->state = MODULE_STATE_GOING; | 770 | mod->state = MODULE_STATE_GOING; |
| 763 | return 0; | ||
| 764 | } | ||
| 765 | 771 | ||
| 766 | static int try_stop_module(struct module *mod, int flags, int *forced) | 772 | return 0; |
| 767 | { | ||
| 768 | struct stopref sref = { mod, flags, forced }; | ||
| 769 | |||
| 770 | return stop_machine(__try_stop_module, &sref, NULL); | ||
| 771 | } | 773 | } |
| 772 | 774 | ||
| 773 | unsigned long module_refcount(struct module *mod) | 775 | /** |
| 776 | * module_refcount - return the refcount or -1 if unloading | ||
| 777 | * | ||
| 778 | * @mod: the module we're checking | ||
| 779 | * | ||
| 780 | * Returns: | ||
| 781 | * -1 if the module is in the process of unloading | ||
| 782 | * otherwise the number of references in the kernel to the module | ||
| 783 | */ | ||
| 784 | int module_refcount(struct module *mod) | ||
| 774 | { | 785 | { |
| 775 | unsigned long incs = 0, decs = 0; | 786 | return atomic_read(&mod->refcnt) - MODULE_REF_BASE; |
| 776 | int cpu; | ||
| 777 | |||
| 778 | for_each_possible_cpu(cpu) | ||
| 779 | decs += per_cpu_ptr(mod->refptr, cpu)->decs; | ||
| 780 | /* | ||
| 781 | * ensure the incs are added up after the decs. | ||
| 782 | * module_put ensures incs are visible before decs with smp_wmb. | ||
| 783 | * | ||
| 784 | * This 2-count scheme avoids the situation where the refcount | ||
| 785 | * for CPU0 is read, then CPU0 increments the module refcount, | ||
| 786 | * then CPU1 drops that refcount, then the refcount for CPU1 is | ||
| 787 | * read. We would record a decrement but not its corresponding | ||
| 788 | * increment so we would see a low count (disaster). | ||
| 789 | * | ||
| 790 | * Rare situation? But module_refcount can be preempted, and we | ||
| 791 | * might be tallying up 4096+ CPUs. So it is not impossible. | ||
| 792 | */ | ||
| 793 | smp_rmb(); | ||
| 794 | for_each_possible_cpu(cpu) | ||
| 795 | incs += per_cpu_ptr(mod->refptr, cpu)->incs; | ||
| 796 | return incs - decs; | ||
| 797 | } | 787 | } |
| 798 | EXPORT_SYMBOL(module_refcount); | 788 | EXPORT_SYMBOL(module_refcount); |
| 799 | 789 | ||
| @@ -875,10 +865,12 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) | |||
| 875 | struct module_use *use; | 865 | struct module_use *use; |
| 876 | int printed_something = 0; | 866 | int printed_something = 0; |
| 877 | 867 | ||
| 878 | seq_printf(m, " %lu ", module_refcount(mod)); | 868 | seq_printf(m, " %i ", module_refcount(mod)); |
| 879 | 869 | ||
| 880 | /* Always include a trailing , so userspace can differentiate | 870 | /* |
| 881 | between this and the old multi-field proc format. */ | 871 | * Always include a trailing , so userspace can differentiate |
| 872 | * between this and the old multi-field proc format. | ||
| 873 | */ | ||
| 882 | list_for_each_entry(use, &mod->source_list, source_list) { | 874 | list_for_each_entry(use, &mod->source_list, source_list) { |
| 883 | printed_something = 1; | 875 | printed_something = 1; |
| 884 | seq_printf(m, "%s,", use->source->name); | 876 | seq_printf(m, "%s,", use->source->name); |
| @@ -886,11 +878,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) | |||
| 886 | 878 | ||
| 887 | if (mod->init != NULL && mod->exit == NULL) { | 879 | if (mod->init != NULL && mod->exit == NULL) { |
| 888 | printed_something = 1; | 880 | printed_something = 1; |
| 889 | seq_printf(m, "[permanent],"); | 881 | seq_puts(m, "[permanent],"); |
| 890 | } | 882 | } |
| 891 | 883 | ||
| 892 | if (!printed_something) | 884 | if (!printed_something) |
| 893 | seq_printf(m, "-"); | 885 | seq_puts(m, "-"); |
| 894 | } | 886 | } |
| 895 | 887 | ||
| 896 | void __symbol_put(const char *symbol) | 888 | void __symbol_put(const char *symbol) |
| @@ -925,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); | |||
| 925 | static ssize_t show_refcnt(struct module_attribute *mattr, | 917 | static ssize_t show_refcnt(struct module_attribute *mattr, |
| 926 | struct module_kobject *mk, char *buffer) | 918 | struct module_kobject *mk, char *buffer) |
| 927 | { | 919 | { |
| 928 | return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); | 920 | return sprintf(buffer, "%i\n", module_refcount(mk->mod)); |
| 929 | } | 921 | } |
| 930 | 922 | ||
| 931 | static struct module_attribute modinfo_refcnt = | 923 | static struct module_attribute modinfo_refcnt = |
| @@ -935,7 +927,7 @@ void __module_get(struct module *module) | |||
| 935 | { | 927 | { |
| 936 | if (module) { | 928 | if (module) { |
| 937 | preempt_disable(); | 929 | preempt_disable(); |
| 938 | __this_cpu_inc(module->refptr->incs); | 930 | atomic_inc(&module->refcnt); |
| 939 | trace_module_get(module, _RET_IP_); | 931 | trace_module_get(module, _RET_IP_); |
| 940 | preempt_enable(); | 932 | preempt_enable(); |
| 941 | } | 933 | } |
| @@ -948,11 +940,11 @@ bool try_module_get(struct module *module) | |||
| 948 | 940 | ||
| 949 | if (module) { | 941 | if (module) { |
| 950 | preempt_disable(); | 942 | preempt_disable(); |
| 951 | 943 | /* Note: here, we can fail to get a reference */ | |
| 952 | if (likely(module_is_live(module))) { | 944 | if (likely(module_is_live(module) && |
| 953 | __this_cpu_inc(module->refptr->incs); | 945 | atomic_inc_not_zero(&module->refcnt) != 0)) |
| 954 | trace_module_get(module, _RET_IP_); | 946 | trace_module_get(module, _RET_IP_); |
| 955 | } else | 947 | else |
| 956 | ret = false; | 948 | ret = false; |
| 957 | 949 | ||
| 958 | preempt_enable(); | 950 | preempt_enable(); |
| @@ -963,11 +955,12 @@ EXPORT_SYMBOL(try_module_get); | |||
| 963 | 955 | ||
| 964 | void module_put(struct module *module) | 956 | void module_put(struct module *module) |
| 965 | { | 957 | { |
| 958 | int ret; | ||
| 959 | |||
| 966 | if (module) { | 960 | if (module) { |
| 967 | preempt_disable(); | 961 | preempt_disable(); |
| 968 | smp_wmb(); /* see comment in module_refcount */ | 962 | ret = atomic_dec_if_positive(&module->refcnt); |
| 969 | __this_cpu_inc(module->refptr->decs); | 963 | WARN_ON(ret < 0); /* Failed to put refcount */ |
| 970 | |||
| 971 | trace_module_put(module, _RET_IP_); | 964 | trace_module_put(module, _RET_IP_); |
| 972 | preempt_enable(); | 965 | preempt_enable(); |
| 973 | } | 966 | } |
| @@ -978,7 +971,7 @@ EXPORT_SYMBOL(module_put); | |||
| 978 | static inline void print_unload_info(struct seq_file *m, struct module *mod) | 971 | static inline void print_unload_info(struct seq_file *m, struct module *mod) |
| 979 | { | 972 | { |
| 980 | /* We don't know the usage count, or what modules are using. */ | 973 | /* We don't know the usage count, or what modules are using. */ |
| 981 | seq_printf(m, " - -"); | 974 | seq_puts(m, " - -"); |
| 982 | } | 975 | } |
| 983 | 976 | ||
| 984 | static inline void module_unload_free(struct module *mod) | 977 | static inline void module_unload_free(struct module *mod) |
| @@ -1131,7 +1124,7 @@ static unsigned long maybe_relocated(unsigned long crc, | |||
| 1131 | static int check_version(Elf_Shdr *sechdrs, | 1124 | static int check_version(Elf_Shdr *sechdrs, |
| 1132 | unsigned int versindex, | 1125 | unsigned int versindex, |
| 1133 | const char *symname, | 1126 | const char *symname, |
| 1134 | struct module *mod, | 1127 | struct module *mod, |
| 1135 | const unsigned long *crc, | 1128 | const unsigned long *crc, |
| 1136 | const struct module *crc_owner) | 1129 | const struct module *crc_owner) |
| 1137 | { | 1130 | { |
| @@ -1165,7 +1158,7 @@ static int check_version(Elf_Shdr *sechdrs, | |||
| 1165 | return 0; | 1158 | return 0; |
| 1166 | 1159 | ||
| 1167 | bad_version: | 1160 | bad_version: |
| 1168 | printk("%s: disagrees about version of symbol %s\n", | 1161 | pr_warn("%s: disagrees about version of symbol %s\n", |
| 1169 | mod->name, symname); | 1162 | mod->name, symname); |
| 1170 | return 0; | 1163 | return 0; |
| 1171 | } | 1164 | } |
| @@ -1200,7 +1193,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, | |||
| 1200 | static inline int check_version(Elf_Shdr *sechdrs, | 1193 | static inline int check_version(Elf_Shdr *sechdrs, |
| 1201 | unsigned int versindex, | 1194 | unsigned int versindex, |
| 1202 | const char *symname, | 1195 | const char *symname, |
| 1203 | struct module *mod, | 1196 | struct module *mod, |
| 1204 | const unsigned long *crc, | 1197 | const unsigned long *crc, |
| 1205 | const struct module *crc_owner) | 1198 | const struct module *crc_owner) |
| 1206 | { | 1199 | { |
| @@ -1288,15 +1281,13 @@ static inline bool sect_empty(const Elf_Shdr *sect) | |||
| 1288 | return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; | 1281 | return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; |
| 1289 | } | 1282 | } |
| 1290 | 1283 | ||
| 1291 | struct module_sect_attr | 1284 | struct module_sect_attr { |
| 1292 | { | ||
| 1293 | struct module_attribute mattr; | 1285 | struct module_attribute mattr; |
| 1294 | char *name; | 1286 | char *name; |
| 1295 | unsigned long address; | 1287 | unsigned long address; |
| 1296 | }; | 1288 | }; |
| 1297 | 1289 | ||
| 1298 | struct module_sect_attrs | 1290 | struct module_sect_attrs { |
| 1299 | { | ||
| 1300 | struct attribute_group grp; | 1291 | struct attribute_group grp; |
| 1301 | unsigned int nsections; | 1292 | unsigned int nsections; |
| 1302 | struct module_sect_attr attrs[0]; | 1293 | struct module_sect_attr attrs[0]; |
| @@ -1550,7 +1541,8 @@ static int module_add_modinfo_attrs(struct module *mod) | |||
| 1550 | (attr->test && attr->test(mod))) { | 1541 | (attr->test && attr->test(mod))) { |
| 1551 | memcpy(temp_attr, attr, sizeof(*temp_attr)); | 1542 | memcpy(temp_attr, attr, sizeof(*temp_attr)); |
| 1552 | sysfs_attr_init(&temp_attr->attr); | 1543 | sysfs_attr_init(&temp_attr->attr); |
| 1553 | error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); | 1544 | error = sysfs_create_file(&mod->mkobj.kobj, |
| 1545 | &temp_attr->attr); | ||
| 1554 | ++temp_attr; | 1546 | ++temp_attr; |
| 1555 | } | 1547 | } |
| 1556 | } | 1548 | } |
| @@ -1566,7 +1558,7 @@ static void module_remove_modinfo_attrs(struct module *mod) | |||
| 1566 | /* pick a field to test for end of list */ | 1558 | /* pick a field to test for end of list */ |
| 1567 | if (!attr->attr.name) | 1559 | if (!attr->attr.name) |
| 1568 | break; | 1560 | break; |
| 1569 | sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); | 1561 | sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); |
| 1570 | if (attr->free) | 1562 | if (attr->free) |
| 1571 | attr->free(mod); | 1563 | attr->free(mod); |
| 1572 | } | 1564 | } |
| @@ -1697,18 +1689,6 @@ static void mod_sysfs_teardown(struct module *mod) | |||
| 1697 | mod_sysfs_fini(mod); | 1689 | mod_sysfs_fini(mod); |
| 1698 | } | 1690 | } |
| 1699 | 1691 | ||
| 1700 | /* | ||
| 1701 | * unlink the module with the whole machine is stopped with interrupts off | ||
| 1702 | * - this defends against kallsyms not taking locks | ||
| 1703 | */ | ||
| 1704 | static int __unlink_module(void *_mod) | ||
| 1705 | { | ||
| 1706 | struct module *mod = _mod; | ||
| 1707 | list_del(&mod->list); | ||
| 1708 | module_bug_cleanup(mod); | ||
| 1709 | return 0; | ||
| 1710 | } | ||
| 1711 | |||
| 1712 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX | 1692 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX |
| 1713 | /* | 1693 | /* |
| 1714 | * LKM RO/NX protection: protect module's text/ro-data | 1694 | * LKM RO/NX protection: protect module's text/ro-data |
| @@ -1824,7 +1804,7 @@ static void unset_module_core_ro_nx(struct module *mod) { } | |||
| 1824 | static void unset_module_init_ro_nx(struct module *mod) { } | 1804 | static void unset_module_init_ro_nx(struct module *mod) { } |
| 1825 | #endif | 1805 | #endif |
| 1826 | 1806 | ||
| 1827 | void __weak module_free(struct module *mod, void *module_region) | 1807 | void __weak module_memfree(void *module_region) |
| 1828 | { | 1808 | { |
| 1829 | vfree(module_region); | 1809 | vfree(module_region); |
| 1830 | } | 1810 | } |
| @@ -1833,6 +1813,10 @@ void __weak module_arch_cleanup(struct module *mod) | |||
| 1833 | { | 1813 | { |
| 1834 | } | 1814 | } |
| 1835 | 1815 | ||
| 1816 | void __weak module_arch_freeing_init(struct module *mod) | ||
| 1817 | { | ||
| 1818 | } | ||
| 1819 | |||
| 1836 | /* Free a module, remove from lists, etc. */ | 1820 | /* Free a module, remove from lists, etc. */ |
| 1837 | static void free_module(struct module *mod) | 1821 | static void free_module(struct module *mod) |
| 1838 | { | 1822 | { |
| @@ -1860,12 +1844,18 @@ static void free_module(struct module *mod) | |||
| 1860 | 1844 | ||
| 1861 | /* Now we can delete it from the lists */ | 1845 | /* Now we can delete it from the lists */ |
| 1862 | mutex_lock(&module_mutex); | 1846 | mutex_lock(&module_mutex); |
| 1863 | stop_machine(__unlink_module, mod, NULL); | 1847 | /* Unlink carefully: kallsyms could be walking list. */ |
| 1848 | list_del_rcu(&mod->list); | ||
| 1849 | /* Remove this module from bug list, this uses list_del_rcu */ | ||
| 1850 | module_bug_cleanup(mod); | ||
| 1851 | /* Wait for RCU synchronizing before releasing mod->list and buglist. */ | ||
| 1852 | synchronize_rcu(); | ||
| 1864 | mutex_unlock(&module_mutex); | 1853 | mutex_unlock(&module_mutex); |
| 1865 | 1854 | ||
| 1866 | /* This may be NULL, but that's OK */ | 1855 | /* This may be NULL, but that's OK */ |
| 1867 | unset_module_init_ro_nx(mod); | 1856 | unset_module_init_ro_nx(mod); |
| 1868 | module_free(mod, mod->module_init); | 1857 | module_arch_freeing_init(mod); |
| 1858 | module_memfree(mod->module_init); | ||
| 1869 | kfree(mod->args); | 1859 | kfree(mod->args); |
| 1870 | percpu_modfree(mod); | 1860 | percpu_modfree(mod); |
| 1871 | 1861 | ||
| @@ -1874,7 +1864,7 @@ static void free_module(struct module *mod) | |||
| 1874 | 1864 | ||
| 1875 | /* Finally, free the core (containing the module structure) */ | 1865 | /* Finally, free the core (containing the module structure) */ |
| 1876 | unset_module_core_ro_nx(mod); | 1866 | unset_module_core_ro_nx(mod); |
| 1877 | module_free(mod, mod->module_core); | 1867 | module_memfree(mod->module_core); |
| 1878 | 1868 | ||
| 1879 | #ifdef CONFIG_MPU | 1869 | #ifdef CONFIG_MPU |
| 1880 | update_protections(current->mm); | 1870 | update_protections(current->mm); |
| @@ -1955,7 +1945,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) | |||
| 1955 | /* We compiled with -fno-common. These are not | 1945 | /* We compiled with -fno-common. These are not |
| 1956 | supposed to happen. */ | 1946 | supposed to happen. */ |
| 1957 | pr_debug("Common symbol: %s\n", name); | 1947 | pr_debug("Common symbol: %s\n", name); |
| 1958 | printk("%s: please compile with -fno-common\n", | 1948 | pr_warn("%s: please compile with -fno-common\n", |
| 1959 | mod->name); | 1949 | mod->name); |
| 1960 | ret = -ENOEXEC; | 1950 | ret = -ENOEXEC; |
| 1961 | break; | 1951 | break; |
| @@ -2259,7 +2249,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) | |||
| 2259 | } | 2249 | } |
| 2260 | 2250 | ||
| 2261 | static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, | 2251 | static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, |
| 2262 | unsigned int shnum) | 2252 | unsigned int shnum) |
| 2263 | { | 2253 | { |
| 2264 | const Elf_Shdr *sec; | 2254 | const Elf_Shdr *sec; |
| 2265 | 2255 | ||
| @@ -2735,7 +2725,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) | |||
| 2735 | * This shouldn't happen with same compiler and binutils | 2725 | * This shouldn't happen with same compiler and binutils |
| 2736 | * building all parts of the module. | 2726 | * building all parts of the module. |
| 2737 | */ | 2727 | */ |
| 2738 | printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", | 2728 | pr_warn("%s: has both .ctors and .init_array.\n", |
| 2739 | mod->name); | 2729 | mod->name); |
| 2740 | return -EINVAL; | 2730 | return -EINVAL; |
| 2741 | } | 2731 | } |
| @@ -2809,7 +2799,7 @@ static int move_module(struct module *mod, struct load_info *info) | |||
| 2809 | */ | 2799 | */ |
| 2810 | kmemleak_ignore(ptr); | 2800 | kmemleak_ignore(ptr); |
| 2811 | if (!ptr) { | 2801 | if (!ptr) { |
| 2812 | module_free(mod, mod->module_core); | 2802 | module_memfree(mod->module_core); |
| 2813 | return -ENOMEM; | 2803 | return -ENOMEM; |
| 2814 | } | 2804 | } |
| 2815 | memset(ptr, 0, mod->init_size); | 2805 | memset(ptr, 0, mod->init_size); |
| @@ -2954,8 +2944,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) | |||
| 2954 | static void module_deallocate(struct module *mod, struct load_info *info) | 2944 | static void module_deallocate(struct module *mod, struct load_info *info) |
| 2955 | { | 2945 | { |
| 2956 | percpu_modfree(mod); | 2946 | percpu_modfree(mod); |
| 2957 | module_free(mod, mod->module_init); | 2947 | module_arch_freeing_init(mod); |
| 2958 | module_free(mod, mod->module_core); | 2948 | module_memfree(mod->module_init); |
| 2949 | module_memfree(mod->module_core); | ||
| 2959 | } | 2950 | } |
| 2960 | 2951 | ||
| 2961 | int __weak module_finalize(const Elf_Ehdr *hdr, | 2952 | int __weak module_finalize(const Elf_Ehdr *hdr, |
| @@ -3007,10 +2998,31 @@ static void do_mod_ctors(struct module *mod) | |||
| 3007 | #endif | 2998 | #endif |
| 3008 | } | 2999 | } |
| 3009 | 3000 | ||
| 3001 | /* For freeing module_init on success, in case kallsyms traversing */ | ||
| 3002 | struct mod_initfree { | ||
| 3003 | struct rcu_head rcu; | ||
| 3004 | void *module_init; | ||
| 3005 | }; | ||
| 3006 | |||
| 3007 | static void do_free_init(struct rcu_head *head) | ||
| 3008 | { | ||
| 3009 | struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); | ||
| 3010 | module_memfree(m->module_init); | ||
| 3011 | kfree(m); | ||
| 3012 | } | ||
| 3013 | |||
| 3010 | /* This is where the real work happens */ | 3014 | /* This is where the real work happens */ |
| 3011 | static int do_init_module(struct module *mod) | 3015 | static int do_init_module(struct module *mod) |
| 3012 | { | 3016 | { |
| 3013 | int ret = 0; | 3017 | int ret = 0; |
| 3018 | struct mod_initfree *freeinit; | ||
| 3019 | |||
| 3020 | freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL); | ||
| 3021 | if (!freeinit) { | ||
| 3022 | ret = -ENOMEM; | ||
| 3023 | goto fail; | ||
| 3024 | } | ||
| 3025 | freeinit->module_init = mod->module_init; | ||
| 3014 | 3026 | ||
| 3015 | /* | 3027 | /* |
| 3016 | * We want to find out whether @mod uses async during init. Clear | 3028 | * We want to find out whether @mod uses async during init. Clear |
| @@ -3023,16 +3035,7 @@ static int do_init_module(struct module *mod) | |||
| 3023 | if (mod->init != NULL) | 3035 | if (mod->init != NULL) |
| 3024 | ret = do_one_initcall(mod->init); | 3036 | ret = do_one_initcall(mod->init); |
| 3025 | if (ret < 0) { | 3037 | if (ret < 0) { |
| 3026 | /* Init routine failed: abort. Try to protect us from | 3038 | goto fail_free_freeinit; |
| 3027 | buggy refcounters. */ | ||
| 3028 | mod->state = MODULE_STATE_GOING; | ||
| 3029 | synchronize_sched(); | ||
| 3030 | module_put(mod); | ||
| 3031 | blocking_notifier_call_chain(&module_notify_list, | ||
| 3032 | MODULE_STATE_GOING, mod); | ||
| 3033 | free_module(mod); | ||
| 3034 | wake_up_all(&module_wq); | ||
| 3035 | return ret; | ||
| 3036 | } | 3039 | } |
| 3037 | if (ret > 0) { | 3040 | if (ret > 0) { |
| 3038 | pr_warn("%s: '%s'->init suspiciously returned %d, it should " | 3041 | pr_warn("%s: '%s'->init suspiciously returned %d, it should " |
| @@ -3077,15 +3080,35 @@ static int do_init_module(struct module *mod) | |||
| 3077 | mod->strtab = mod->core_strtab; | 3080 | mod->strtab = mod->core_strtab; |
| 3078 | #endif | 3081 | #endif |
| 3079 | unset_module_init_ro_nx(mod); | 3082 | unset_module_init_ro_nx(mod); |
| 3080 | module_free(mod, mod->module_init); | 3083 | module_arch_freeing_init(mod); |
| 3081 | mod->module_init = NULL; | 3084 | mod->module_init = NULL; |
| 3082 | mod->init_size = 0; | 3085 | mod->init_size = 0; |
| 3083 | mod->init_ro_size = 0; | 3086 | mod->init_ro_size = 0; |
| 3084 | mod->init_text_size = 0; | 3087 | mod->init_text_size = 0; |
| 3088 | /* | ||
| 3089 | * We want to free module_init, but be aware that kallsyms may be | ||
| 3090 | * walking this with preempt disabled. In all the failure paths, | ||
| 3091 | * we call synchronize_rcu/synchronize_sched, but we don't want | ||
| 3092 | * to slow down the success path, so use actual RCU here. | ||
| 3093 | */ | ||
| 3094 | call_rcu(&freeinit->rcu, do_free_init); | ||
| 3085 | mutex_unlock(&module_mutex); | 3095 | mutex_unlock(&module_mutex); |
| 3086 | wake_up_all(&module_wq); | 3096 | wake_up_all(&module_wq); |
| 3087 | 3097 | ||
| 3088 | return 0; | 3098 | return 0; |
| 3099 | |||
| 3100 | fail_free_freeinit: | ||
| 3101 | kfree(freeinit); | ||
| 3102 | fail: | ||
| 3103 | /* Try to protect us from buggy refcounters. */ | ||
| 3104 | mod->state = MODULE_STATE_GOING; | ||
| 3105 | synchronize_sched(); | ||
| 3106 | module_put(mod); | ||
| 3107 | blocking_notifier_call_chain(&module_notify_list, | ||
| 3108 | MODULE_STATE_GOING, mod); | ||
| 3109 | free_module(mod); | ||
| 3110 | wake_up_all(&module_wq); | ||
| 3111 | return ret; | ||
| 3089 | } | 3112 | } |
| 3090 | 3113 | ||
| 3091 | static int may_init_module(void) | 3114 | static int may_init_module(void) |
| @@ -3097,6 +3120,32 @@ static int may_init_module(void) | |||
| 3097 | } | 3120 | } |
| 3098 | 3121 | ||
| 3099 | /* | 3122 | /* |
| 3123 | * Can't use wait_event_interruptible() because our condition | ||
| 3124 | * 'finished_loading()' contains a blocking primitive itself (mutex_lock). | ||
| 3125 | */ | ||
| 3126 | static int wait_finished_loading(struct module *mod) | ||
| 3127 | { | ||
| 3128 | DEFINE_WAIT_FUNC(wait, woken_wake_function); | ||
| 3129 | int ret = 0; | ||
| 3130 | |||
| 3131 | add_wait_queue(&module_wq, &wait); | ||
| 3132 | for (;;) { | ||
| 3133 | if (finished_loading(mod->name)) | ||
| 3134 | break; | ||
| 3135 | |||
| 3136 | if (signal_pending(current)) { | ||
| 3137 | ret = -ERESTARTSYS; | ||
| 3138 | break; | ||
| 3139 | } | ||
| 3140 | |||
| 3141 | wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
| 3142 | } | ||
| 3143 | remove_wait_queue(&module_wq, &wait); | ||
| 3144 | |||
| 3145 | return ret; | ||
| 3146 | } | ||
| 3147 | |||
| 3148 | /* | ||
| 3100 | * We try to place it in the list now to make sure it's unique before | 3149 | * We try to place it in the list now to make sure it's unique before |
| 3101 | * we dedicate too many resources. In particular, temporary percpu | 3150 | * we dedicate too many resources. In particular, temporary percpu |
| 3102 | * memory exhaustion. | 3151 | * memory exhaustion. |
| @@ -3116,8 +3165,8 @@ again: | |||
| 3116 | || old->state == MODULE_STATE_UNFORMED) { | 3165 | || old->state == MODULE_STATE_UNFORMED) { |
| 3117 | /* Wait in case it fails to load. */ | 3166 | /* Wait in case it fails to load. */ |
| 3118 | mutex_unlock(&module_mutex); | 3167 | mutex_unlock(&module_mutex); |
| 3119 | err = wait_event_interruptible(module_wq, | 3168 | |
| 3120 | finished_loading(mod->name)); | 3169 | err = wait_finished_loading(mod); |
| 3121 | if (err) | 3170 | if (err) |
| 3122 | goto out_unlocked; | 3171 | goto out_unlocked; |
| 3123 | goto again; | 3172 | goto again; |
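wait_event_interruptible() cannot be used here because the condition, finished_loading(), takes module_mutex and may sleep; the new wait_finished_loading() open-codes the loop with DEFINE_WAIT_FUNC()/wait_woken(). A hedged, generic sketch of that idiom (the condition callback is a placeholder):

	#include <linux/sched.h>
	#include <linux/wait.h>

	static int wait_for_blocking_condition(wait_queue_head_t *wq,
					       bool (*cond)(void *), void *arg)
	{
		DEFINE_WAIT_FUNC(wait, woken_wake_function);
		int ret = 0;

		add_wait_queue(wq, &wait);
		for (;;) {
			if (cond(arg))		/* may take a mutex: that is the point */
				break;
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			/* Sleeps until a waker runs woken_wake_function on us. */
			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
		}
		remove_wait_queue(wq, &wait);

		return ret;
	}
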
| @@ -3176,7 +3225,7 @@ out: | |||
| 3176 | 3225 | ||
| 3177 | static int unknown_module_param_cb(char *param, char *val, const char *modname) | 3226 | static int unknown_module_param_cb(char *param, char *val, const char *modname) |
| 3178 | { | 3227 | { |
| 3179 | /* Check for magic 'dyndbg' arg */ | 3228 | /* Check for magic 'dyndbg' arg */ |
| 3180 | int ret = ddebug_dyndbg_module_param_cb(param, val, modname); | 3229 | int ret = ddebug_dyndbg_module_param_cb(param, val, modname); |
| 3181 | if (ret != 0) | 3230 | if (ret != 0) |
| 3182 | pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); | 3231 | pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); |
| @@ -3326,6 +3375,8 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3326 | /* Unlink carefully: kallsyms could be walking list. */ | 3375 | /* Unlink carefully: kallsyms could be walking list. */ |
| 3327 | list_del_rcu(&mod->list); | 3376 | list_del_rcu(&mod->list); |
| 3328 | wake_up_all(&module_wq); | 3377 | wake_up_all(&module_wq); |
| 3378 | /* Wait for RCU synchronizing before releasing mod->list. */ | ||
| 3379 | synchronize_rcu(); | ||
| 3329 | mutex_unlock(&module_mutex); | 3380 | mutex_unlock(&module_mutex); |
| 3330 | free_module: | 3381 | free_module: |
| 3331 | module_deallocate(mod, info); | 3382 | module_deallocate(mod, info); |
| @@ -3659,8 +3710,8 @@ static int m_show(struct seq_file *m, void *p) | |||
| 3659 | 3710 | ||
| 3660 | /* Informative for users. */ | 3711 | /* Informative for users. */ |
| 3661 | seq_printf(m, " %s", | 3712 | seq_printf(m, " %s", |
| 3662 | mod->state == MODULE_STATE_GOING ? "Unloading": | 3713 | mod->state == MODULE_STATE_GOING ? "Unloading" : |
| 3663 | mod->state == MODULE_STATE_COMING ? "Loading": | 3714 | mod->state == MODULE_STATE_COMING ? "Loading" : |
| 3664 | "Live"); | 3715 | "Live"); |
| 3665 | /* Used by oprofile and other similar tools. */ | 3716 | /* Used by oprofile and other similar tools. */ |
| 3666 | seq_printf(m, " 0x%pK", mod->module_core); | 3717 | seq_printf(m, " 0x%pK", mod->module_core); |
| @@ -3669,7 +3720,7 @@ static int m_show(struct seq_file *m, void *p) | |||
| 3669 | if (mod->taints) | 3720 | if (mod->taints) |
| 3670 | seq_printf(m, " %s", module_flags(mod, buf)); | 3721 | seq_printf(m, " %s", module_flags(mod, buf)); |
| 3671 | 3722 | ||
| 3672 | seq_printf(m, "\n"); | 3723 | seq_puts(m, "\n"); |
| 3673 | return 0; | 3724 | return 0; |
| 3674 | } | 3725 | } |
| 3675 | 3726 | ||
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ef42d0ab3115..49746c81ad8d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p) | |||
| 220 | 220 | ||
| 221 | SYSCALL_DEFINE2(setns, int, fd, int, nstype) | 221 | SYSCALL_DEFINE2(setns, int, fd, int, nstype) |
| 222 | { | 222 | { |
| 223 | const struct proc_ns_operations *ops; | ||
| 224 | struct task_struct *tsk = current; | 223 | struct task_struct *tsk = current; |
| 225 | struct nsproxy *new_nsproxy; | 224 | struct nsproxy *new_nsproxy; |
| 226 | struct proc_ns *ei; | ||
| 227 | struct file *file; | 225 | struct file *file; |
| 226 | struct ns_common *ns; | ||
| 228 | int err; | 227 | int err; |
| 229 | 228 | ||
| 230 | file = proc_ns_fget(fd); | 229 | file = proc_ns_fget(fd); |
| @@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
| 232 | return PTR_ERR(file); | 231 | return PTR_ERR(file); |
| 233 | 232 | ||
| 234 | err = -EINVAL; | 233 | err = -EINVAL; |
| 235 | ei = get_proc_ns(file_inode(file)); | 234 | ns = get_proc_ns(file_inode(file)); |
| 236 | ops = ei->ns_ops; | 235 | if (nstype && (ns->ops->type != nstype)) |
| 237 | if (nstype && (ops->type != nstype)) | ||
| 238 | goto out; | 236 | goto out; |
| 239 | 237 | ||
| 240 | new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); | 238 | new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); |
| @@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
| 243 | goto out; | 241 | goto out; |
| 244 | } | 242 | } |
| 245 | 243 | ||
| 246 | err = ops->install(new_nsproxy, ei->ns); | 244 | err = ns->ops->install(new_nsproxy, ns); |
| 247 | if (err) { | 245 | if (err) { |
| 248 | free_nsproxy(new_nsproxy); | 246 | free_nsproxy(new_nsproxy); |
| 249 | goto out; | 247 | goto out; |
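On the kernel side setns() now pulls a struct ns_common straight from the proc inode and dispatches through ns->ops; the userspace contract is unchanged. For reference, a minimal caller (hypothetical PID and path) looks like:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sched.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Hypothetical target; any /proc/<pid>/ns/* file works the same way. */
		int fd = open("/proc/1234/ns/pid", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Passing CLONE_NEWPID makes the kernel verify the fd is a pid ns. */
		if (setns(fd, CLONE_NEWPID) < 0) {
			perror("setns");
			return 1;
		}
		close(fd);
		/* Children forked from here on are created in the joined namespace. */
		return 0;
	}
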
diff --git a/kernel/panic.c b/kernel/panic.c index cf80672b7924..4d8d6f906dec 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -33,6 +33,7 @@ static int pause_on_oops; | |||
| 33 | static int pause_on_oops_flag; | 33 | static int pause_on_oops_flag; |
| 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); | 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); |
| 35 | static bool crash_kexec_post_notifiers; | 35 | static bool crash_kexec_post_notifiers; |
| 36 | int panic_on_warn __read_mostly; | ||
| 36 | 37 | ||
| 37 | int panic_timeout = CONFIG_PANIC_TIMEOUT; | 38 | int panic_timeout = CONFIG_PANIC_TIMEOUT; |
| 38 | EXPORT_SYMBOL_GPL(panic_timeout); | 39 | EXPORT_SYMBOL_GPL(panic_timeout); |
| @@ -428,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller, | |||
| 428 | if (args) | 429 | if (args) |
| 429 | vprintk(args->fmt, args->args); | 430 | vprintk(args->fmt, args->args); |
| 430 | 431 | ||
| 432 | if (panic_on_warn) { | ||
| 433 | /* | ||
| 434 | * This thread may hit another WARN() in the panic path. | ||
| 435 | * Resetting this prevents additional WARN() from panicking the | ||
| 436 | * system on this thread. Other threads are blocked by the | ||
| 437 | * panic_mutex in panic(). | ||
| 438 | */ | ||
| 439 | panic_on_warn = 0; | ||
| 440 | panic("panic_on_warn set ...\n"); | ||
| 441 | } | ||
| 442 | |||
| 431 | print_modules(); | 443 | print_modules(); |
| 432 | dump_stack(); | 444 | dump_stack(); |
| 433 | print_oops_end_marker(); | 445 | print_oops_end_marker(); |
| @@ -485,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail); | |||
| 485 | 497 | ||
| 486 | core_param(panic, panic_timeout, int, 0644); | 498 | core_param(panic, panic_timeout, int, 0644); |
| 487 | core_param(pause_on_oops, pause_on_oops, int, 0644); | 499 | core_param(pause_on_oops, pause_on_oops, int, 0644); |
| 500 | core_param(panic_on_warn, panic_on_warn, int, 0644); | ||
| 488 | 501 | ||
| 489 | static int __init setup_crash_kexec_post_notifiers(char *s) | 502 | static int __init setup_crash_kexec_post_notifiers(char *s) |
| 490 | { | 503 | { |
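With the new panic_on_warn knob set (the panic_on_warn boot parameter, or — assuming the usual core_param() sysfs location — /sys/module/kernel/parameters/panic_on_warn), the very first WARN() escalates to panic(); the flag is cleared beforehand so a WARN() hit inside the panic path cannot recurse. A throwaway test module to exercise the path might look like:

	#include <linux/kernel.h>
	#include <linux/module.h>

	static int __init warn_test_init(void)
	{
		/* With panic_on_warn set, this single WARN() brings the box down. */
		WARN(1, "panic_on_warn smoke test\n");
		return 0;
	}

	static void __exit warn_test_exit(void)
	{
	}

	module_init(warn_test_init);
	module_exit(warn_test_exit);
	MODULE_LICENSE("GPL");
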
diff --git a/kernel/params.c b/kernel/params.c index db97b791390f..728e05b167de 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -603,74 +603,70 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, | |||
| 603 | const struct kernel_param *kp, | 603 | const struct kernel_param *kp, |
| 604 | const char *name) | 604 | const char *name) |
| 605 | { | 605 | { |
| 606 | struct module_param_attrs *new; | 606 | struct module_param_attrs *new_mp; |
| 607 | struct attribute **attrs; | 607 | struct attribute **new_attrs; |
| 608 | int err, num; | 608 | unsigned int i; |
| 609 | 609 | ||
| 610 | /* We don't bother calling this with invisible parameters. */ | 610 | /* We don't bother calling this with invisible parameters. */ |
| 611 | BUG_ON(!kp->perm); | 611 | BUG_ON(!kp->perm); |
| 612 | 612 | ||
| 613 | if (!mk->mp) { | 613 | if (!mk->mp) { |
| 614 | num = 0; | 614 | /* First allocation. */ |
| 615 | attrs = NULL; | 615 | mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL); |
| 616 | } else { | 616 | if (!mk->mp) |
| 617 | num = mk->mp->num; | 617 | return -ENOMEM; |
| 618 | attrs = mk->mp->grp.attrs; | 618 | mk->mp->grp.name = "parameters"; |
| 619 | /* NULL-terminated attribute array. */ | ||
| 620 | mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]), | ||
| 621 | GFP_KERNEL); | ||
| 622 | /* Caller will cleanup via free_module_param_attrs */ | ||
| 623 | if (!mk->mp->grp.attrs) | ||
| 624 | return -ENOMEM; | ||
| 619 | } | 625 | } |
| 620 | 626 | ||
| 621 | /* Enlarge. */ | 627 | /* Enlarge allocations. */ |
| 622 | new = krealloc(mk->mp, | 628 | new_mp = krealloc(mk->mp, |
| 623 | sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), | 629 | sizeof(*mk->mp) + |
| 624 | GFP_KERNEL); | 630 | sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), |
| 625 | if (!new) { | 631 | GFP_KERNEL); |
| 626 | kfree(attrs); | 632 | if (!new_mp) |
| 627 | err = -ENOMEM; | 633 | return -ENOMEM; |
| 628 | goto fail; | 634 | mk->mp = new_mp; |
| 629 | } | ||
| 630 | /* Despite looking like the typical realloc() bug, this is safe. | ||
| 631 | * We *want* the old 'attrs' to be freed either way, and we'll store | ||
| 632 | * the new one in the success case. */ | ||
| 633 | attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); | ||
| 634 | if (!attrs) { | ||
| 635 | err = -ENOMEM; | ||
| 636 | goto fail_free_new; | ||
| 637 | } | ||
| 638 | 635 | ||
| 639 | /* Sysfs wants everything zeroed. */ | 636 | /* Extra pointer for NULL terminator */ |
| 640 | memset(new, 0, sizeof(*new)); | 637 | new_attrs = krealloc(mk->mp->grp.attrs, |
| 641 | memset(&new->attrs[num], 0, sizeof(new->attrs[num])); | 638 | sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), |
| 642 | memset(&attrs[num], 0, sizeof(attrs[num])); | 639 | GFP_KERNEL); |
| 643 | new->grp.name = "parameters"; | 640 | if (!new_attrs) |
| 644 | new->grp.attrs = attrs; | 641 | return -ENOMEM; |
| 642 | mk->mp->grp.attrs = new_attrs; | ||
| 645 | 643 | ||
| 646 | /* Tack new one on the end. */ | 644 | /* Tack new one on the end. */ |
| 647 | sysfs_attr_init(&new->attrs[num].mattr.attr); | 645 | memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); |
| 648 | new->attrs[num].param = kp; | 646 | sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); |
| 649 | new->attrs[num].mattr.show = param_attr_show; | 647 | mk->mp->attrs[mk->mp->num].param = kp; |
| 650 | new->attrs[num].mattr.store = param_attr_store; | 648 | mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; |
| 651 | new->attrs[num].mattr.attr.name = (char *)name; | 649 | /* Do not allow runtime DAC changes to make param writable. */ |
| 652 | new->attrs[num].mattr.attr.mode = kp->perm; | 650 | if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) |
| 653 | new->num = num+1; | 651 | mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; |
| 652 | else | ||
| 653 | mk->mp->attrs[mk->mp->num].mattr.store = NULL; | ||
| 654 | mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; | ||
| 655 | mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; | ||
| 656 | mk->mp->num++; | ||
| 654 | 657 | ||
| 655 | /* Fix up all the pointers, since krealloc can move us */ | 658 | /* Fix up all the pointers, since krealloc can move us */ |
| 656 | for (num = 0; num < new->num; num++) | 659 | for (i = 0; i < mk->mp->num; i++) |
| 657 | new->grp.attrs[num] = &new->attrs[num].mattr.attr; | 660 | mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr; |
| 658 | new->grp.attrs[num] = NULL; | 661 | mk->mp->grp.attrs[mk->mp->num] = NULL; |
| 659 | |||
| 660 | mk->mp = new; | ||
| 661 | return 0; | 662 | return 0; |
| 662 | |||
| 663 | fail_free_new: | ||
| 664 | kfree(new); | ||
| 665 | fail: | ||
| 666 | mk->mp = NULL; | ||
| 667 | return err; | ||
| 668 | } | 663 | } |
| 669 | 664 | ||
| 670 | #ifdef CONFIG_MODULES | 665 | #ifdef CONFIG_MODULES |
| 671 | static void free_module_param_attrs(struct module_kobject *mk) | 666 | static void free_module_param_attrs(struct module_kobject *mk) |
| 672 | { | 667 | { |
| 673 | kfree(mk->mp->grp.attrs); | 668 | if (mk->mp) |
| 669 | kfree(mk->mp->grp.attrs); | ||
| 674 | kfree(mk->mp); | 670 | kfree(mk->mp); |
| 675 | mk->mp = NULL; | 671 | mk->mp = NULL; |
| 676 | } | 672 | } |
| @@ -695,8 +691,10 @@ int module_param_sysfs_setup(struct module *mod, | |||
| 695 | if (kparam[i].perm == 0) | 691 | if (kparam[i].perm == 0) |
| 696 | continue; | 692 | continue; |
| 697 | err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); | 693 | err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); |
| 698 | if (err) | 694 | if (err) { |
| 695 | free_module_param_attrs(&mod->mkobj); | ||
| 699 | return err; | 696 | return err; |
| 697 | } | ||
| 700 | params = true; | 698 | params = true; |
| 701 | } | 699 | } |
| 702 | 700 | ||
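add_sysfs_param() now grows both the module_param_attrs block and the NULL-terminated attribute pointer array with krealloc() and then re-links every pointer, because krealloc() may move the underlying block. A simplified sketch of that grow-and-relink idiom (plain struct attribute instead of the real module_attribute, and it assumes the container was kzalloc()ed beforehand with num == 0):

	#include <linux/slab.h>
	#include <linux/string.h>
	#include <linux/sysfs.h>

	struct param_attrs {
		struct attribute_group grp;	/* grp.attrs is NULL-terminated */
		unsigned int num;
		struct attribute attrs[];	/* flexible array, grown by krealloc() */
	};

	static int add_attr(struct param_attrs **pp, const char *name, umode_t mode)
	{
		struct param_attrs *p;
		struct attribute **vec;
		unsigned int i;

		p = krealloc(*pp, sizeof(*p) + sizeof(p->attrs[0]) * ((*pp)->num + 1),
			     GFP_KERNEL);
		if (!p)
			return -ENOMEM;
		*pp = p;

		/* One extra slot keeps the pointer array NULL-terminated. */
		vec = krealloc(p->grp.attrs,
			       sizeof(p->grp.attrs[0]) * (p->num + 2), GFP_KERNEL);
		if (!vec)
			return -ENOMEM;
		p->grp.attrs = vec;

		memset(&p->attrs[p->num], 0, sizeof(p->attrs[0]));
		sysfs_attr_init(&p->attrs[p->num]);
		p->attrs[p->num].name = name;
		p->attrs[p->num].mode = mode;
		p->num++;

		/* krealloc() may have moved attrs[]; rebuild every pointer. */
		for (i = 0; i < p->num; i++)
			p->grp.attrs[i] = &p->attrs[i];
		p->grp.attrs[p->num] = NULL;

		return 0;
	}
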
diff --git a/kernel/pid.c b/kernel/pid.c index 9b9a26698144..cd36a5e0d173 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = { | |||
| 79 | .level = 0, | 79 | .level = 0, |
| 80 | .child_reaper = &init_task, | 80 | .child_reaper = &init_task, |
| 81 | .user_ns = &init_user_ns, | 81 | .user_ns = &init_user_ns, |
| 82 | .proc_inum = PROC_PID_INIT_INO, | 82 | .ns.inum = PROC_PID_INIT_INO, |
| 83 | #ifdef CONFIG_PID_NS | ||
| 84 | .ns.ops = &pidns_operations, | ||
| 85 | #endif | ||
| 83 | }; | 86 | }; |
| 84 | EXPORT_SYMBOL_GPL(init_pid_ns); | 87 | EXPORT_SYMBOL_GPL(init_pid_ns); |
| 85 | 88 | ||
| @@ -341,6 +344,8 @@ out: | |||
| 341 | 344 | ||
| 342 | out_unlock: | 345 | out_unlock: |
| 343 | spin_unlock_irq(&pidmap_lock); | 346 | spin_unlock_irq(&pidmap_lock); |
| 347 | put_pid_ns(ns); | ||
| 348 | |||
| 344 | out_free: | 349 | out_free: |
| 345 | while (++i <= ns->level) | 350 | while (++i <= ns->level) |
| 346 | free_pidmap(pid->numbers + i); | 351 | free_pidmap(pid->numbers + i); |
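The alloc_pid() error path gains the missing put_pid_ns(), pairing the reference taken earlier so a failed allocation no longer leaks the namespace. The pattern in isolation (struct thing is hypothetical):

	#include <linux/pid_namespace.h>
	#include <linux/slab.h>

	struct thing {
		struct pid_namespace *ns;
	};

	static struct thing *make_thing(struct pid_namespace *ns)
	{
		struct thing *t;

		get_pid_ns(ns);			/* reference held for t's lifetime */
		t = kmalloc(sizeof(*t), GFP_KERNEL);
		if (!t) {
			put_pid_ns(ns);		/* undo the get on every failure path */
			return NULL;
		}
		t->ns = ns;

		return t;
	}
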
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index db95d8eb761b..a65ba137fd15 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
| @@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns | |||
| 105 | if (ns->pid_cachep == NULL) | 105 | if (ns->pid_cachep == NULL) |
| 106 | goto out_free_map; | 106 | goto out_free_map; |
| 107 | 107 | ||
| 108 | err = proc_alloc_inum(&ns->proc_inum); | 108 | err = ns_alloc_inum(&ns->ns); |
| 109 | if (err) | 109 | if (err) |
| 110 | goto out_free_map; | 110 | goto out_free_map; |
| 111 | ns->ns.ops = &pidns_operations; | ||
| 111 | 112 | ||
| 112 | kref_init(&ns->kref); | 113 | kref_init(&ns->kref); |
| 113 | ns->level = level; | 114 | ns->level = level; |
| @@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) | |||
| 142 | { | 143 | { |
| 143 | int i; | 144 | int i; |
| 144 | 145 | ||
| 145 | proc_free_inum(ns->proc_inum); | 146 | ns_free_inum(&ns->ns); |
| 146 | for (i = 0; i < PIDMAP_ENTRIES; i++) | 147 | for (i = 0; i < PIDMAP_ENTRIES; i++) |
| 147 | kfree(ns->pidmap[i].page); | 148 | kfree(ns->pidmap[i].page); |
| 148 | put_user_ns(ns->user_ns); | 149 | put_user_ns(ns->user_ns); |
| @@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
| 190 | /* Don't allow any more processes into the pid namespace */ | 191 | /* Don't allow any more processes into the pid namespace */ |
| 191 | disable_pid_allocation(pid_ns); | 192 | disable_pid_allocation(pid_ns); |
| 192 | 193 | ||
| 193 | /* Ignore SIGCHLD causing any terminated children to autoreap */ | 194 | /* |
| 195 | * Ignore SIGCHLD causing any terminated children to autoreap. | ||
| 196 | * This speeds up the namespace shutdown, plus see the comment | ||
| 197 | * below. | ||
| 198 | */ | ||
| 194 | spin_lock_irq(&me->sighand->siglock); | 199 | spin_lock_irq(&me->sighand->siglock); |
| 195 | me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; | 200 | me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; |
| 196 | spin_unlock_irq(&me->sighand->siglock); | 201 | spin_unlock_irq(&me->sighand->siglock); |
| @@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
| 223 | } | 228 | } |
| 224 | read_unlock(&tasklist_lock); | 229 | read_unlock(&tasklist_lock); |
| 225 | 230 | ||
| 226 | /* Firstly reap the EXIT_ZOMBIE children we may have. */ | 231 | /* |
| 232 | * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. | ||
| 233 | * sys_wait4() will also block until our children traced from the | ||
| 234 | * parent namespace are detached and become EXIT_DEAD. | ||
| 235 | */ | ||
| 227 | do { | 236 | do { |
| 228 | clear_thread_flag(TIF_SIGPENDING); | 237 | clear_thread_flag(TIF_SIGPENDING); |
| 229 | rc = sys_wait4(-1, NULL, __WALL, NULL); | 238 | rc = sys_wait4(-1, NULL, __WALL, NULL); |
| 230 | } while (rc != -ECHILD); | 239 | } while (rc != -ECHILD); |
| 231 | 240 | ||
| 232 | /* | 241 | /* |
| 233 | * sys_wait4() above can't reap the TASK_DEAD children. | 242 | * sys_wait4() above can't reap the EXIT_DEAD children but we do not |
| 234 | * Make sure they all go away, see free_pid(). | 243 | * really care, we could reparent them to the global init. We could |
| 244 | * exit and reap ->child_reaper even if it is not the last thread in | ||
| 245 | * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), | ||
| 246 | * pid_ns can not go away until proc_kill_sb() drops the reference. | ||
| 247 | * | ||
| 248 | * But this ns can also have other tasks injected by setns()+fork(). | ||
| 249 | * Again, ignoring the user visible semantics we do not really need | ||
| 250 | * to wait until they are all reaped, but they can be reparented to | ||
| 251 | * us and thus we need to ensure that pid->child_reaper stays valid | ||
| 252 | * until they all go away. See free_pid()->wake_up_process(). | ||
| 253 | * | ||
| 254 | * We rely on ignored SIGCHLD, an injected zombie must be autoreaped | ||
| 255 | * if reparented. | ||
| 235 | */ | 256 | */ |
| 236 | for (;;) { | 257 | for (;;) { |
| 237 | set_current_state(TASK_UNINTERRUPTIBLE); | 258 | set_current_state(TASK_UNINTERRUPTIBLE); |
| @@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | |||
| 313 | return 0; | 334 | return 0; |
| 314 | } | 335 | } |
| 315 | 336 | ||
| 316 | static void *pidns_get(struct task_struct *task) | 337 | static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) |
| 338 | { | ||
| 339 | return container_of(ns, struct pid_namespace, ns); | ||
| 340 | } | ||
| 341 | |||
| 342 | static struct ns_common *pidns_get(struct task_struct *task) | ||
| 317 | { | 343 | { |
| 318 | struct pid_namespace *ns; | 344 | struct pid_namespace *ns; |
| 319 | 345 | ||
| @@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task) | |||
| 323 | get_pid_ns(ns); | 349 | get_pid_ns(ns); |
| 324 | rcu_read_unlock(); | 350 | rcu_read_unlock(); |
| 325 | 351 | ||
| 326 | return ns; | 352 | return ns ? &ns->ns : NULL; |
| 327 | } | 353 | } |
| 328 | 354 | ||
| 329 | static void pidns_put(void *ns) | 355 | static void pidns_put(struct ns_common *ns) |
| 330 | { | 356 | { |
| 331 | put_pid_ns(ns); | 357 | put_pid_ns(to_pid_ns(ns)); |
| 332 | } | 358 | } |
| 333 | 359 | ||
| 334 | static int pidns_install(struct nsproxy *nsproxy, void *ns) | 360 | static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) |
| 335 | { | 361 | { |
| 336 | struct pid_namespace *active = task_active_pid_ns(current); | 362 | struct pid_namespace *active = task_active_pid_ns(current); |
| 337 | struct pid_namespace *ancestor, *new = ns; | 363 | struct pid_namespace *ancestor, *new = to_pid_ns(ns); |
| 338 | 364 | ||
| 339 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || | 365 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || |
| 340 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) | 366 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
| @@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) | |||
| 362 | return 0; | 388 | return 0; |
| 363 | } | 389 | } |
| 364 | 390 | ||
| 365 | static unsigned int pidns_inum(void *ns) | ||
| 366 | { | ||
| 367 | struct pid_namespace *pid_ns = ns; | ||
| 368 | return pid_ns->proc_inum; | ||
| 369 | } | ||
| 370 | |||
| 371 | const struct proc_ns_operations pidns_operations = { | 391 | const struct proc_ns_operations pidns_operations = { |
| 372 | .name = "pid", | 392 | .name = "pid", |
| 373 | .type = CLONE_NEWPID, | 393 | .type = CLONE_NEWPID, |
| 374 | .get = pidns_get, | 394 | .get = pidns_get, |
| 375 | .put = pidns_put, | 395 | .put = pidns_put, |
| 376 | .install = pidns_install, | 396 | .install = pidns_install, |
| 377 | .inum = pidns_inum, | ||
| 378 | }; | 397 | }; |
| 379 | 398 | ||
| 380 | static __init int pid_namespaces_init(void) | 399 | static __init int pid_namespaces_init(void) |
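Every proc_ns_operations callback now takes a struct ns_common and converts back to the concrete namespace with container_of(), exactly as to_pid_ns() does above. The general shape of that embedding, with illustrative (not in-tree) type names:

	#include <linux/kernel.h>	/* container_of() */

	struct generic_ns {			/* stand-in for struct ns_common */
		unsigned int inum;
		const void *ops;
	};

	struct my_namespace {
		int users;
		struct generic_ns ns;		/* embedded by value, not a pointer */
	};

	static inline struct my_namespace *to_my_ns(struct generic_ns *ns)
	{
		/* Valid because 'ns' sits at a fixed offset inside my_namespace. */
		return container_of(ns, struct my_namespace, ns);
	}
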
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index bbef57f5bdfd..48b28d387c7f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -94,6 +94,7 @@ config PM_STD_PARTITION | |||
| 94 | config PM_SLEEP | 94 | config PM_SLEEP |
| 95 | def_bool y | 95 | def_bool y |
| 96 | depends on SUSPEND || HIBERNATE_CALLBACKS | 96 | depends on SUSPEND || HIBERNATE_CALLBACKS |
| 97 | select PM | ||
| 97 | 98 | ||
| 98 | config PM_SLEEP_SMP | 99 | config PM_SLEEP_SMP |
| 99 | def_bool y | 100 | def_bool y |
| @@ -129,24 +130,19 @@ config PM_WAKELOCKS_GC | |||
| 129 | depends on PM_WAKELOCKS | 130 | depends on PM_WAKELOCKS |
| 130 | default y | 131 | default y |
| 131 | 132 | ||
| 132 | config PM_RUNTIME | 133 | config PM |
| 133 | bool "Run-time PM core functionality" | 134 | bool "Device power management core functionality" |
| 134 | depends on !IA64_HP_SIM | ||
| 135 | ---help--- | 135 | ---help--- |
| 136 | Enable functionality allowing I/O devices to be put into energy-saving | 136 | Enable functionality allowing I/O devices to be put into energy-saving |
| 137 | (low power) states at run time (or autosuspended) after a specified | 137 | (low power) states, for example after a specified period of inactivity |
| 138 | period of inactivity and woken up in response to a hardware-generated | 138 | (autosuspended), and woken up in response to a hardware-generated |
| 139 | wake-up event or a driver's request. | 139 | wake-up event or a driver's request. |
| 140 | 140 | ||
| 141 | Hardware support is generally required for this functionality to work | 141 | Hardware support is generally required for this functionality to work |
| 142 | and the bus type drivers of the buses the devices are on are | 142 | and the bus type drivers of the buses the devices are on are |
| 143 | responsible for the actual handling of the autosuspend requests and | 143 | responsible for the actual handling of device suspend requests and |
| 144 | wake-up events. | 144 | wake-up events. |
| 145 | 145 | ||
| 146 | config PM | ||
| 147 | def_bool y | ||
| 148 | depends on PM_SLEEP || PM_RUNTIME | ||
| 149 | |||
| 150 | config PM_DEBUG | 146 | config PM_DEBUG |
| 151 | bool "Power Management Debug Support" | 147 | bool "Power Management Debug Support" |
| 152 | depends on PM | 148 | depends on PM |
| @@ -298,14 +294,9 @@ config PM_GENERIC_DOMAINS_SLEEP | |||
| 298 | def_bool y | 294 | def_bool y |
| 299 | depends on PM_SLEEP && PM_GENERIC_DOMAINS | 295 | depends on PM_SLEEP && PM_GENERIC_DOMAINS |
| 300 | 296 | ||
| 301 | config PM_GENERIC_DOMAINS_RUNTIME | ||
| 302 | def_bool y | ||
| 303 | depends on PM_RUNTIME && PM_GENERIC_DOMAINS | ||
| 304 | |||
| 305 | config PM_GENERIC_DOMAINS_OF | 297 | config PM_GENERIC_DOMAINS_OF |
| 306 | def_bool y | 298 | def_bool y |
| 307 | depends on PM_GENERIC_DOMAINS && OF | 299 | depends on PM_GENERIC_DOMAINS && OF |
| 308 | 300 | ||
| 309 | config CPU_PM | 301 | config CPU_PM |
| 310 | bool | 302 | bool |
| 311 | depends on SUSPEND || CPU_IDLE | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f35a3478f3c..2329daae5255 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/syscore_ops.h> | 28 | #include <linux/syscore_ops.h> |
| 29 | #include <linux/ctype.h> | 29 | #include <linux/ctype.h> |
| 30 | #include <linux/genhd.h> | 30 | #include <linux/genhd.h> |
| 31 | #include <linux/ktime.h> | ||
| 31 | #include <trace/events/power.h> | 32 | #include <trace/events/power.h> |
| 32 | 33 | ||
| 33 | #include "power.h" | 34 | #include "power.h" |
| @@ -232,20 +233,17 @@ static void platform_recover(int platform_mode) | |||
| 232 | * @nr_pages: Number of memory pages processed between @start and @stop. | 233 | * @nr_pages: Number of memory pages processed between @start and @stop. |
| 233 | * @msg: Additional diagnostic message to print. | 234 | * @msg: Additional diagnostic message to print. |
| 234 | */ | 235 | */ |
| 235 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, | 236 | void swsusp_show_speed(ktime_t start, ktime_t stop, |
| 236 | unsigned nr_pages, char *msg) | 237 | unsigned nr_pages, char *msg) |
| 237 | { | 238 | { |
| 239 | ktime_t diff; | ||
| 238 | u64 elapsed_centisecs64; | 240 | u64 elapsed_centisecs64; |
| 239 | unsigned int centisecs; | 241 | unsigned int centisecs; |
| 240 | unsigned int k; | 242 | unsigned int k; |
| 241 | unsigned int kps; | 243 | unsigned int kps; |
| 242 | 244 | ||
| 243 | elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); | 245 | diff = ktime_sub(stop, start); |
| 244 | /* | 246 | elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC); |
| 245 | * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time, | ||
| 246 | * it is obvious enough for what went wrong. | ||
| 247 | */ | ||
| 248 | do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); | ||
| 249 | centisecs = elapsed_centisecs64; | 247 | centisecs = elapsed_centisecs64; |
| 250 | if (centisecs == 0) | 248 | if (centisecs == 0) |
| 251 | centisecs = 1; /* avoid div-by-zero */ | 249 | centisecs = 1; /* avoid div-by-zero */ |
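swsusp_show_speed() now works on ktime_t values: the elapsed time comes from ktime_sub() and is turned into centiseconds with ktime_divns() instead of timeval arithmetic. A minimal sketch of timing a chunk of work that way (the work itself is hypothetical):

	#include <linux/ktime.h>
	#include <linux/printk.h>

	static void time_some_work(unsigned int nr_pages)
	{
		ktime_t start, stop;
		u64 centisecs;

		start = ktime_get();
		/* ... process nr_pages pages here (hypothetical work) ... */
		stop = ktime_get();

		/* 10 * NSEC_PER_MSEC nanoseconds == one centisecond */
		centisecs = ktime_divns(ktime_sub(stop, start), 10 * NSEC_PER_MSEC);
		if (!centisecs)
			centisecs = 1;		/* avoid div-by-zero in rate math */

		pr_info("processed %u pages in %llu.%02llu seconds\n",
			nr_pages, centisecs / 100, centisecs % 100);
	}
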
diff --git a/kernel/power/power.h b/kernel/power/power.h index 2df883a9d3cb..ce9b8328a689 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain); | |||
| 174 | 174 | ||
| 175 | struct timeval; | 175 | struct timeval; |
| 176 | /* kernel/power/swsusp.c */ | 176 | /* kernel/power/swsusp.c */ |
| 177 | extern void swsusp_show_speed(struct timeval *, struct timeval *, | 177 | extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); |
| 178 | unsigned int, char *); | ||
| 179 | 178 | ||
| 180 | #ifdef CONFIG_SUSPEND | 179 | #ifdef CONFIG_SUSPEND |
| 181 | /* kernel/power/suspend.c */ | 180 | /* kernel/power/suspend.c */ |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 791a61892bb5..0c40c16174b4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/list.h> | 28 | #include <linux/list.h> |
| 29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
| 30 | #include <linux/compiler.h> | 30 | #include <linux/compiler.h> |
| 31 | #include <linux/ktime.h> | ||
| 31 | 32 | ||
| 32 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
| 33 | #include <asm/mmu_context.h> | 34 | #include <asm/mmu_context.h> |
| @@ -1576,11 +1577,11 @@ int hibernate_preallocate_memory(void) | |||
| 1576 | struct zone *zone; | 1577 | struct zone *zone; |
| 1577 | unsigned long saveable, size, max_size, count, highmem, pages = 0; | 1578 | unsigned long saveable, size, max_size, count, highmem, pages = 0; |
| 1578 | unsigned long alloc, save_highmem, pages_highmem, avail_normal; | 1579 | unsigned long alloc, save_highmem, pages_highmem, avail_normal; |
| 1579 | struct timeval start, stop; | 1580 | ktime_t start, stop; |
| 1580 | int error; | 1581 | int error; |
| 1581 | 1582 | ||
| 1582 | printk(KERN_INFO "PM: Preallocating image memory... "); | 1583 | printk(KERN_INFO "PM: Preallocating image memory... "); |
| 1583 | do_gettimeofday(&start); | 1584 | start = ktime_get(); |
| 1584 | 1585 | ||
| 1585 | error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); | 1586 | error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); |
| 1586 | if (error) | 1587 | if (error) |
| @@ -1709,9 +1710,9 @@ int hibernate_preallocate_memory(void) | |||
| 1709 | free_unnecessary_pages(); | 1710 | free_unnecessary_pages(); |
| 1710 | 1711 | ||
| 1711 | out: | 1712 | out: |
| 1712 | do_gettimeofday(&stop); | 1713 | stop = ktime_get(); |
| 1713 | printk(KERN_CONT "done (allocated %lu pages)\n", pages); | 1714 | printk(KERN_CONT "done (allocated %lu pages)\n", pages); |
| 1714 | swsusp_show_speed(&start, &stop, pages, "Allocated"); | 1715 | swsusp_show_speed(start, stop, pages, "Allocated"); |
| 1715 | 1716 | ||
| 1716 | return 0; | 1717 | return 0; |
| 1717 | 1718 | ||
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index aaa3261dea5d..570aff817543 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/atomic.h> | 30 | #include <linux/atomic.h> |
| 31 | #include <linux/kthread.h> | 31 | #include <linux/kthread.h> |
| 32 | #include <linux/crc32.h> | 32 | #include <linux/crc32.h> |
| 33 | #include <linux/ktime.h> | ||
| 33 | 34 | ||
| 34 | #include "power.h" | 35 | #include "power.h" |
| 35 | 36 | ||
| @@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle, | |||
| 445 | int nr_pages; | 446 | int nr_pages; |
| 446 | int err2; | 447 | int err2; |
| 447 | struct bio *bio; | 448 | struct bio *bio; |
| 448 | struct timeval start; | 449 | ktime_t start; |
| 449 | struct timeval stop; | 450 | ktime_t stop; |
| 450 | 451 | ||
| 451 | printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", | 452 | printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", |
| 452 | nr_to_write); | 453 | nr_to_write); |
| @@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle, | |||
| 455 | m = 1; | 456 | m = 1; |
| 456 | nr_pages = 0; | 457 | nr_pages = 0; |
| 457 | bio = NULL; | 458 | bio = NULL; |
| 458 | do_gettimeofday(&start); | 459 | start = ktime_get(); |
| 459 | while (1) { | 460 | while (1) { |
| 460 | ret = snapshot_read_next(snapshot); | 461 | ret = snapshot_read_next(snapshot); |
| 461 | if (ret <= 0) | 462 | if (ret <= 0) |
| @@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle, | |||
| 469 | nr_pages++; | 470 | nr_pages++; |
| 470 | } | 471 | } |
| 471 | err2 = hib_wait_on_bio_chain(&bio); | 472 | err2 = hib_wait_on_bio_chain(&bio); |
| 472 | do_gettimeofday(&stop); | 473 | stop = ktime_get(); |
| 473 | if (!ret) | 474 | if (!ret) |
| 474 | ret = err2; | 475 | ret = err2; |
| 475 | if (!ret) | 476 | if (!ret) |
| 476 | printk(KERN_INFO "PM: Image saving done.\n"); | 477 | printk(KERN_INFO "PM: Image saving done.\n"); |
| 477 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); | 478 | swsusp_show_speed(start, stop, nr_to_write, "Wrote"); |
| 478 | return ret; | 479 | return ret; |
| 479 | } | 480 | } |
| 480 | 481 | ||
| @@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
| 580 | int nr_pages; | 581 | int nr_pages; |
| 581 | int err2; | 582 | int err2; |
| 582 | struct bio *bio; | 583 | struct bio *bio; |
| 583 | struct timeval start; | 584 | ktime_t start; |
| 584 | struct timeval stop; | 585 | ktime_t stop; |
| 585 | size_t off; | 586 | size_t off; |
| 586 | unsigned thr, run_threads, nr_threads; | 587 | unsigned thr, run_threads, nr_threads; |
| 587 | unsigned char *page = NULL; | 588 | unsigned char *page = NULL; |
| @@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
| 674 | m = 1; | 675 | m = 1; |
| 675 | nr_pages = 0; | 676 | nr_pages = 0; |
| 676 | bio = NULL; | 677 | bio = NULL; |
| 677 | do_gettimeofday(&start); | 678 | start = ktime_get(); |
| 678 | for (;;) { | 679 | for (;;) { |
| 679 | for (thr = 0; thr < nr_threads; thr++) { | 680 | for (thr = 0; thr < nr_threads; thr++) { |
| 680 | for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { | 681 | for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { |
| @@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
| 759 | 760 | ||
| 760 | out_finish: | 761 | out_finish: |
| 761 | err2 = hib_wait_on_bio_chain(&bio); | 762 | err2 = hib_wait_on_bio_chain(&bio); |
| 762 | do_gettimeofday(&stop); | 763 | stop = ktime_get(); |
| 763 | if (!ret) | 764 | if (!ret) |
| 764 | ret = err2; | 765 | ret = err2; |
| 765 | if (!ret) | 766 | if (!ret) |
| 766 | printk(KERN_INFO "PM: Image saving done.\n"); | 767 | printk(KERN_INFO "PM: Image saving done.\n"); |
| 767 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); | 768 | swsusp_show_speed(start, stop, nr_to_write, "Wrote"); |
| 768 | out_clean: | 769 | out_clean: |
| 769 | if (crc) { | 770 | if (crc) { |
| 770 | if (crc->thr) | 771 | if (crc->thr) |
| @@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle, | |||
| 965 | { | 966 | { |
| 966 | unsigned int m; | 967 | unsigned int m; |
| 967 | int ret = 0; | 968 | int ret = 0; |
| 968 | struct timeval start; | 969 | ktime_t start; |
| 969 | struct timeval stop; | 970 | ktime_t stop; |
| 970 | struct bio *bio; | 971 | struct bio *bio; |
| 971 | int err2; | 972 | int err2; |
| 972 | unsigned nr_pages; | 973 | unsigned nr_pages; |
| @@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle, | |||
| 978 | m = 1; | 979 | m = 1; |
| 979 | nr_pages = 0; | 980 | nr_pages = 0; |
| 980 | bio = NULL; | 981 | bio = NULL; |
| 981 | do_gettimeofday(&start); | 982 | start = ktime_get(); |
| 982 | for ( ; ; ) { | 983 | for ( ; ; ) { |
| 983 | ret = snapshot_write_next(snapshot); | 984 | ret = snapshot_write_next(snapshot); |
| 984 | if (ret <= 0) | 985 | if (ret <= 0) |
| @@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle, | |||
| 996 | nr_pages++; | 997 | nr_pages++; |
| 997 | } | 998 | } |
| 998 | err2 = hib_wait_on_bio_chain(&bio); | 999 | err2 = hib_wait_on_bio_chain(&bio); |
| 999 | do_gettimeofday(&stop); | 1000 | stop = ktime_get(); |
| 1000 | if (!ret) | 1001 | if (!ret) |
| 1001 | ret = err2; | 1002 | ret = err2; |
| 1002 | if (!ret) { | 1003 | if (!ret) { |
| @@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle, | |||
| 1005 | if (!snapshot_image_loaded(snapshot)) | 1006 | if (!snapshot_image_loaded(snapshot)) |
| 1006 | ret = -ENODATA; | 1007 | ret = -ENODATA; |
| 1007 | } | 1008 | } |
| 1008 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | 1009 | swsusp_show_speed(start, stop, nr_to_read, "Read"); |
| 1009 | return ret; | 1010 | return ret; |
| 1010 | } | 1011 | } |
| 1011 | 1012 | ||
| @@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 1067 | int ret = 0; | 1068 | int ret = 0; |
| 1068 | int eof = 0; | 1069 | int eof = 0; |
| 1069 | struct bio *bio; | 1070 | struct bio *bio; |
| 1070 | struct timeval start; | 1071 | ktime_t start; |
| 1071 | struct timeval stop; | 1072 | ktime_t stop; |
| 1072 | unsigned nr_pages; | 1073 | unsigned nr_pages; |
| 1073 | size_t off; | 1074 | size_t off; |
| 1074 | unsigned i, thr, run_threads, nr_threads; | 1075 | unsigned i, thr, run_threads, nr_threads; |
| @@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 1190 | m = 1; | 1191 | m = 1; |
| 1191 | nr_pages = 0; | 1192 | nr_pages = 0; |
| 1192 | bio = NULL; | 1193 | bio = NULL; |
| 1193 | do_gettimeofday(&start); | 1194 | start = ktime_get(); |
| 1194 | 1195 | ||
| 1195 | ret = snapshot_write_next(snapshot); | 1196 | ret = snapshot_write_next(snapshot); |
| 1196 | if (ret <= 0) | 1197 | if (ret <= 0) |
| @@ -1343,7 +1344,7 @@ out_finish: | |||
| 1343 | wait_event(crc->done, atomic_read(&crc->stop)); | 1344 | wait_event(crc->done, atomic_read(&crc->stop)); |
| 1344 | atomic_set(&crc->stop, 0); | 1345 | atomic_set(&crc->stop, 0); |
| 1345 | } | 1346 | } |
| 1346 | do_gettimeofday(&stop); | 1347 | stop = ktime_get(); |
| 1347 | if (!ret) { | 1348 | if (!ret) { |
| 1348 | printk(KERN_INFO "PM: Image loading done.\n"); | 1349 | printk(KERN_INFO "PM: Image loading done.\n"); |
| 1349 | snapshot_write_finalize(snapshot); | 1350 | snapshot_write_finalize(snapshot); |
| @@ -1359,7 +1360,7 @@ out_finish: | |||
| 1359 | } | 1360 | } |
| 1360 | } | 1361 | } |
| 1361 | } | 1362 | } |
| 1362 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | 1363 | swsusp_show_speed(start, stop, nr_to_read, "Read"); |
| 1363 | out_clean: | 1364 | out_clean: |
| 1364 | for (i = 0; i < ring_size; i++) | 1365 | for (i = 0; i < ring_size; i++) |
| 1365 | free_page((unsigned long)page[i]); | 1366 | free_page((unsigned long)page[i]); |
| @@ -1374,7 +1375,7 @@ out_clean: | |||
| 1374 | kthread_stop(data[thr].thr); | 1375 | kthread_stop(data[thr].thr); |
| 1375 | vfree(data); | 1376 | vfree(data); |
| 1376 | } | 1377 | } |
| 1377 | if (page) vfree(page); | 1378 | vfree(page); |
| 1378 | 1379 | ||
| 1379 | return ret; | 1380 | return ret; |
| 1380 | } | 1381 | } |
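The `if (page) vfree(page)` cleanup relies on vfree() being, like kfree(), a no-op on NULL, so teardown paths can free unconditionally. Sketch:

	#include <linux/vmalloc.h>

	static void free_ring(void **pages, unsigned int n)
	{
		unsigned int i;

		/* Slots that were never allocated are NULL; vfree(NULL) does nothing. */
		for (i = 0; i < n; i++)
			vfree(pages[i]);
	}
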
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ced2b84b1cb7..02d6b6d28796 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -62,9 +62,6 @@ int console_printk[4] = { | |||
| 62 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ | 62 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ |
| 63 | }; | 63 | }; |
| 64 | 64 | ||
| 65 | /* Deferred messaged from sched code are marked by this special level */ | ||
| 66 | #define SCHED_MESSAGE_LOGLEVEL -2 | ||
| 67 | |||
| 68 | /* | 65 | /* |
| 69 | * Low level drivers may need that to know if they can schedule in | 66 | * Low level drivers may need that to know if they can schedule in |
| 70 | * their unblank() callback or not. So let's export it. | 67 | * their unblank() callback or not. So let's export it. |
| @@ -480,7 +477,7 @@ static int syslog_action_restricted(int type) | |||
| 480 | type != SYSLOG_ACTION_SIZE_BUFFER; | 477 | type != SYSLOG_ACTION_SIZE_BUFFER; |
| 481 | } | 478 | } |
| 482 | 479 | ||
| 483 | static int check_syslog_permissions(int type, bool from_file) | 480 | int check_syslog_permissions(int type, bool from_file) |
| 484 | { | 481 | { |
| 485 | /* | 482 | /* |
| 486 | * If this is from /proc/kmsg and we've already opened it, then we've | 483 | * If this is from /proc/kmsg and we've already opened it, then we've |
| @@ -1259,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 1259 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 1256 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
| 1260 | { | 1257 | { |
| 1261 | bool clear = false; | 1258 | bool clear = false; |
| 1262 | static int saved_console_loglevel = -1; | 1259 | static int saved_console_loglevel = LOGLEVEL_DEFAULT; |
| 1263 | int error; | 1260 | int error; |
| 1264 | 1261 | ||
| 1265 | error = check_syslog_permissions(type, from_file); | 1262 | error = check_syslog_permissions(type, from_file); |
| @@ -1316,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1316 | break; | 1313 | break; |
| 1317 | /* Disable logging to console */ | 1314 | /* Disable logging to console */ |
| 1318 | case SYSLOG_ACTION_CONSOLE_OFF: | 1315 | case SYSLOG_ACTION_CONSOLE_OFF: |
| 1319 | if (saved_console_loglevel == -1) | 1316 | if (saved_console_loglevel == LOGLEVEL_DEFAULT) |
| 1320 | saved_console_loglevel = console_loglevel; | 1317 | saved_console_loglevel = console_loglevel; |
| 1321 | console_loglevel = minimum_console_loglevel; | 1318 | console_loglevel = minimum_console_loglevel; |
| 1322 | break; | 1319 | break; |
| 1323 | /* Enable logging to console */ | 1320 | /* Enable logging to console */ |
| 1324 | case SYSLOG_ACTION_CONSOLE_ON: | 1321 | case SYSLOG_ACTION_CONSOLE_ON: |
| 1325 | if (saved_console_loglevel != -1) { | 1322 | if (saved_console_loglevel != LOGLEVEL_DEFAULT) { |
| 1326 | console_loglevel = saved_console_loglevel; | 1323 | console_loglevel = saved_console_loglevel; |
| 1327 | saved_console_loglevel = -1; | 1324 | saved_console_loglevel = LOGLEVEL_DEFAULT; |
| 1328 | } | 1325 | } |
| 1329 | break; | 1326 | break; |
| 1330 | /* Set level of messages printed to console */ | 1327 | /* Set level of messages printed to console */ |
| @@ -1336,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1336 | len = minimum_console_loglevel; | 1333 | len = minimum_console_loglevel; |
| 1337 | console_loglevel = len; | 1334 | console_loglevel = len; |
| 1338 | /* Implicitly re-enable logging to console */ | 1335 | /* Implicitly re-enable logging to console */ |
| 1339 | saved_console_loglevel = -1; | 1336 | saved_console_loglevel = LOGLEVEL_DEFAULT; |
| 1340 | error = 0; | 1337 | error = 0; |
| 1341 | break; | 1338 | break; |
| 1342 | /* Number of chars in the log buffer */ | 1339 | /* Number of chars in the log buffer */ |
| @@ -1627,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1627 | int printed_len = 0; | 1624 | int printed_len = 0; |
| 1628 | bool in_sched = false; | 1625 | bool in_sched = false; |
| 1629 | /* cpu currently holding logbuf_lock in this function */ | 1626 | /* cpu currently holding logbuf_lock in this function */ |
| 1630 | static volatile unsigned int logbuf_cpu = UINT_MAX; | 1627 | static unsigned int logbuf_cpu = UINT_MAX; |
| 1631 | 1628 | ||
| 1632 | if (level == SCHED_MESSAGE_LOGLEVEL) { | 1629 | if (level == LOGLEVEL_SCHED) { |
| 1633 | level = -1; | 1630 | level = LOGLEVEL_DEFAULT; |
| 1634 | in_sched = true; | 1631 | in_sched = true; |
| 1635 | } | 1632 | } |
| 1636 | 1633 | ||
| @@ -1695,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1695 | const char *end_of_header = printk_skip_level(text); | 1692 | const char *end_of_header = printk_skip_level(text); |
| 1696 | switch (kern_level) { | 1693 | switch (kern_level) { |
| 1697 | case '0' ... '7': | 1694 | case '0' ... '7': |
| 1698 | if (level == -1) | 1695 | if (level == LOGLEVEL_DEFAULT) |
| 1699 | level = kern_level - '0'; | 1696 | level = kern_level - '0'; |
| 1697 | /* fallthrough */ | ||
| 1700 | case 'd': /* KERN_DEFAULT */ | 1698 | case 'd': /* KERN_DEFAULT */ |
| 1701 | lflags |= LOG_PREFIX; | 1699 | lflags |= LOG_PREFIX; |
| 1702 | } | 1700 | } |
| @@ -1710,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1710 | } | 1708 | } |
| 1711 | } | 1709 | } |
| 1712 | 1710 | ||
| 1713 | if (level == -1) | 1711 | if (level == LOGLEVEL_DEFAULT) |
| 1714 | level = default_message_loglevel; | 1712 | level = default_message_loglevel; |
| 1715 | 1713 | ||
| 1716 | if (dict) | 1714 | if (dict) |
| @@ -1788,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit); | |||
| 1788 | 1786 | ||
| 1789 | asmlinkage int vprintk(const char *fmt, va_list args) | 1787 | asmlinkage int vprintk(const char *fmt, va_list args) |
| 1790 | { | 1788 | { |
| 1791 | return vprintk_emit(0, -1, NULL, 0, fmt, args); | 1789 | return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); |
| 1792 | } | 1790 | } |
| 1793 | EXPORT_SYMBOL(vprintk); | 1791 | EXPORT_SYMBOL(vprintk); |
| 1794 | 1792 | ||
| @@ -1807,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level, | |||
| 1807 | } | 1805 | } |
| 1808 | EXPORT_SYMBOL(printk_emit); | 1806 | EXPORT_SYMBOL(printk_emit); |
| 1809 | 1807 | ||
| 1808 | int vprintk_default(const char *fmt, va_list args) | ||
| 1809 | { | ||
| 1810 | int r; | ||
| 1811 | |||
| 1812 | #ifdef CONFIG_KGDB_KDB | ||
| 1813 | if (unlikely(kdb_trap_printk)) { | ||
| 1814 | r = vkdb_printf(fmt, args); | ||
| 1815 | return r; | ||
| 1816 | } | ||
| 1817 | #endif | ||
| 1818 | r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); | ||
| 1819 | |||
| 1820 | return r; | ||
| 1821 | } | ||
| 1822 | EXPORT_SYMBOL_GPL(vprintk_default); | ||
| 1823 | |||
| 1824 | /* | ||
| 1825 | * This allows printk to be diverted to another function per cpu. | ||
| 1826 | * This is useful for calling printk functions from within NMI | ||
| 1827 | * without worrying about race conditions that can lock up the | ||
| 1828 | * box. | ||
| 1829 | */ | ||
| 1830 | DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; | ||
| 1831 | |||
| 1810 | /** | 1832 | /** |
| 1811 | * printk - print a kernel message | 1833 | * printk - print a kernel message |
| 1812 | * @fmt: format string | 1834 | * @fmt: format string |
| @@ -1830,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit); | |||
| 1830 | */ | 1852 | */ |
| 1831 | asmlinkage __visible int printk(const char *fmt, ...) | 1853 | asmlinkage __visible int printk(const char *fmt, ...) |
| 1832 | { | 1854 | { |
| 1855 | printk_func_t vprintk_func; | ||
| 1833 | va_list args; | 1856 | va_list args; |
| 1834 | int r; | 1857 | int r; |
| 1835 | 1858 | ||
| 1836 | #ifdef CONFIG_KGDB_KDB | ||
| 1837 | if (unlikely(kdb_trap_printk)) { | ||
| 1838 | va_start(args, fmt); | ||
| 1839 | r = vkdb_printf(fmt, args); | ||
| 1840 | va_end(args); | ||
| 1841 | return r; | ||
| 1842 | } | ||
| 1843 | #endif | ||
| 1844 | va_start(args, fmt); | 1859 | va_start(args, fmt); |
| 1845 | r = vprintk_emit(0, -1, NULL, 0, fmt, args); | 1860 | |
| 1861 | /* | ||
| 1862 | * If a caller overrides the per_cpu printk_func, then it needs | ||
| 1863 | * to disable preemption when calling printk(). Otherwise | ||
| 1864 | * the printk_func should be set to the default. No need to | ||
| 1865 | * disable preemption here. | ||
| 1866 | */ | ||
| 1867 | vprintk_func = this_cpu_read(printk_func); | ||
| 1868 | r = vprintk_func(fmt, args); | ||
| 1869 | |||
| 1846 | va_end(args); | 1870 | va_end(args); |
| 1847 | 1871 | ||
| 1848 | return r; | 1872 | return r; |
| @@ -1876,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, | |||
| 1876 | bool syslog, char *buf, size_t size) { return 0; } | 1900 | bool syslog, char *buf, size_t size) { return 0; } |
| 1877 | static size_t cont_print_text(char *text, size_t size) { return 0; } | 1901 | static size_t cont_print_text(char *text, size_t size) { return 0; } |
| 1878 | 1902 | ||
| 1903 | /* Still needs to be defined for users */ | ||
| 1904 | DEFINE_PER_CPU(printk_func_t, printk_func); | ||
| 1905 | |||
| 1879 | #endif /* CONFIG_PRINTK */ | 1906 | #endif /* CONFIG_PRINTK */ |
| 1880 | 1907 | ||
| 1881 | #ifdef CONFIG_EARLY_PRINTK | 1908 | #ifdef CONFIG_EARLY_PRINTK |
| 1882 | struct console *early_console; | 1909 | struct console *early_console; |
| 1883 | 1910 | ||
| 1884 | void early_vprintk(const char *fmt, va_list ap) | ||
| 1885 | { | ||
| 1886 | if (early_console) { | ||
| 1887 | char buf[512]; | ||
| 1888 | int n = vscnprintf(buf, sizeof(buf), fmt, ap); | ||
| 1889 | |||
| 1890 | early_console->write(early_console, buf, n); | ||
| 1891 | } | ||
| 1892 | } | ||
| 1893 | |||
| 1894 | asmlinkage __visible void early_printk(const char *fmt, ...) | 1911 | asmlinkage __visible void early_printk(const char *fmt, ...) |
| 1895 | { | 1912 | { |
| 1896 | va_list ap; | 1913 | va_list ap; |
| 1914 | char buf[512]; | ||
| 1915 | int n; | ||
| 1916 | |||
| 1917 | if (!early_console) | ||
| 1918 | return; | ||
| 1897 | 1919 | ||
| 1898 | va_start(ap, fmt); | 1920 | va_start(ap, fmt); |
| 1899 | early_vprintk(fmt, ap); | 1921 | n = vscnprintf(buf, sizeof(buf), fmt, ap); |
| 1900 | va_end(ap); | 1922 | va_end(ap); |
| 1923 | |||
| 1924 | early_console->write(early_console, buf, n); | ||
| 1901 | } | 1925 | } |
| 1902 | #endif | 1926 | #endif |
| 1903 | 1927 | ||
| @@ -2634,7 +2658,7 @@ int printk_deferred(const char *fmt, ...) | |||
| 2634 | 2658 | ||
| 2635 | preempt_disable(); | 2659 | preempt_disable(); |
| 2636 | va_start(args, fmt); | 2660 | va_start(args, fmt); |
| 2637 | r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); | 2661 | r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); |
| 2638 | va_end(args); | 2662 | va_end(args); |
| 2639 | 2663 | ||
| 2640 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); | 2664 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); |
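printk() now dispatches through a per-CPU printk_func pointer that defaults to vprintk_default(), so a facility such as kdb can divert output on one CPU without racing the logbuf lock. A hedged sketch of overriding it — it assumes the printk_func per-CPU variable and vprintk_default() are declared where the caller can see them, and the replacement handler is hypothetical:

	#include <linux/percpu.h>
	#include <linux/preempt.h>
	#include <linux/printk.h>

	/* Hypothetical NMI-safe sink; a real one would log into a lockless buffer. */
	static int my_diverted_vprintf(const char *fmt, va_list args)
	{
		return 0;
	}

	static void divert_printk_on_this_cpu(void)
	{
		/*
		 * Per the comment in printk(), an override must only be installed
		 * and used with preemption disabled.
		 */
		preempt_disable();
		this_cpu_write(printk_func, my_diverted_vprintf);
	}

	static void restore_printk_on_this_cpu(void)
	{
		this_cpu_write(printk_func, vprintk_default);
		preempt_enable();
	}
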
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54e75226c2c4..1eb9d90c3af9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
| 485 | 485 | ||
| 486 | /* | 486 | /* |
| 487 | * Detach all tasks we were using ptrace on. Called with tasklist held | 487 | * Detach all tasks we were using ptrace on. Called with tasklist held |
| 488 | * for writing, and returns with it held too. But note it can release | 488 | * for writing. |
| 489 | * and reacquire the lock. | ||
| 490 | */ | 489 | */ |
| 491 | void exit_ptrace(struct task_struct *tracer) | 490 | void exit_ptrace(struct task_struct *tracer, struct list_head *dead) |
| 492 | __releases(&tasklist_lock) | ||
| 493 | __acquires(&tasklist_lock) | ||
| 494 | { | 491 | { |
| 495 | struct task_struct *p, *n; | 492 | struct task_struct *p, *n; |
| 496 | LIST_HEAD(ptrace_dead); | ||
| 497 | |||
| 498 | if (likely(list_empty(&tracer->ptraced))) | ||
| 499 | return; | ||
| 500 | 493 | ||
| 501 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { | 494 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { |
| 502 | if (unlikely(p->ptrace & PT_EXITKILL)) | 495 | if (unlikely(p->ptrace & PT_EXITKILL)) |
| 503 | send_sig_info(SIGKILL, SEND_SIG_FORCED, p); | 496 | send_sig_info(SIGKILL, SEND_SIG_FORCED, p); |
| 504 | 497 | ||
| 505 | if (__ptrace_detach(tracer, p)) | 498 | if (__ptrace_detach(tracer, p)) |
| 506 | list_add(&p->ptrace_entry, &ptrace_dead); | 499 | list_add(&p->ptrace_entry, dead); |
| 507 | } | ||
| 508 | |||
| 509 | write_unlock_irq(&tasklist_lock); | ||
| 510 | BUG_ON(!list_empty(&tracer->ptraced)); | ||
| 511 | |||
| 512 | list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { | ||
| 513 | list_del_init(&p->ptrace_entry); | ||
| 514 | release_task(p); | ||
| 515 | } | 500 | } |
| 516 | |||
| 517 | write_lock_irq(&tasklist_lock); | ||
| 518 | } | 501 | } |
| 519 | 502 | ||
| 520 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) | 503 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) |
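exit_ptrace() no longer drops and retakes tasklist_lock itself; it only moves detached tracees onto a caller-supplied list so the caller can release_task() them after unlocking. A sketch of the caller-side shape (the in-tree caller is the task exit path; this helper is illustrative):

	#include <linux/list.h>
	#include <linux/ptrace.h>
	#include <linux/sched.h>

	static void reap_dead_tracees(struct task_struct *tracer)
	{
		struct task_struct *p, *n;
		LIST_HEAD(dead);

		write_lock_irq(&tasklist_lock);
		if (!list_empty(&tracer->ptraced))
			exit_ptrace(tracer, &dead);	/* only queues, never unlocks */
		write_unlock_irq(&tasklist_lock);

		/* The heavy lifting happens outside the lock. */
		list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
			list_del_init(&p->ptrace_entry);
			release_task(p);
		}
	}
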
diff --git a/kernel/range.c b/kernel/range.c index 322ea8e93e4b..82cfc285b046 100644 --- a/kernel/range.c +++ b/kernel/range.c | |||
| @@ -113,12 +113,12 @@ static int cmp_range(const void *x1, const void *x2) | |||
| 113 | { | 113 | { |
| 114 | const struct range *r1 = x1; | 114 | const struct range *r1 = x1; |
| 115 | const struct range *r2 = x2; | 115 | const struct range *r2 = x2; |
| 116 | s64 start1, start2; | ||
| 117 | 116 | ||
| 118 | start1 = r1->start; | 117 | if (r1->start < r2->start) |
| 119 | start2 = r2->start; | 118 | return -1; |
| 120 | 119 | if (r1->start > r2->start) | |
| 121 | return start1 - start2; | 120 | return 1; |
| 121 | return 0; | ||
| 122 | } | 122 | } |
| 123 | 123 | ||
| 124 | int clean_sort_range(struct range *range, int az) | 124 | int clean_sort_range(struct range *range, int az) |
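The old cmp_range() returned an s64 difference through an int, so starts more than 2^31 apart could truncate or wrap and corrupt the sort order; the replacement is an explicit three-way compare. The same bug and fix in isolation:

	#include <linux/range.h>
	#include <linux/types.h>

	/* Buggy: the s64 difference is silently truncated to int on return. */
	static int cmp_range_buggy(const void *x1, const void *x2)
	{
		s64 start1 = ((const struct range *)x1)->start;
		s64 start2 = ((const struct range *)x2)->start;

		return start1 - start2;
	}

	/* Fixed: a branchy three-way compare can never overflow. */
	static int cmp_range_fixed(const void *x1, const void *x2)
	{
		const struct range *r1 = x1, *r2 = x2;

		if (r1->start < r2->start)
			return -1;
		if (r1->start > r2->start)
			return 1;
		return 0;
	}
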
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 807ccfbf69b3..e6fae503d1bc 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | obj-y += update.o srcu.o | 1 | obj-y += update.o srcu.o |
| 2 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 2 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
| 3 | obj-$(CONFIG_TREE_RCU) += tree.o | 3 | obj-$(CONFIG_TREE_RCU) += tree.o |
| 4 | obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o | 4 | obj-$(CONFIG_PREEMPT_RCU) += tree.o |
| 5 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o | 5 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o |
| 6 | obj-$(CONFIG_TINY_RCU) += tiny.o | 6 | obj-$(CONFIG_TINY_RCU) += tiny.o |
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ff1a6de62f17..07bb02eda844 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
| @@ -135,4 +135,6 @@ int rcu_jiffies_till_stall_check(void); | |||
| 135 | */ | 135 | */ |
| 136 | #define TPS(x) tracepoint_string(x) | 136 | #define TPS(x) tracepoint_string(x) |
| 137 | 137 | ||
| 138 | void rcu_early_boot_tests(void); | ||
| 139 | |||
| 138 | #endif /* __LINUX_RCU_H */ | 140 | #endif /* __LINUX_RCU_H */ |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 240fa9094f83..4d559baf06e0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -812,6 +812,7 @@ rcu_torture_cbflood(void *arg) | |||
| 812 | cur_ops->cb_barrier(); | 812 | cur_ops->cb_barrier(); |
| 813 | stutter_wait("rcu_torture_cbflood"); | 813 | stutter_wait("rcu_torture_cbflood"); |
| 814 | } while (!torture_must_stop()); | 814 | } while (!torture_must_stop()); |
| 815 | vfree(rhp); | ||
| 815 | torture_kthread_stopping("rcu_torture_cbflood"); | 816 | torture_kthread_stopping("rcu_torture_cbflood"); |
| 816 | return 0; | 817 | return 0; |
| 817 | } | 818 | } |
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c0623fc47125..0db5649f8817 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -247,7 +247,7 @@ void rcu_bh_qs(void) | |||
| 247 | * be called from hardirq context. It is normally called from the | 247 | * be called from hardirq context. It is normally called from the |
| 248 | * scheduling-clock interrupt. | 248 | * scheduling-clock interrupt. |
| 249 | */ | 249 | */ |
| 250 | void rcu_check_callbacks(int cpu, int user) | 250 | void rcu_check_callbacks(int user) |
| 251 | { | 251 | { |
| 252 | RCU_TRACE(check_cpu_stalls()); | 252 | RCU_TRACE(check_cpu_stalls()); |
| 253 | if (user || rcu_is_cpu_rrupt_from_idle()) | 253 | if (user || rcu_is_cpu_rrupt_from_idle()) |
| @@ -380,7 +380,9 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
| 380 | } | 380 | } |
| 381 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 381 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
| 382 | 382 | ||
| 383 | void rcu_init(void) | 383 | void __init rcu_init(void) |
| 384 | { | 384 | { |
| 385 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 385 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
| 386 | |||
| 387 | rcu_early_boot_tests(); | ||
| 386 | } | 388 | } |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9815447d22e0..7680fc275036 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -105,7 +105,7 @@ struct rcu_state sname##_state = { \ | |||
| 105 | .name = RCU_STATE_NAME(sname), \ | 105 | .name = RCU_STATE_NAME(sname), \ |
| 106 | .abbr = sabbr, \ | 106 | .abbr = sabbr, \ |
| 107 | }; \ | 107 | }; \ |
| 108 | DEFINE_PER_CPU(struct rcu_data, sname##_data) | 108 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data) |
| 109 | 109 | ||
| 110 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | 110 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); |
| 111 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 111 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); |
| @@ -152,19 +152,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); | |||
| 152 | */ | 152 | */ |
| 153 | static int rcu_scheduler_fully_active __read_mostly; | 153 | static int rcu_scheduler_fully_active __read_mostly; |
| 154 | 154 | ||
| 155 | #ifdef CONFIG_RCU_BOOST | ||
| 156 | |||
| 157 | /* | ||
| 158 | * Control variables for per-CPU and per-rcu_node kthreads. These | ||
| 159 | * handle all flavors of RCU. | ||
| 160 | */ | ||
| 161 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | ||
| 162 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
| 163 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
| 164 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | ||
| 165 | |||
| 166 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 167 | |||
| 168 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | 155 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); |
| 169 | static void invoke_rcu_core(void); | 156 | static void invoke_rcu_core(void); |
| 170 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 157 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
| @@ -286,11 +273,11 @@ static void rcu_momentary_dyntick_idle(void) | |||
| 286 | * and requires special handling for preemptible RCU. | 273 | * and requires special handling for preemptible RCU. |
| 287 | * The caller must have disabled preemption. | 274 | * The caller must have disabled preemption. |
| 288 | */ | 275 | */ |
| 289 | void rcu_note_context_switch(int cpu) | 276 | void rcu_note_context_switch(void) |
| 290 | { | 277 | { |
| 291 | trace_rcu_utilization(TPS("Start context switch")); | 278 | trace_rcu_utilization(TPS("Start context switch")); |
| 292 | rcu_sched_qs(); | 279 | rcu_sched_qs(); |
| 293 | rcu_preempt_note_context_switch(cpu); | 280 | rcu_preempt_note_context_switch(); |
| 294 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | 281 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) |
| 295 | rcu_momentary_dyntick_idle(); | 282 | rcu_momentary_dyntick_idle(); |
| 296 | trace_rcu_utilization(TPS("End context switch")); | 283 | trace_rcu_utilization(TPS("End context switch")); |
| @@ -325,7 +312,7 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 325 | unsigned long *maxj), | 312 | unsigned long *maxj), |
| 326 | bool *isidle, unsigned long *maxj); | 313 | bool *isidle, unsigned long *maxj); |
| 327 | static void force_quiescent_state(struct rcu_state *rsp); | 314 | static void force_quiescent_state(struct rcu_state *rsp); |
| 328 | static int rcu_pending(int cpu); | 315 | static int rcu_pending(void); |
| 329 | 316 | ||
| 330 | /* | 317 | /* |
| 331 | * Return the number of RCU-sched batches processed thus far for debug & stats. | 318 | * Return the number of RCU-sched batches processed thus far for debug & stats. |
| @@ -510,11 +497,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 510 | * we really have entered idle, and must do the appropriate accounting. | 497 | * we really have entered idle, and must do the appropriate accounting. |
| 511 | * The caller must have disabled interrupts. | 498 | * The caller must have disabled interrupts. |
| 512 | */ | 499 | */ |
| 513 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | 500 | static void rcu_eqs_enter_common(long long oldval, bool user) |
| 514 | bool user) | ||
| 515 | { | 501 | { |
| 516 | struct rcu_state *rsp; | 502 | struct rcu_state *rsp; |
| 517 | struct rcu_data *rdp; | 503 | struct rcu_data *rdp; |
| 504 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 518 | 505 | ||
| 519 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); | 506 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); |
| 520 | if (!user && !is_idle_task(current)) { | 507 | if (!user && !is_idle_task(current)) { |
| @@ -531,7 +518,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | |||
| 531 | rdp = this_cpu_ptr(rsp->rda); | 518 | rdp = this_cpu_ptr(rsp->rda); |
| 532 | do_nocb_deferred_wakeup(rdp); | 519 | do_nocb_deferred_wakeup(rdp); |
| 533 | } | 520 | } |
| 534 | rcu_prepare_for_idle(smp_processor_id()); | 521 | rcu_prepare_for_idle(); |
| 535 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 522 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
| 536 | smp_mb__before_atomic(); /* See above. */ | 523 | smp_mb__before_atomic(); /* See above. */ |
| 537 | atomic_inc(&rdtp->dynticks); | 524 | atomic_inc(&rdtp->dynticks); |
| @@ -565,7 +552,7 @@ static void rcu_eqs_enter(bool user) | |||
| 565 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); | 552 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); |
| 566 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { | 553 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { |
| 567 | rdtp->dynticks_nesting = 0; | 554 | rdtp->dynticks_nesting = 0; |
| 568 | rcu_eqs_enter_common(rdtp, oldval, user); | 555 | rcu_eqs_enter_common(oldval, user); |
| 569 | } else { | 556 | } else { |
| 570 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | 557 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; |
| 571 | } | 558 | } |
| @@ -589,7 +576,7 @@ void rcu_idle_enter(void) | |||
| 589 | 576 | ||
| 590 | local_irq_save(flags); | 577 | local_irq_save(flags); |
| 591 | rcu_eqs_enter(false); | 578 | rcu_eqs_enter(false); |
| 592 | rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); | 579 | rcu_sysidle_enter(0); |
| 593 | local_irq_restore(flags); | 580 | local_irq_restore(flags); |
| 594 | } | 581 | } |
| 595 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 582 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
| @@ -639,8 +626,8 @@ void rcu_irq_exit(void) | |||
| 639 | if (rdtp->dynticks_nesting) | 626 | if (rdtp->dynticks_nesting) |
| 640 | trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); | 627 | trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); |
| 641 | else | 628 | else |
| 642 | rcu_eqs_enter_common(rdtp, oldval, true); | 629 | rcu_eqs_enter_common(oldval, true); |
| 643 | rcu_sysidle_enter(rdtp, 1); | 630 | rcu_sysidle_enter(1); |
| 644 | local_irq_restore(flags); | 631 | local_irq_restore(flags); |
| 645 | } | 632 | } |
| 646 | 633 | ||
| @@ -651,16 +638,17 @@ void rcu_irq_exit(void) | |||
| 651 | * we really have exited idle, and must do the appropriate accounting. | 638 | * we really have exited idle, and must do the appropriate accounting. |
| 652 | * The caller must have disabled interrupts. | 639 | * The caller must have disabled interrupts. |
| 653 | */ | 640 | */ |
| 654 | static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | 641 | static void rcu_eqs_exit_common(long long oldval, int user) |
| 655 | int user) | ||
| 656 | { | 642 | { |
| 643 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 644 | |||
| 657 | rcu_dynticks_task_exit(); | 645 | rcu_dynticks_task_exit(); |
| 658 | smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ | 646 | smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ |
| 659 | atomic_inc(&rdtp->dynticks); | 647 | atomic_inc(&rdtp->dynticks); |
| 660 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 648 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ |
| 661 | smp_mb__after_atomic(); /* See above. */ | 649 | smp_mb__after_atomic(); /* See above. */ |
| 662 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 650 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
| 663 | rcu_cleanup_after_idle(smp_processor_id()); | 651 | rcu_cleanup_after_idle(); |
| 664 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); | 652 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); |
| 665 | if (!user && !is_idle_task(current)) { | 653 | if (!user && !is_idle_task(current)) { |
| 666 | struct task_struct *idle __maybe_unused = | 654 | struct task_struct *idle __maybe_unused = |
| @@ -691,7 +679,7 @@ static void rcu_eqs_exit(bool user) | |||
| 691 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | 679 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; |
| 692 | } else { | 680 | } else { |
| 693 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 681 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
| 694 | rcu_eqs_exit_common(rdtp, oldval, user); | 682 | rcu_eqs_exit_common(oldval, user); |
| 695 | } | 683 | } |
| 696 | } | 684 | } |
| 697 | 685 | ||
| @@ -712,7 +700,7 @@ void rcu_idle_exit(void) | |||
| 712 | 700 | ||
| 713 | local_irq_save(flags); | 701 | local_irq_save(flags); |
| 714 | rcu_eqs_exit(false); | 702 | rcu_eqs_exit(false); |
| 715 | rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); | 703 | rcu_sysidle_exit(0); |
| 716 | local_irq_restore(flags); | 704 | local_irq_restore(flags); |
| 717 | } | 705 | } |
| 718 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 706 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
| @@ -763,8 +751,8 @@ void rcu_irq_enter(void) | |||
| 763 | if (oldval) | 751 | if (oldval) |
| 764 | trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); | 752 | trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); |
| 765 | else | 753 | else |
| 766 | rcu_eqs_exit_common(rdtp, oldval, true); | 754 | rcu_eqs_exit_common(oldval, true); |
| 767 | rcu_sysidle_exit(rdtp, 1); | 755 | rcu_sysidle_exit(1); |
| 768 | local_irq_restore(flags); | 756 | local_irq_restore(flags); |
| 769 | } | 757 | } |
| 770 | 758 | ||
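A pattern repeated throughout these tree.c hunks: instead of threading a struct rcu_dynticks * (or a cpu number) through the call chain, each function now fetches its own per-CPU state with this_cpu_ptr() at the point of use. That is only safe because these paths run with interrupts disabled, so the task cannot migrate between the lookup and the use. A generic sketch of the idiom, using a made-up per-CPU structure:

    #include <linux/percpu.h>

    struct demo_dynticks {                      /* illustrative per-CPU state */
        long long nesting;
    };
    static DEFINE_PER_CPU(struct demo_dynticks, demo_dynticks);

    /* Caller must have interrupts (or at least preemption) disabled. */
    static void demo_eqs_enter(void)
    {
        struct demo_dynticks *rdtp = this_cpu_ptr(&demo_dynticks);

        rdtp->nesting = 0;
    }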
| @@ -2387,7 +2375,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2387 | * invoked from the scheduling-clock interrupt. If rcu_pending returns | 2375 | * invoked from the scheduling-clock interrupt. If rcu_pending returns |
| 2388 | * false, there is no point in invoking rcu_check_callbacks(). | 2376 | * false, there is no point in invoking rcu_check_callbacks(). |
| 2389 | */ | 2377 | */ |
| 2390 | void rcu_check_callbacks(int cpu, int user) | 2378 | void rcu_check_callbacks(int user) |
| 2391 | { | 2379 | { |
| 2392 | trace_rcu_utilization(TPS("Start scheduler-tick")); | 2380 | trace_rcu_utilization(TPS("Start scheduler-tick")); |
| 2393 | increment_cpu_stall_ticks(); | 2381 | increment_cpu_stall_ticks(); |
| @@ -2419,8 +2407,8 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 2419 | 2407 | ||
| 2420 | rcu_bh_qs(); | 2408 | rcu_bh_qs(); |
| 2421 | } | 2409 | } |
| 2422 | rcu_preempt_check_callbacks(cpu); | 2410 | rcu_preempt_check_callbacks(); |
| 2423 | if (rcu_pending(cpu)) | 2411 | if (rcu_pending()) |
| 2424 | invoke_rcu_core(); | 2412 | invoke_rcu_core(); |
| 2425 | if (user) | 2413 | if (user) |
| 2426 | rcu_note_voluntary_context_switch(current); | 2414 | rcu_note_voluntary_context_switch(current); |
| @@ -2963,6 +2951,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data) | |||
| 2963 | */ | 2951 | */ |
| 2964 | void synchronize_sched_expedited(void) | 2952 | void synchronize_sched_expedited(void) |
| 2965 | { | 2953 | { |
| 2954 | cpumask_var_t cm; | ||
| 2955 | bool cma = false; | ||
| 2956 | int cpu; | ||
| 2966 | long firstsnap, s, snap; | 2957 | long firstsnap, s, snap; |
| 2967 | int trycount = 0; | 2958 | int trycount = 0; |
| 2968 | struct rcu_state *rsp = &rcu_sched_state; | 2959 | struct rcu_state *rsp = &rcu_sched_state; |
| @@ -2997,11 +2988,26 @@ void synchronize_sched_expedited(void) | |||
| 2997 | } | 2988 | } |
| 2998 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); | 2989 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); |
| 2999 | 2990 | ||
| 2991 | /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ | ||
| 2992 | cma = zalloc_cpumask_var(&cm, GFP_KERNEL); | ||
| 2993 | if (cma) { | ||
| 2994 | cpumask_copy(cm, cpu_online_mask); | ||
| 2995 | cpumask_clear_cpu(raw_smp_processor_id(), cm); | ||
| 2996 | for_each_cpu(cpu, cm) { | ||
| 2997 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
| 2998 | |||
| 2999 | if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | ||
| 3000 | cpumask_clear_cpu(cpu, cm); | ||
| 3001 | } | ||
| 3002 | if (cpumask_weight(cm) == 0) | ||
| 3003 | goto all_cpus_idle; | ||
| 3004 | } | ||
| 3005 | |||
| 3000 | /* | 3006 | /* |
| 3001 | * Each pass through the following loop attempts to force a | 3007 | * Each pass through the following loop attempts to force a |
| 3002 | * context switch on each CPU. | 3008 | * context switch on each CPU. |
| 3003 | */ | 3009 | */ |
| 3004 | while (try_stop_cpus(cpu_online_mask, | 3010 | while (try_stop_cpus(cma ? cm : cpu_online_mask, |
| 3005 | synchronize_sched_expedited_cpu_stop, | 3011 | synchronize_sched_expedited_cpu_stop, |
| 3006 | NULL) == -EAGAIN) { | 3012 | NULL) == -EAGAIN) { |
| 3007 | put_online_cpus(); | 3013 | put_online_cpus(); |
| @@ -3013,6 +3019,7 @@ void synchronize_sched_expedited(void) | |||
| 3013 | /* ensure test happens before caller kfree */ | 3019 | /* ensure test happens before caller kfree */ |
| 3014 | smp_mb__before_atomic(); /* ^^^ */ | 3020 | smp_mb__before_atomic(); /* ^^^ */ |
| 3015 | atomic_long_inc(&rsp->expedited_workdone1); | 3021 | atomic_long_inc(&rsp->expedited_workdone1); |
| 3022 | free_cpumask_var(cm); | ||
| 3016 | return; | 3023 | return; |
| 3017 | } | 3024 | } |
| 3018 | 3025 | ||
| @@ -3022,6 +3029,7 @@ void synchronize_sched_expedited(void) | |||
| 3022 | } else { | 3029 | } else { |
| 3023 | wait_rcu_gp(call_rcu_sched); | 3030 | wait_rcu_gp(call_rcu_sched); |
| 3024 | atomic_long_inc(&rsp->expedited_normal); | 3031 | atomic_long_inc(&rsp->expedited_normal); |
| 3032 | free_cpumask_var(cm); | ||
| 3025 | return; | 3033 | return; |
| 3026 | } | 3034 | } |
| 3027 | 3035 | ||
| @@ -3031,6 +3039,7 @@ void synchronize_sched_expedited(void) | |||
| 3031 | /* ensure test happens before caller kfree */ | 3039 | /* ensure test happens before caller kfree */ |
| 3032 | smp_mb__before_atomic(); /* ^^^ */ | 3040 | smp_mb__before_atomic(); /* ^^^ */ |
| 3033 | atomic_long_inc(&rsp->expedited_workdone2); | 3041 | atomic_long_inc(&rsp->expedited_workdone2); |
| 3042 | free_cpumask_var(cm); | ||
| 3034 | return; | 3043 | return; |
| 3035 | } | 3044 | } |
| 3036 | 3045 | ||
| @@ -3045,6 +3054,7 @@ void synchronize_sched_expedited(void) | |||
| 3045 | /* CPU hotplug operation in flight, use normal GP. */ | 3054 | /* CPU hotplug operation in flight, use normal GP. */ |
| 3046 | wait_rcu_gp(call_rcu_sched); | 3055 | wait_rcu_gp(call_rcu_sched); |
| 3047 | atomic_long_inc(&rsp->expedited_normal); | 3056 | atomic_long_inc(&rsp->expedited_normal); |
| 3057 | free_cpumask_var(cm); | ||
| 3048 | return; | 3058 | return; |
| 3049 | } | 3059 | } |
| 3050 | snap = atomic_long_read(&rsp->expedited_start); | 3060 | snap = atomic_long_read(&rsp->expedited_start); |
| @@ -3052,6 +3062,9 @@ void synchronize_sched_expedited(void) | |||
| 3052 | } | 3062 | } |
| 3053 | atomic_long_inc(&rsp->expedited_stoppedcpus); | 3063 | atomic_long_inc(&rsp->expedited_stoppedcpus); |
| 3054 | 3064 | ||
| 3065 | all_cpus_idle: | ||
| 3066 | free_cpumask_var(cm); | ||
| 3067 | |||
| 3055 | /* | 3068 | /* |
| 3056 | * Everyone up to our most recent fetch is covered by our grace | 3069 | * Everyone up to our most recent fetch is covered by our grace |
| 3057 | * period. Update the counter, but only if our work is still | 3070 | * period. Update the counter, but only if our work is still |
| @@ -3143,12 +3156,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 3143 | * by the current CPU, returning 1 if so. This function is part of the | 3156 | * by the current CPU, returning 1 if so. This function is part of the |
| 3144 | * RCU implementation; it is -not- an exported member of the RCU API. | 3157 | * RCU implementation; it is -not- an exported member of the RCU API. |
| 3145 | */ | 3158 | */ |
| 3146 | static int rcu_pending(int cpu) | 3159 | static int rcu_pending(void) |
| 3147 | { | 3160 | { |
| 3148 | struct rcu_state *rsp; | 3161 | struct rcu_state *rsp; |
| 3149 | 3162 | ||
| 3150 | for_each_rcu_flavor(rsp) | 3163 | for_each_rcu_flavor(rsp) |
| 3151 | if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) | 3164 | if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda))) |
| 3152 | return 1; | 3165 | return 1; |
| 3153 | return 0; | 3166 | return 0; |
| 3154 | } | 3167 | } |
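The synchronize_sched_expedited() hunks above stop sending stop-work to CPUs that cannot be in an RCU-sched read-side critical section: a CPU whose ->dynticks counter is even is idle, so it is dropped from the mask before try_stop_cpus(). The sketch below isolates that filter; the per-CPU demo_dynticks counter is a stand-in for the real rcu_dynticks.dynticks field.

    #include <linux/atomic.h>
    #include <linux/cpumask.h>
    #include <linux/percpu.h>
    #include <linux/smp.h>

    static DEFINE_PER_CPU(atomic_t, demo_dynticks);    /* even = idle, odd = non-idle */

    /* Remove the local CPU and every idle CPU from 'cm'; atomic_add_return(0, ...)
     * is used as a fully ordered read of the counter. */
    static void drop_quiescent_cpus(struct cpumask *cm)
    {
        int cpu;

        cpumask_copy(cm, cpu_online_mask);
        cpumask_clear_cpu(raw_smp_processor_id(), cm);  /* this CPU is quiescent */
        for_each_cpu(cpu, cm)
            if (!(atomic_add_return(0, &per_cpu(demo_dynticks, cpu)) & 0x1))
                cpumask_clear_cpu(cpu, cm);
    }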
| @@ -3158,7 +3171,7 @@ static int rcu_pending(int cpu) | |||
| 3158 | * non-NULL, store an indication of whether all callbacks are lazy. | 3171 | * non-NULL, store an indication of whether all callbacks are lazy. |
| 3159 | * (If there are no callbacks, all of them are deemed to be lazy.) | 3172 | * (If there are no callbacks, all of them are deemed to be lazy.) |
| 3160 | */ | 3173 | */ |
| 3161 | static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | 3174 | static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) |
| 3162 | { | 3175 | { |
| 3163 | bool al = true; | 3176 | bool al = true; |
| 3164 | bool hc = false; | 3177 | bool hc = false; |
| @@ -3166,7 +3179,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | |||
| 3166 | struct rcu_state *rsp; | 3179 | struct rcu_state *rsp; |
| 3167 | 3180 | ||
| 3168 | for_each_rcu_flavor(rsp) { | 3181 | for_each_rcu_flavor(rsp) { |
| 3169 | rdp = per_cpu_ptr(rsp->rda, cpu); | 3182 | rdp = this_cpu_ptr(rsp->rda); |
| 3170 | if (!rdp->nxtlist) | 3183 | if (!rdp->nxtlist) |
| 3171 | continue; | 3184 | continue; |
| 3172 | hc = true; | 3185 | hc = true; |
| @@ -3485,8 +3498,10 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 3485 | case CPU_DEAD_FROZEN: | 3498 | case CPU_DEAD_FROZEN: |
| 3486 | case CPU_UP_CANCELED: | 3499 | case CPU_UP_CANCELED: |
| 3487 | case CPU_UP_CANCELED_FROZEN: | 3500 | case CPU_UP_CANCELED_FROZEN: |
| 3488 | for_each_rcu_flavor(rsp) | 3501 | for_each_rcu_flavor(rsp) { |
| 3489 | rcu_cleanup_dead_cpu(cpu, rsp); | 3502 | rcu_cleanup_dead_cpu(cpu, rsp); |
| 3503 | do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); | ||
| 3504 | } | ||
| 3490 | break; | 3505 | break; |
| 3491 | default: | 3506 | default: |
| 3492 | break; | 3507 | break; |
| @@ -3766,6 +3781,8 @@ void __init rcu_init(void) | |||
| 3766 | pm_notifier(rcu_pm_notify, 0); | 3781 | pm_notifier(rcu_pm_notify, 0); |
| 3767 | for_each_online_cpu(cpu) | 3782 | for_each_online_cpu(cpu) |
| 3768 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 3783 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
| 3784 | |||
| 3785 | rcu_early_boot_tests(); | ||
| 3769 | } | 3786 | } |
| 3770 | 3787 | ||
| 3771 | #include "tree_plugin.h" | 3788 | #include "tree_plugin.h" |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bbdc45d8d74f..8e7b1843896e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -139,7 +139,7 @@ struct rcu_node { | |||
| 139 | unsigned long expmask; /* Groups that have ->blkd_tasks */ | 139 | unsigned long expmask; /* Groups that have ->blkd_tasks */ |
| 140 | /* elements that need to drain to allow the */ | 140 | /* elements that need to drain to allow the */ |
| 141 | /* current expedited grace period to */ | 141 | /* current expedited grace period to */ |
| 142 | /* complete (only for TREE_PREEMPT_RCU). */ | 142 | /* complete (only for PREEMPT_RCU). */ |
| 143 | unsigned long qsmaskinit; | 143 | unsigned long qsmaskinit; |
| 144 | /* Per-GP initial value for qsmask & expmask. */ | 144 | /* Per-GP initial value for qsmask & expmask. */ |
| 145 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 145 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ |
| @@ -530,10 +530,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); | |||
| 530 | extern struct rcu_state rcu_bh_state; | 530 | extern struct rcu_state rcu_bh_state; |
| 531 | DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); | 531 | DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); |
| 532 | 532 | ||
| 533 | #ifdef CONFIG_TREE_PREEMPT_RCU | 533 | #ifdef CONFIG_PREEMPT_RCU |
| 534 | extern struct rcu_state rcu_preempt_state; | 534 | extern struct rcu_state rcu_preempt_state; |
| 535 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | 535 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); |
| 536 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 536 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ |
| 537 | 537 | ||
| 538 | #ifdef CONFIG_RCU_BOOST | 538 | #ifdef CONFIG_RCU_BOOST |
| 539 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | 539 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); |
| @@ -547,7 +547,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
| 547 | /* Forward declarations for rcutree_plugin.h */ | 547 | /* Forward declarations for rcutree_plugin.h */ |
| 548 | static void rcu_bootup_announce(void); | 548 | static void rcu_bootup_announce(void); |
| 549 | long rcu_batches_completed(void); | 549 | long rcu_batches_completed(void); |
| 550 | static void rcu_preempt_note_context_switch(int cpu); | 550 | static void rcu_preempt_note_context_switch(void); |
| 551 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 551 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
| 552 | #ifdef CONFIG_HOTPLUG_CPU | 552 | #ifdef CONFIG_HOTPLUG_CPU |
| 553 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 553 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
| @@ -561,12 +561,12 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 561 | struct rcu_node *rnp, | 561 | struct rcu_node *rnp, |
| 562 | struct rcu_data *rdp); | 562 | struct rcu_data *rdp); |
| 563 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 563 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 564 | static void rcu_preempt_check_callbacks(int cpu); | 564 | static void rcu_preempt_check_callbacks(void); |
| 565 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 565 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
| 566 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) | 566 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) |
| 567 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | 567 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
| 568 | bool wake); | 568 | bool wake); |
| 569 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ | 569 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */ |
| 570 | static void __init __rcu_init_preempt(void); | 570 | static void __init __rcu_init_preempt(void); |
| 571 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 571 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
| 572 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 572 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
| @@ -579,8 +579,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
| 579 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 579 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 580 | static void __init rcu_spawn_boost_kthreads(void); | 580 | static void __init rcu_spawn_boost_kthreads(void); |
| 581 | static void rcu_prepare_kthreads(int cpu); | 581 | static void rcu_prepare_kthreads(int cpu); |
| 582 | static void rcu_cleanup_after_idle(int cpu); | 582 | static void rcu_cleanup_after_idle(void); |
| 583 | static void rcu_prepare_for_idle(int cpu); | 583 | static void rcu_prepare_for_idle(void); |
| 584 | static void rcu_idle_count_callbacks_posted(void); | 584 | static void rcu_idle_count_callbacks_posted(void); |
| 585 | static void print_cpu_stall_info_begin(void); | 585 | static void print_cpu_stall_info_begin(void); |
| 586 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 586 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); |
| @@ -606,8 +606,8 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); | |||
| 606 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 606 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
| 607 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu); | 607 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu); |
| 608 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 608 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
| 609 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); | 609 | static void rcu_sysidle_enter(int irq); |
| 610 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); | 610 | static void rcu_sysidle_exit(int irq); |
| 611 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | 611 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, |
| 612 | unsigned long *maxj); | 612 | unsigned long *maxj); |
| 613 | static bool is_sysidle_rcu_state(struct rcu_state *rsp); | 613 | static bool is_sysidle_rcu_state(struct rcu_state *rsp); |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c1d7f27bd38f..3ec85cb5d544 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -30,14 +30,24 @@ | |||
| 30 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
| 31 | #include "../time/tick-internal.h" | 31 | #include "../time/tick-internal.h" |
| 32 | 32 | ||
| 33 | #define RCU_KTHREAD_PRIO 1 | ||
| 34 | |||
| 35 | #ifdef CONFIG_RCU_BOOST | 33 | #ifdef CONFIG_RCU_BOOST |
| 34 | |||
| 36 | #include "../locking/rtmutex_common.h" | 35 | #include "../locking/rtmutex_common.h" |
| 37 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | 36 | |
| 38 | #else | 37 | /* rcuc/rcub kthread realtime priority */ |
| 39 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO | 38 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; |
| 40 | #endif | 39 | module_param(kthread_prio, int, 0644); |
| 40 | |||
| 41 | /* | ||
| 42 | * Control variables for per-CPU and per-rcu_node kthreads. These | ||
| 43 | * handle all flavors of RCU. | ||
| 44 | */ | ||
| 45 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | ||
| 46 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
| 47 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
| 48 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | ||
| 49 | |||
| 50 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 41 | 51 | ||
| 42 | #ifdef CONFIG_RCU_NOCB_CPU | 52 | #ifdef CONFIG_RCU_NOCB_CPU |
| 43 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ | 53 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ |
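The first tree_plugin.h hunk replaces the compile-time RCU_KTHREAD_PRIO / RCU_BOOST_PRIO macros with a kthread_prio variable exposed through module_param(), so the SCHED_FIFO priority of the rcuc/rcub kthreads is seeded from CONFIG_RCU_KTHREAD_PRIO but becomes a boot-time tunable (and, with mode 0644, readable at runtime). The generic idiom, with illustrative names, looks like this:

    #include <linux/moduleparam.h>
    #include <linux/sched.h>

    static int demo_prio = 1;               /* stand-in for the Kconfig default */
    module_param(demo_prio, int, 0644);     /* e.g. settable as <modname>.demo_prio=N at boot */

    /* Apply the chosen priority to the current kthread. */
    static void demo_set_prio(void)
    {
        struct sched_param sp = { .sched_priority = demo_prio };

        sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
    }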
| @@ -72,9 +82,6 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 72 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | 82 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE |
| 73 | pr_info("\tRCU torture testing starts during boot.\n"); | 83 | pr_info("\tRCU torture testing starts during boot.\n"); |
| 74 | #endif | 84 | #endif |
| 75 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) | ||
| 76 | pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); | ||
| 77 | #endif | ||
| 78 | #if defined(CONFIG_RCU_CPU_STALL_INFO) | 85 | #if defined(CONFIG_RCU_CPU_STALL_INFO) |
| 79 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); | 86 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); |
| 80 | #endif | 87 | #endif |
| @@ -85,9 +92,12 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 85 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 92 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); |
| 86 | if (nr_cpu_ids != NR_CPUS) | 93 | if (nr_cpu_ids != NR_CPUS) |
| 87 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 94 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); |
| 95 | #ifdef CONFIG_RCU_BOOST | ||
| 96 | pr_info("\tRCU kthread priority: %d.\n", kthread_prio); | ||
| 97 | #endif | ||
| 88 | } | 98 | } |
| 89 | 99 | ||
| 90 | #ifdef CONFIG_TREE_PREEMPT_RCU | 100 | #ifdef CONFIG_PREEMPT_RCU |
| 91 | 101 | ||
| 92 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | 102 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); |
| 93 | static struct rcu_state *rcu_state_p = &rcu_preempt_state; | 103 | static struct rcu_state *rcu_state_p = &rcu_preempt_state; |
| @@ -156,7 +166,7 @@ static void rcu_preempt_qs(void) | |||
| 156 | * | 166 | * |
| 157 | * Caller must disable preemption. | 167 | * Caller must disable preemption. |
| 158 | */ | 168 | */ |
| 159 | static void rcu_preempt_note_context_switch(int cpu) | 169 | static void rcu_preempt_note_context_switch(void) |
| 160 | { | 170 | { |
| 161 | struct task_struct *t = current; | 171 | struct task_struct *t = current; |
| 162 | unsigned long flags; | 172 | unsigned long flags; |
| @@ -167,7 +177,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 167 | !t->rcu_read_unlock_special.b.blocked) { | 177 | !t->rcu_read_unlock_special.b.blocked) { |
| 168 | 178 | ||
| 169 | /* Possibly blocking in an RCU read-side critical section. */ | 179 | /* Possibly blocking in an RCU read-side critical section. */ |
| 170 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); | 180 | rdp = this_cpu_ptr(rcu_preempt_state.rda); |
| 171 | rnp = rdp->mynode; | 181 | rnp = rdp->mynode; |
| 172 | raw_spin_lock_irqsave(&rnp->lock, flags); | 182 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 173 | smp_mb__after_unlock_lock(); | 183 | smp_mb__after_unlock_lock(); |
| @@ -415,8 +425,6 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 415 | } | 425 | } |
| 416 | } | 426 | } |
| 417 | 427 | ||
| 418 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | ||
| 419 | |||
| 420 | /* | 428 | /* |
| 421 | * Dump detailed information for all tasks blocking the current RCU | 429 | * Dump detailed information for all tasks blocking the current RCU |
| 422 | * grace period on the specified rcu_node structure. | 430 | * grace period on the specified rcu_node structure. |
| @@ -451,14 +459,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
| 451 | rcu_print_detail_task_stall_rnp(rnp); | 459 | rcu_print_detail_task_stall_rnp(rnp); |
| 452 | } | 460 | } |
| 453 | 461 | ||
| 454 | #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
| 455 | |||
| 456 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
| 457 | { | ||
| 458 | } | ||
| 459 | |||
| 460 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
| 461 | |||
| 462 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 462 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
| 463 | 463 | ||
| 464 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | 464 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) |
| @@ -621,7 +621,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 621 | * | 621 | * |
| 622 | * Caller must disable hard irqs. | 622 | * Caller must disable hard irqs. |
| 623 | */ | 623 | */ |
| 624 | static void rcu_preempt_check_callbacks(int cpu) | 624 | static void rcu_preempt_check_callbacks(void) |
| 625 | { | 625 | { |
| 626 | struct task_struct *t = current; | 626 | struct task_struct *t = current; |
| 627 | 627 | ||
| @@ -630,8 +630,8 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
| 630 | return; | 630 | return; |
| 631 | } | 631 | } |
| 632 | if (t->rcu_read_lock_nesting > 0 && | 632 | if (t->rcu_read_lock_nesting > 0 && |
| 633 | per_cpu(rcu_preempt_data, cpu).qs_pending && | 633 | __this_cpu_read(rcu_preempt_data.qs_pending) && |
| 634 | !per_cpu(rcu_preempt_data, cpu).passed_quiesce) | 634 | !__this_cpu_read(rcu_preempt_data.passed_quiesce)) |
| 635 | t->rcu_read_unlock_special.b.need_qs = true; | 635 | t->rcu_read_unlock_special.b.need_qs = true; |
| 636 | } | 636 | } |
| 637 | 637 | ||
| @@ -919,7 +919,7 @@ void exit_rcu(void) | |||
| 919 | __rcu_read_unlock(); | 919 | __rcu_read_unlock(); |
| 920 | } | 920 | } |
| 921 | 921 | ||
| 922 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 922 | #else /* #ifdef CONFIG_PREEMPT_RCU */ |
| 923 | 923 | ||
| 924 | static struct rcu_state *rcu_state_p = &rcu_sched_state; | 924 | static struct rcu_state *rcu_state_p = &rcu_sched_state; |
| 925 | 925 | ||
| @@ -945,7 +945,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); | |||
| 945 | * Because preemptible RCU does not exist, we never have to check for | 945 | * Because preemptible RCU does not exist, we never have to check for |
| 946 | * CPUs being in quiescent states. | 946 | * CPUs being in quiescent states. |
| 947 | */ | 947 | */ |
| 948 | static void rcu_preempt_note_context_switch(int cpu) | 948 | static void rcu_preempt_note_context_switch(void) |
| 949 | { | 949 | { |
| 950 | } | 950 | } |
| 951 | 951 | ||
| @@ -1017,7 +1017,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 1017 | * Because preemptible RCU does not exist, it never has any callbacks | 1017 | * Because preemptible RCU does not exist, it never has any callbacks |
| 1018 | * to check. | 1018 | * to check. |
| 1019 | */ | 1019 | */ |
| 1020 | static void rcu_preempt_check_callbacks(int cpu) | 1020 | static void rcu_preempt_check_callbacks(void) |
| 1021 | { | 1021 | { |
| 1022 | } | 1022 | } |
| 1023 | 1023 | ||
| @@ -1070,7 +1070,7 @@ void exit_rcu(void) | |||
| 1070 | { | 1070 | { |
| 1071 | } | 1071 | } |
| 1072 | 1072 | ||
| 1073 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1073 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ |
| 1074 | 1074 | ||
| 1075 | #ifdef CONFIG_RCU_BOOST | 1075 | #ifdef CONFIG_RCU_BOOST |
| 1076 | 1076 | ||
| @@ -1326,7 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
| 1326 | smp_mb__after_unlock_lock(); | 1326 | smp_mb__after_unlock_lock(); |
| 1327 | rnp->boost_kthread_task = t; | 1327 | rnp->boost_kthread_task = t; |
| 1328 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1328 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1329 | sp.sched_priority = RCU_BOOST_PRIO; | 1329 | sp.sched_priority = kthread_prio; |
| 1330 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | 1330 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); |
| 1331 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | 1331 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ |
| 1332 | return 0; | 1332 | return 0; |
| @@ -1343,7 +1343,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) | |||
| 1343 | { | 1343 | { |
| 1344 | struct sched_param sp; | 1344 | struct sched_param sp; |
| 1345 | 1345 | ||
| 1346 | sp.sched_priority = RCU_KTHREAD_PRIO; | 1346 | sp.sched_priority = kthread_prio; |
| 1347 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | 1347 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); |
| 1348 | } | 1348 | } |
| 1349 | 1349 | ||
| @@ -1512,10 +1512,10 @@ static void rcu_prepare_kthreads(int cpu) | |||
| 1512 | * any flavor of RCU. | 1512 | * any flavor of RCU. |
| 1513 | */ | 1513 | */ |
| 1514 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1514 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
| 1515 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | 1515 | int rcu_needs_cpu(unsigned long *delta_jiffies) |
| 1516 | { | 1516 | { |
| 1517 | *delta_jiffies = ULONG_MAX; | 1517 | *delta_jiffies = ULONG_MAX; |
| 1518 | return rcu_cpu_has_callbacks(cpu, NULL); | 1518 | return rcu_cpu_has_callbacks(NULL); |
| 1519 | } | 1519 | } |
| 1520 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ | 1520 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ |
| 1521 | 1521 | ||
| @@ -1523,7 +1523,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | |||
| 1523 | * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up | 1523 | * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up |
| 1524 | * after it. | 1524 | * after it. |
| 1525 | */ | 1525 | */ |
| 1526 | static void rcu_cleanup_after_idle(int cpu) | 1526 | static void rcu_cleanup_after_idle(void) |
| 1527 | { | 1527 | { |
| 1528 | } | 1528 | } |
| 1529 | 1529 | ||
| @@ -1531,7 +1531,7 @@ static void rcu_cleanup_after_idle(int cpu) | |||
| 1531 | * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, | 1531 | * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, |
| 1532 | * is nothing. | 1532 | * is nothing. |
| 1533 | */ | 1533 | */ |
| 1534 | static void rcu_prepare_for_idle(int cpu) | 1534 | static void rcu_prepare_for_idle(void) |
| 1535 | { | 1535 | { |
| 1536 | } | 1536 | } |
| 1537 | 1537 | ||
| @@ -1624,15 +1624,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) | |||
| 1624 | * The caller must have disabled interrupts. | 1624 | * The caller must have disabled interrupts. |
| 1625 | */ | 1625 | */ |
| 1626 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1626 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
| 1627 | int rcu_needs_cpu(int cpu, unsigned long *dj) | 1627 | int rcu_needs_cpu(unsigned long *dj) |
| 1628 | { | 1628 | { |
| 1629 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1629 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 1630 | 1630 | ||
| 1631 | /* Snapshot to detect later posting of non-lazy callback. */ | 1631 | /* Snapshot to detect later posting of non-lazy callback. */ |
| 1632 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | 1632 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; |
| 1633 | 1633 | ||
| 1634 | /* If no callbacks, RCU doesn't need the CPU. */ | 1634 | /* If no callbacks, RCU doesn't need the CPU. */ |
| 1635 | if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { | 1635 | if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { |
| 1636 | *dj = ULONG_MAX; | 1636 | *dj = ULONG_MAX; |
| 1637 | return 0; | 1637 | return 0; |
| 1638 | } | 1638 | } |
| @@ -1666,12 +1666,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) | |||
| 1666 | * | 1666 | * |
| 1667 | * The caller must have disabled interrupts. | 1667 | * The caller must have disabled interrupts. |
| 1668 | */ | 1668 | */ |
| 1669 | static void rcu_prepare_for_idle(int cpu) | 1669 | static void rcu_prepare_for_idle(void) |
| 1670 | { | 1670 | { |
| 1671 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1671 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
| 1672 | bool needwake; | 1672 | bool needwake; |
| 1673 | struct rcu_data *rdp; | 1673 | struct rcu_data *rdp; |
| 1674 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1674 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 1675 | struct rcu_node *rnp; | 1675 | struct rcu_node *rnp; |
| 1676 | struct rcu_state *rsp; | 1676 | struct rcu_state *rsp; |
| 1677 | int tne; | 1677 | int tne; |
| @@ -1679,7 +1679,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1679 | /* Handle nohz enablement switches conservatively. */ | 1679 | /* Handle nohz enablement switches conservatively. */ |
| 1680 | tne = ACCESS_ONCE(tick_nohz_active); | 1680 | tne = ACCESS_ONCE(tick_nohz_active); |
| 1681 | if (tne != rdtp->tick_nohz_enabled_snap) { | 1681 | if (tne != rdtp->tick_nohz_enabled_snap) { |
| 1682 | if (rcu_cpu_has_callbacks(cpu, NULL)) | 1682 | if (rcu_cpu_has_callbacks(NULL)) |
| 1683 | invoke_rcu_core(); /* force nohz to see update. */ | 1683 | invoke_rcu_core(); /* force nohz to see update. */ |
| 1684 | rdtp->tick_nohz_enabled_snap = tne; | 1684 | rdtp->tick_nohz_enabled_snap = tne; |
| 1685 | return; | 1685 | return; |
| @@ -1688,7 +1688,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1688 | return; | 1688 | return; |
| 1689 | 1689 | ||
| 1690 | /* If this is a no-CBs CPU, no callbacks, just return. */ | 1690 | /* If this is a no-CBs CPU, no callbacks, just return. */ |
| 1691 | if (rcu_is_nocb_cpu(cpu)) | 1691 | if (rcu_is_nocb_cpu(smp_processor_id())) |
| 1692 | return; | 1692 | return; |
| 1693 | 1693 | ||
| 1694 | /* | 1694 | /* |
| @@ -1712,7 +1712,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1712 | return; | 1712 | return; |
| 1713 | rdtp->last_accelerate = jiffies; | 1713 | rdtp->last_accelerate = jiffies; |
| 1714 | for_each_rcu_flavor(rsp) { | 1714 | for_each_rcu_flavor(rsp) { |
| 1715 | rdp = per_cpu_ptr(rsp->rda, cpu); | 1715 | rdp = this_cpu_ptr(rsp->rda); |
| 1716 | if (!*rdp->nxttail[RCU_DONE_TAIL]) | 1716 | if (!*rdp->nxttail[RCU_DONE_TAIL]) |
| 1717 | continue; | 1717 | continue; |
| 1718 | rnp = rdp->mynode; | 1718 | rnp = rdp->mynode; |
| @@ -1731,10 +1731,10 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1731 | * any grace periods that elapsed while the CPU was idle, and if any | 1731 | * any grace periods that elapsed while the CPU was idle, and if any |
| 1732 | * callbacks are now ready to invoke, initiate invocation. | 1732 | * callbacks are now ready to invoke, initiate invocation. |
| 1733 | */ | 1733 | */ |
| 1734 | static void rcu_cleanup_after_idle(int cpu) | 1734 | static void rcu_cleanup_after_idle(void) |
| 1735 | { | 1735 | { |
| 1736 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1736 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
| 1737 | if (rcu_is_nocb_cpu(cpu)) | 1737 | if (rcu_is_nocb_cpu(smp_processor_id())) |
| 1738 | return; | 1738 | return; |
| 1739 | if (rcu_try_advance_all_cbs()) | 1739 | if (rcu_try_advance_all_cbs()) |
| 1740 | invoke_rcu_core(); | 1740 | invoke_rcu_core(); |
| @@ -2573,9 +2573,13 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) | |||
| 2573 | rdp->nocb_leader = rdp_spawn; | 2573 | rdp->nocb_leader = rdp_spawn; |
| 2574 | if (rdp_last && rdp != rdp_spawn) | 2574 | if (rdp_last && rdp != rdp_spawn) |
| 2575 | rdp_last->nocb_next_follower = rdp; | 2575 | rdp_last->nocb_next_follower = rdp; |
| 2576 | rdp_last = rdp; | 2576 | if (rdp == rdp_spawn) { |
| 2577 | rdp = rdp->nocb_next_follower; | 2577 | rdp = rdp->nocb_next_follower; |
| 2578 | rdp_last->nocb_next_follower = NULL; | 2578 | } else { |
| 2579 | rdp_last = rdp; | ||
| 2580 | rdp = rdp->nocb_next_follower; | ||
| 2581 | rdp_last->nocb_next_follower = NULL; | ||
| 2582 | } | ||
| 2579 | } while (rdp); | 2583 | } while (rdp); |
| 2580 | rdp_spawn->nocb_next_follower = rdp_old_leader; | 2584 | rdp_spawn->nocb_next_follower = rdp_old_leader; |
| 2581 | } | 2585 | } |
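The rcu_spawn_one_nocb_kthread() hunk above fixes the follower-list rebuild around a freshly designated leader: the old loop body also "unlinked" the leader itself when the walk reached it, clearing its ->nocb_next_follower and corrupting the chain being rebuilt. The fix steps over the leader without touching its next pointer. A self-contained sketch of the same idea on a plain singly linked list (all names are illustrative):

    struct demo_node {
        struct demo_node *next;
    };

    /* Re-chain the old list's nodes one behind another, skipping 'leader'
     * without modifying leader->next (the caller re-points that afterwards). */
    static void demo_rebuild(struct demo_node *old_head, struct demo_node *leader)
    {
        struct demo_node *n = old_head, *last = NULL;

        while (n) {
            if (n == leader) {
                n = n->next;                /* just step over it */
            } else {
                if (last)
                    last->next = n;         /* append behind the previous follower */
                last = n;
                n = n->next;
                last->next = NULL;          /* detach from the old chain */
            }
        }
    }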
| @@ -2761,9 +2765,10 @@ static int full_sysidle_state; /* Current system-idle state. */ | |||
| 2761 | * to detect full-system idle states, not RCU quiescent states and grace | 2765 | * to detect full-system idle states, not RCU quiescent states and grace |
| 2762 | * periods. The caller must have disabled interrupts. | 2766 | * periods. The caller must have disabled interrupts. |
| 2763 | */ | 2767 | */ |
| 2764 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | 2768 | static void rcu_sysidle_enter(int irq) |
| 2765 | { | 2769 | { |
| 2766 | unsigned long j; | 2770 | unsigned long j; |
| 2771 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 2767 | 2772 | ||
| 2768 | /* If there are no nohz_full= CPUs, no need to track this. */ | 2773 | /* If there are no nohz_full= CPUs, no need to track this. */ |
| 2769 | if (!tick_nohz_full_enabled()) | 2774 | if (!tick_nohz_full_enabled()) |
| @@ -2832,8 +2837,10 @@ void rcu_sysidle_force_exit(void) | |||
| 2832 | * usermode execution does -not- count as idle here! The caller must | 2837 | * usermode execution does -not- count as idle here! The caller must |
| 2833 | * have disabled interrupts. | 2838 | * have disabled interrupts. |
| 2834 | */ | 2839 | */ |
| 2835 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | 2840 | static void rcu_sysidle_exit(int irq) |
| 2836 | { | 2841 | { |
| 2842 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 2843 | |||
| 2837 | /* If there are no nohz_full= CPUs, no need to track this. */ | 2844 | /* If there are no nohz_full= CPUs, no need to track this. */ |
| 2838 | if (!tick_nohz_full_enabled()) | 2845 | if (!tick_nohz_full_enabled()) |
| 2839 | return; | 2846 | return; |
| @@ -3127,11 +3134,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | |||
| 3127 | 3134 | ||
| 3128 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 3135 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ |
| 3129 | 3136 | ||
| 3130 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | 3137 | static void rcu_sysidle_enter(int irq) |
| 3131 | { | 3138 | { |
| 3132 | } | 3139 | } |
| 3133 | 3140 | ||
| 3134 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | 3141 | static void rcu_sysidle_exit(int irq) |
| 3135 | { | 3142 | { |
| 3136 | } | 3143 | } |
| 3137 | 3144 | ||
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3ef8ba58694e..e0d31a345ee6 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -306,7 +306,7 @@ struct debug_obj_descr rcuhead_debug_descr = { | |||
| 306 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | 306 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); |
| 307 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 307 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
| 308 | 308 | ||
| 309 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) | 309 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) |
| 310 | void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, | 310 | void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, |
| 311 | unsigned long secs, | 311 | unsigned long secs, |
| 312 | unsigned long c_old, unsigned long c) | 312 | unsigned long c_old, unsigned long c) |
| @@ -531,7 +531,8 @@ static int __noreturn rcu_tasks_kthread(void *arg) | |||
| 531 | struct rcu_head *next; | 531 | struct rcu_head *next; |
| 532 | LIST_HEAD(rcu_tasks_holdouts); | 532 | LIST_HEAD(rcu_tasks_holdouts); |
| 533 | 533 | ||
| 534 | /* FIXME: Add housekeeping affinity. */ | 534 | /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ |
| 535 | housekeeping_affine(current); | ||
| 535 | 536 | ||
| 536 | /* | 537 | /* |
| 537 | * Each pass through the following loop makes one check for | 538 | * Each pass through the following loop makes one check for |
| @@ -690,3 +691,87 @@ static void rcu_spawn_tasks_kthread(void) | |||
| 690 | } | 691 | } |
| 691 | 692 | ||
| 692 | #endif /* #ifdef CONFIG_TASKS_RCU */ | 693 | #endif /* #ifdef CONFIG_TASKS_RCU */ |
| 694 | |||
| 695 | #ifdef CONFIG_PROVE_RCU | ||
| 696 | |||
| 697 | /* | ||
| 698 | * Early boot self test parameters, one for each flavor | ||
| 699 | */ | ||
| 700 | static bool rcu_self_test; | ||
| 701 | static bool rcu_self_test_bh; | ||
| 702 | static bool rcu_self_test_sched; | ||
| 703 | |||
| 704 | module_param(rcu_self_test, bool, 0444); | ||
| 705 | module_param(rcu_self_test_bh, bool, 0444); | ||
| 706 | module_param(rcu_self_test_sched, bool, 0444); | ||
| 707 | |||
| 708 | static int rcu_self_test_counter; | ||
| 709 | |||
| 710 | static void test_callback(struct rcu_head *r) | ||
| 711 | { | ||
| 712 | rcu_self_test_counter++; | ||
| 713 | pr_info("RCU test callback executed %d\n", rcu_self_test_counter); | ||
| 714 | } | ||
| 715 | |||
| 716 | static void early_boot_test_call_rcu(void) | ||
| 717 | { | ||
| 718 | static struct rcu_head head; | ||
| 719 | |||
| 720 | call_rcu(&head, test_callback); | ||
| 721 | } | ||
| 722 | |||
| 723 | static void early_boot_test_call_rcu_bh(void) | ||
| 724 | { | ||
| 725 | static struct rcu_head head; | ||
| 726 | |||
| 727 | call_rcu_bh(&head, test_callback); | ||
| 728 | } | ||
| 729 | |||
| 730 | static void early_boot_test_call_rcu_sched(void) | ||
| 731 | { | ||
| 732 | static struct rcu_head head; | ||
| 733 | |||
| 734 | call_rcu_sched(&head, test_callback); | ||
| 735 | } | ||
| 736 | |||
| 737 | void rcu_early_boot_tests(void) | ||
| 738 | { | ||
| 739 | pr_info("Running RCU self tests\n"); | ||
| 740 | |||
| 741 | if (rcu_self_test) | ||
| 742 | early_boot_test_call_rcu(); | ||
| 743 | if (rcu_self_test_bh) | ||
| 744 | early_boot_test_call_rcu_bh(); | ||
| 745 | if (rcu_self_test_sched) | ||
| 746 | early_boot_test_call_rcu_sched(); | ||
| 747 | } | ||
| 748 | |||
| 749 | static int rcu_verify_early_boot_tests(void) | ||
| 750 | { | ||
| 751 | int ret = 0; | ||
| 752 | int early_boot_test_counter = 0; | ||
| 753 | |||
| 754 | if (rcu_self_test) { | ||
| 755 | early_boot_test_counter++; | ||
| 756 | rcu_barrier(); | ||
| 757 | } | ||
| 758 | if (rcu_self_test_bh) { | ||
| 759 | early_boot_test_counter++; | ||
| 760 | rcu_barrier_bh(); | ||
| 761 | } | ||
| 762 | if (rcu_self_test_sched) { | ||
| 763 | early_boot_test_counter++; | ||
| 764 | rcu_barrier_sched(); | ||
| 765 | } | ||
| 766 | |||
| 767 | if (rcu_self_test_counter != early_boot_test_counter) { | ||
| 768 | WARN_ON(1); | ||
| 769 | ret = -1; | ||
| 770 | } | ||
| 771 | |||
| 772 | return ret; | ||
| 773 | } | ||
| 774 | late_initcall(rcu_verify_early_boot_tests); | ||
| 775 | #else | ||
| 776 | void rcu_early_boot_tests(void) {} | ||
| 777 | #endif /* CONFIG_PROVE_RCU */ | ||
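The new CONFIG_PROVE_RCU block posts one test callback per enabled flavor from rcu_early_boot_tests() (now invoked from the rcu_init() paths patched earlier) and, in a late_initcall(), verifies after the matching rcu_barrier()/rcu_barrier_bh()/rcu_barrier_sched() that every posted callback actually ran. All three switches default to off and are read-only at runtime (mode 0444); since they live in update.c, enabling them would presumably be done on the kernel command line with the rcupdate. prefix, along the lines of:

    rcupdate.rcu_self_test=1 rcupdate.rcu_self_test_bh=1 rcupdate.rcu_self_test_sched=1

On success the console shows the "Running RCU self tests" banner plus one "RCU test callback executed N" line per enabled flavor; if the executed count does not match the number of enabled tests, rcu_verify_early_boot_tests() fires its WARN_ON() and returns -1.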
diff --git a/kernel/res_counter.c b/kernel/res_counter.c deleted file mode 100644 index e791130f85a7..000000000000 --- a/kernel/res_counter.c +++ /dev/null | |||
| @@ -1,211 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * resource cgroups | ||
| 3 | * | ||
| 4 | * Copyright 2007 OpenVZ SWsoft Inc | ||
| 5 | * | ||
| 6 | * Author: Pavel Emelianov <xemul@openvz.org> | ||
| 7 | * | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/types.h> | ||
| 11 | #include <linux/parser.h> | ||
| 12 | #include <linux/fs.h> | ||
| 13 | #include <linux/res_counter.h> | ||
| 14 | #include <linux/uaccess.h> | ||
| 15 | #include <linux/mm.h> | ||
| 16 | |||
| 17 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) | ||
| 18 | { | ||
| 19 | spin_lock_init(&counter->lock); | ||
| 20 | counter->limit = RES_COUNTER_MAX; | ||
| 21 | counter->soft_limit = RES_COUNTER_MAX; | ||
| 22 | counter->parent = parent; | ||
| 23 | } | ||
| 24 | |||
| 25 | static u64 res_counter_uncharge_locked(struct res_counter *counter, | ||
| 26 | unsigned long val) | ||
| 27 | { | ||
| 28 | if (WARN_ON(counter->usage < val)) | ||
| 29 | val = counter->usage; | ||
| 30 | |||
| 31 | counter->usage -= val; | ||
| 32 | return counter->usage; | ||
| 33 | } | ||
| 34 | |||
| 35 | static int res_counter_charge_locked(struct res_counter *counter, | ||
| 36 | unsigned long val, bool force) | ||
| 37 | { | ||
| 38 | int ret = 0; | ||
| 39 | |||
| 40 | if (counter->usage + val > counter->limit) { | ||
| 41 | counter->failcnt++; | ||
| 42 | ret = -ENOMEM; | ||
| 43 | if (!force) | ||
| 44 | return ret; | ||
| 45 | } | ||
| 46 | |||
| 47 | counter->usage += val; | ||
| 48 | if (counter->usage > counter->max_usage) | ||
| 49 | counter->max_usage = counter->usage; | ||
| 50 | return ret; | ||
| 51 | } | ||
| 52 | |||
| 53 | static int __res_counter_charge(struct res_counter *counter, unsigned long val, | ||
| 54 | struct res_counter **limit_fail_at, bool force) | ||
| 55 | { | ||
| 56 | int ret, r; | ||
| 57 | unsigned long flags; | ||
| 58 | struct res_counter *c, *u; | ||
| 59 | |||
| 60 | r = ret = 0; | ||
| 61 | *limit_fail_at = NULL; | ||
| 62 | local_irq_save(flags); | ||
| 63 | for (c = counter; c != NULL; c = c->parent) { | ||
| 64 | spin_lock(&c->lock); | ||
| 65 | r = res_counter_charge_locked(c, val, force); | ||
| 66 | spin_unlock(&c->lock); | ||
| 67 | if (r < 0 && !ret) { | ||
| 68 | ret = r; | ||
| 69 | *limit_fail_at = c; | ||
| 70 | if (!force) | ||
| 71 | break; | ||
| 72 | } | ||
| 73 | } | ||
| 74 | |||
| 75 | if (ret < 0 && !force) { | ||
| 76 | for (u = counter; u != c; u = u->parent) { | ||
| 77 | spin_lock(&u->lock); | ||
| 78 | res_counter_uncharge_locked(u, val); | ||
| 79 | spin_unlock(&u->lock); | ||
| 80 | } | ||
| 81 | } | ||
| 82 | local_irq_restore(flags); | ||
| 83 | |||
| 84 | return ret; | ||
| 85 | } | ||
| 86 | |||
| 87 | int res_counter_charge(struct res_counter *counter, unsigned long val, | ||
| 88 | struct res_counter **limit_fail_at) | ||
| 89 | { | ||
| 90 | return __res_counter_charge(counter, val, limit_fail_at, false); | ||
| 91 | } | ||
| 92 | |||
| 93 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | ||
| 94 | struct res_counter **limit_fail_at) | ||
| 95 | { | ||
| 96 | return __res_counter_charge(counter, val, limit_fail_at, true); | ||
| 97 | } | ||
| 98 | |||
| 99 | u64 res_counter_uncharge_until(struct res_counter *counter, | ||
| 100 | struct res_counter *top, | ||
| 101 | unsigned long val) | ||
| 102 | { | ||
| 103 | unsigned long flags; | ||
| 104 | struct res_counter *c; | ||
| 105 | u64 ret = 0; | ||
| 106 | |||
| 107 | local_irq_save(flags); | ||
| 108 | for (c = counter; c != top; c = c->parent) { | ||
| 109 | u64 r; | ||
| 110 | spin_lock(&c->lock); | ||
| 111 | r = res_counter_uncharge_locked(c, val); | ||
| 112 | if (c == counter) | ||
| 113 | ret = r; | ||
| 114 | spin_unlock(&c->lock); | ||
| 115 | } | ||
| 116 | local_irq_restore(flags); | ||
| 117 | return ret; | ||
| 118 | } | ||
| 119 | |||
| 120 | u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) | ||
| 121 | { | ||
| 122 | return res_counter_uncharge_until(counter, NULL, val); | ||
| 123 | } | ||
| 124 | |||
| 125 | static inline unsigned long long * | ||
| 126 | res_counter_member(struct res_counter *counter, int member) | ||
| 127 | { | ||
| 128 | switch (member) { | ||
| 129 | case RES_USAGE: | ||
| 130 | return &counter->usage; | ||
| 131 | case RES_MAX_USAGE: | ||
| 132 | return &counter->max_usage; | ||
| 133 | case RES_LIMIT: | ||
| 134 | return &counter->limit; | ||
| 135 | case RES_FAILCNT: | ||
| 136 | return &counter->failcnt; | ||
| 137 | case RES_SOFT_LIMIT: | ||
| 138 | return &counter->soft_limit; | ||
| 139 | }; | ||
| 140 | |||
| 141 | BUG(); | ||
| 142 | return NULL; | ||
| 143 | } | ||
| 144 | |||
| 145 | ssize_t res_counter_read(struct res_counter *counter, int member, | ||
| 146 | const char __user *userbuf, size_t nbytes, loff_t *pos, | ||
| 147 | int (*read_strategy)(unsigned long long val, char *st_buf)) | ||
| 148 | { | ||
| 149 | unsigned long long *val; | ||
| 150 | char buf[64], *s; | ||
| 151 | |||
| 152 | s = buf; | ||
| 153 | val = res_counter_member(counter, member); | ||
| 154 | if (read_strategy) | ||
| 155 | s += read_strategy(*val, s); | ||
| 156 | else | ||
| 157 | s += sprintf(s, "%llu\n", *val); | ||
| 158 | return simple_read_from_buffer((void __user *)userbuf, nbytes, | ||
| 159 | pos, buf, s - buf); | ||
| 160 | } | ||
| 161 | |||
| 162 | #if BITS_PER_LONG == 32 | ||
| 163 | u64 res_counter_read_u64(struct res_counter *counter, int member) | ||
| 164 | { | ||
| 165 | unsigned long flags; | ||
| 166 | u64 ret; | ||
| 167 | |||
| 168 | spin_lock_irqsave(&counter->lock, flags); | ||
| 169 | ret = *res_counter_member(counter, member); | ||
| 170 | spin_unlock_irqrestore(&counter->lock, flags); | ||
| 171 | |||
| 172 | return ret; | ||
| 173 | } | ||
| 174 | #else | ||
| 175 | u64 res_counter_read_u64(struct res_counter *counter, int member) | ||
| 176 | { | ||
| 177 | return *res_counter_member(counter, member); | ||
| 178 | } | ||
| 179 | #endif | ||
| 180 | |||
| 181 | int res_counter_memparse_write_strategy(const char *buf, | ||
| 182 | unsigned long long *resp) | ||
| 183 | { | ||
| 184 | char *end; | ||
| 185 | unsigned long long res; | ||
| 186 | |||
| 187 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ | ||
| 188 | if (*buf == '-') { | ||
| 189 | int rc = kstrtoull(buf + 1, 10, &res); | ||
| 190 | |||
| 191 | if (rc) | ||
| 192 | return rc; | ||
| 193 | if (res != 1) | ||
| 194 | return -EINVAL; | ||
| 195 | *resp = RES_COUNTER_MAX; | ||
| 196 | return 0; | ||
| 197 | } | ||
| 198 | |||
| 199 | res = memparse(buf, &end); | ||
| 200 | if (*end != '\0') | ||
| 201 | return -EINVAL; | ||
| 202 | |||
| 203 | if (PAGE_ALIGN(res) >= res) | ||
| 204 | res = PAGE_ALIGN(res); | ||
| 205 | else | ||
| 206 | res = RES_COUNTER_MAX; | ||
| 207 | |||
| 208 | *resp = res; | ||
| 209 | |||
| 210 | return 0; | ||
| 211 | } | ||
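For context on the deletion above: res_counter implemented a hierarchical usage/limit counter in which a charge propagates to every ancestor and is rolled back if any level rejects it, and the file can go away presumably because its last users have been converted elsewhere. A minimal usage sketch of the removed API, with an illustrative parent/child pair:

    #include <linux/mm.h>
    #include <linux/printk.h>
    #include <linux/res_counter.h>

    static void demo_res_counter_usage(void)
    {
        struct res_counter parent, child;
        struct res_counter *fail_at;

        res_counter_init(&parent, NULL);        /* root level, default limit RES_COUNTER_MAX */
        res_counter_init(&child, &parent);      /* charges propagate up to 'parent' */

        if (res_counter_charge(&child, PAGE_SIZE, &fail_at))
            pr_info("charge rejected at level %p\n", fail_at);
        else
            res_counter_uncharge(&child, PAGE_SIZE);
    }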
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a63f4dc27909..607f852b4d04 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c | |||
| @@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout); | |||
| 148 | * | 148 | * |
| 149 | * This waits to be signaled for completion of a specific task. It is NOT | 149 | * This waits to be signaled for completion of a specific task. It is NOT |
| 150 | * interruptible and there is no timeout. The caller is accounted as waiting | 150 | * interruptible and there is no timeout. The caller is accounted as waiting |
| 151 | * for IO. | 151 | * for IO (which traditionally means blkio only). |
| 152 | */ | 152 | */ |
| 153 | void __sched wait_for_completion_io(struct completion *x) | 153 | void __sched wait_for_completion_io(struct completion *x) |
| 154 | { | 154 | { |
| @@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io); | |||
| 163 | * | 163 | * |
| 164 | * This waits for either a completion of a specific task to be signaled or for a | 164 | * This waits for either a completion of a specific task to be signaled or for a |
| 165 | * specified timeout to expire. The timeout is in jiffies. It is not | 165 | * specified timeout to expire. The timeout is in jiffies. It is not |
| 166 | * interruptible. The caller is accounted as waiting for IO. | 166 | * interruptible. The caller is accounted as waiting for IO (which traditionally |
| 167 | * means blkio only). | ||
| 167 | * | 168 | * |
| 168 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | 169 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left |
| 169 | * till timeout) if completed. | 170 | * till timeout) if completed. |
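The kerneldoc tweak above clarifies that wait_for_completion_io() and wait_for_completion_io_timeout() account the caller as waiting for IO, which traditionally meant block IO only. A hedged, driver-style sketch of the usual pattern; struct my_request and my_submit_and_wait() are made-up names, not part of the patch:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

struct my_request {
	struct completion done;	/* signalled by the (hypothetical) IRQ handler */
};

static int my_submit_and_wait(struct my_request *req)
{
	unsigned long left;

	init_completion(&req->done);
	/* ... queue req to the hardware; the completion path calls complete(&req->done) ... */

	/* Sleep uninterruptibly; the caller shows up as iowait while blocked. */
	left = wait_for_completion_io_timeout(&req->done, msecs_to_jiffies(5000));
	return left ? 0 : -ETIMEDOUT;
}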
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89e7283015a6..d22fb16a7153 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p) | |||
| 1008 | return cpu_curr(task_cpu(p)) == p; | 1008 | return cpu_curr(task_cpu(p)) == p; |
| 1009 | } | 1009 | } |
| 1010 | 1010 | ||
| 1011 | /* | ||
| 1012 | * Can drop rq->lock because sched_class::switched_from() methods may drop it. | ||
| 1013 | */ | ||
| 1011 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 1014 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
| 1012 | const struct sched_class *prev_class, | 1015 | const struct sched_class *prev_class, |
| 1013 | int oldprio) | 1016 | int oldprio) |
| @@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
| 1015 | if (prev_class != p->sched_class) { | 1018 | if (prev_class != p->sched_class) { |
| 1016 | if (prev_class->switched_from) | 1019 | if (prev_class->switched_from) |
| 1017 | prev_class->switched_from(rq, p); | 1020 | prev_class->switched_from(rq, p); |
| 1021 | /* Possible rq->lock 'hole'. */ | ||
| 1018 | p->sched_class->switched_to(rq, p); | 1022 | p->sched_class->switched_to(rq, p); |
| 1019 | } else if (oldprio != p->prio || dl_task(p)) | 1023 | } else if (oldprio != p->prio || dl_task(p)) |
| 1020 | p->sched_class->prio_changed(rq, p, oldprio); | 1024 | p->sched_class->prio_changed(rq, p, oldprio); |
| @@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1054 | * ttwu() will sort out the placement. | 1058 | * ttwu() will sort out the placement. |
| 1055 | */ | 1059 | */ |
| 1056 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 1060 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
| 1057 | !(task_preempt_count(p) & PREEMPT_ACTIVE)); | 1061 | !p->on_rq); |
| 1058 | 1062 | ||
| 1059 | #ifdef CONFIG_LOCKDEP | 1063 | #ifdef CONFIG_LOCKDEP |
| 1060 | /* | 1064 | /* |
| @@ -1078,7 +1082,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1078 | if (p->sched_class->migrate_task_rq) | 1082 | if (p->sched_class->migrate_task_rq) |
| 1079 | p->sched_class->migrate_task_rq(p, new_cpu); | 1083 | p->sched_class->migrate_task_rq(p, new_cpu); |
| 1080 | p->se.nr_migrations++; | 1084 | p->se.nr_migrations++; |
| 1081 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); | 1085 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); |
| 1082 | } | 1086 | } |
| 1083 | 1087 | ||
| 1084 | __set_task_cpu(p, new_cpu); | 1088 | __set_task_cpu(p, new_cpu); |
| @@ -1407,7 +1411,8 @@ out: | |||
| 1407 | static inline | 1411 | static inline |
| 1408 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) | 1412 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) |
| 1409 | { | 1413 | { |
| 1410 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); | 1414 | if (p->nr_cpus_allowed > 1) |
| 1415 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); | ||
| 1411 | 1416 | ||
| 1412 | /* | 1417 | /* |
| 1413 | * In order not to call set_task_cpu() on a blocking task we need | 1418 | * In order not to call set_task_cpu() on a blocking task we need |
| @@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu) | |||
| 1623 | struct rq *rq = cpu_rq(cpu); | 1628 | struct rq *rq = cpu_rq(cpu); |
| 1624 | unsigned long flags; | 1629 | unsigned long flags; |
| 1625 | 1630 | ||
| 1626 | if (!is_idle_task(rq->curr)) | 1631 | rcu_read_lock(); |
| 1627 | return; | 1632 | |
| 1633 | if (!is_idle_task(rcu_dereference(rq->curr))) | ||
| 1634 | goto out; | ||
| 1628 | 1635 | ||
| 1629 | if (set_nr_if_polling(rq->idle)) { | 1636 | if (set_nr_if_polling(rq->idle)) { |
| 1630 | trace_sched_wake_idle_without_ipi(cpu); | 1637 | trace_sched_wake_idle_without_ipi(cpu); |
| @@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu) | |||
| 1635 | /* Else cpu is not in idle, do nothing here */ | 1642 | /* Else cpu is not in idle, do nothing here */ |
| 1636 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 1643 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 1637 | } | 1644 | } |
| 1645 | |||
| 1646 | out: | ||
| 1647 | rcu_read_unlock(); | ||
| 1638 | } | 1648 | } |
| 1639 | 1649 | ||
| 1640 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1650 | bool cpus_share_cache(int this_cpu, int that_cpu) |
| @@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1853 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | 1863 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; |
| 1854 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | 1864 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; |
| 1855 | p->numa_work.next = &p->numa_work; | 1865 | p->numa_work.next = &p->numa_work; |
| 1856 | p->numa_faults_memory = NULL; | 1866 | p->numa_faults = NULL; |
| 1857 | p->numa_faults_buffer_memory = NULL; | ||
| 1858 | p->last_task_numa_placement = 0; | 1867 | p->last_task_numa_placement = 0; |
| 1859 | p->last_sum_exec_runtime = 0; | 1868 | p->last_sum_exec_runtime = 0; |
| 1860 | 1869 | ||
| 1861 | INIT_LIST_HEAD(&p->numa_entry); | ||
| 1862 | p->numa_group = NULL; | 1870 | p->numa_group = NULL; |
| 1863 | #endif /* CONFIG_NUMA_BALANCING */ | 1871 | #endif /* CONFIG_NUMA_BALANCING */ |
| 1864 | } | 1872 | } |
| @@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i) | |||
| 2034 | } | 2042 | } |
| 2035 | #endif | 2043 | #endif |
| 2036 | 2044 | ||
| 2037 | static inline | ||
| 2038 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
| 2039 | { | ||
| 2040 | dl_b->total_bw -= tsk_bw; | ||
| 2041 | } | ||
| 2042 | |||
| 2043 | static inline | ||
| 2044 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
| 2045 | { | ||
| 2046 | dl_b->total_bw += tsk_bw; | ||
| 2047 | } | ||
| 2048 | |||
| 2049 | static inline | ||
| 2050 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
| 2051 | { | ||
| 2052 | return dl_b->bw != -1 && | ||
| 2053 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
| 2054 | } | ||
| 2055 | |||
| 2056 | /* | 2045 | /* |
| 2057 | * We must be sure that accepting a new task (or allowing changing the | 2046 | * We must be sure that accepting a new task (or allowing changing the |
| 2058 | * parameters of an existing one) is consistent with the bandwidth | 2047 | * parameters of an existing one) is consistent with the bandwidth |
| @@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
| 2220 | 2209 | ||
| 2221 | /** | 2210 | /** |
| 2222 | * finish_task_switch - clean up after a task-switch | 2211 | * finish_task_switch - clean up after a task-switch |
| 2223 | * @rq: runqueue associated with task-switch | ||
| 2224 | * @prev: the thread we just switched away from. | 2212 | * @prev: the thread we just switched away from. |
| 2225 | * | 2213 | * |
| 2226 | * finish_task_switch must be called after the context switch, paired | 2214 | * finish_task_switch must be called after the context switch, paired |
| @@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
| 2232 | * so, we finish that here outside of the runqueue lock. (Doing it | 2220 | * so, we finish that here outside of the runqueue lock. (Doing it |
| 2233 | * with the lock held can cause deadlocks; see schedule() for | 2221 | * with the lock held can cause deadlocks; see schedule() for |
| 2234 | * details.) | 2222 | * details.) |
| 2223 | * | ||
| 2224 | * The context switch has flipped the stack from under us and restored the | ||
| 2225 | * local variables which were saved when this task called schedule() in the | ||
| 2226 | * past. prev == current is still correct but we need to recalculate this_rq | ||
| 2227 | * because prev may have moved to another CPU. | ||
| 2235 | */ | 2228 | */ |
| 2236 | static void finish_task_switch(struct rq *rq, struct task_struct *prev) | 2229 | static struct rq *finish_task_switch(struct task_struct *prev) |
| 2237 | __releases(rq->lock) | 2230 | __releases(rq->lock) |
| 2238 | { | 2231 | { |
| 2232 | struct rq *rq = this_rq(); | ||
| 2239 | struct mm_struct *mm = rq->prev_mm; | 2233 | struct mm_struct *mm = rq->prev_mm; |
| 2240 | long prev_state; | 2234 | long prev_state; |
| 2241 | 2235 | ||
| @@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2275 | } | 2269 | } |
| 2276 | 2270 | ||
| 2277 | tick_nohz_task_switch(current); | 2271 | tick_nohz_task_switch(current); |
| 2272 | return rq; | ||
| 2278 | } | 2273 | } |
| 2279 | 2274 | ||
| 2280 | #ifdef CONFIG_SMP | 2275 | #ifdef CONFIG_SMP |
| @@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq) | |||
| 2309 | asmlinkage __visible void schedule_tail(struct task_struct *prev) | 2304 | asmlinkage __visible void schedule_tail(struct task_struct *prev) |
| 2310 | __releases(rq->lock) | 2305 | __releases(rq->lock) |
| 2311 | { | 2306 | { |
| 2312 | struct rq *rq = this_rq(); | 2307 | struct rq *rq; |
| 2313 | |||
| 2314 | finish_task_switch(rq, prev); | ||
| 2315 | 2308 | ||
| 2316 | /* | 2309 | /* finish_task_switch() drops rq->lock and enables preemption */ |
| 2317 | * FIXME: do we need to worry about rq being invalidated by the | 2310 | preempt_disable(); |
| 2318 | * task_switch? | 2311 | rq = finish_task_switch(prev); |
| 2319 | */ | ||
| 2320 | post_schedule(rq); | 2312 | post_schedule(rq); |
| 2313 | preempt_enable(); | ||
| 2321 | 2314 | ||
| 2322 | if (current->set_child_tid) | 2315 | if (current->set_child_tid) |
| 2323 | put_user(task_pid_vnr(current), current->set_child_tid); | 2316 | put_user(task_pid_vnr(current), current->set_child_tid); |
| 2324 | } | 2317 | } |
| 2325 | 2318 | ||
| 2326 | /* | 2319 | /* |
| 2327 | * context_switch - switch to the new MM and the new | 2320 | * context_switch - switch to the new MM and the new thread's register state. |
| 2328 | * thread's register state. | ||
| 2329 | */ | 2321 | */ |
| 2330 | static inline void | 2322 | static inline struct rq * |
| 2331 | context_switch(struct rq *rq, struct task_struct *prev, | 2323 | context_switch(struct rq *rq, struct task_struct *prev, |
| 2332 | struct task_struct *next) | 2324 | struct task_struct *next) |
| 2333 | { | 2325 | { |
| @@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2366 | context_tracking_task_switch(prev, next); | 2358 | context_tracking_task_switch(prev, next); |
| 2367 | /* Here we just switch the register state and the stack. */ | 2359 | /* Here we just switch the register state and the stack. */ |
| 2368 | switch_to(prev, next, prev); | 2360 | switch_to(prev, next, prev); |
| 2369 | |||
| 2370 | barrier(); | 2361 | barrier(); |
| 2371 | /* | 2362 | |
| 2372 | * this_rq must be evaluated again because prev may have moved | 2363 | return finish_task_switch(prev); |
| 2373 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
| 2374 | * frame will be invalid. | ||
| 2375 | */ | ||
| 2376 | finish_task_switch(this_rq(), prev); | ||
| 2377 | } | 2364 | } |
| 2378 | 2365 | ||
| 2379 | /* | 2366 | /* |
| @@ -2773,7 +2760,7 @@ need_resched: | |||
| 2773 | preempt_disable(); | 2760 | preempt_disable(); |
| 2774 | cpu = smp_processor_id(); | 2761 | cpu = smp_processor_id(); |
| 2775 | rq = cpu_rq(cpu); | 2762 | rq = cpu_rq(cpu); |
| 2776 | rcu_note_context_switch(cpu); | 2763 | rcu_note_context_switch(); |
| 2777 | prev = rq->curr; | 2764 | prev = rq->curr; |
| 2778 | 2765 | ||
| 2779 | schedule_debug(prev); | 2766 | schedule_debug(prev); |
| @@ -2826,15 +2813,8 @@ need_resched: | |||
| 2826 | rq->curr = next; | 2813 | rq->curr = next; |
| 2827 | ++*switch_count; | 2814 | ++*switch_count; |
| 2828 | 2815 | ||
| 2829 | context_switch(rq, prev, next); /* unlocks the rq */ | 2816 | rq = context_switch(rq, prev, next); /* unlocks the rq */ |
| 2830 | /* | 2817 | cpu = cpu_of(rq); |
| 2831 | * The context switch have flipped the stack from under us | ||
| 2832 | * and restored the local variables which were saved when | ||
| 2833 | * this task called schedule() in the past. prev == current | ||
| 2834 | * is still correct, but it can be moved to another cpu/rq. | ||
| 2835 | */ | ||
| 2836 | cpu = smp_processor_id(); | ||
| 2837 | rq = cpu_rq(cpu); | ||
| 2838 | } else | 2818 | } else |
| 2839 | raw_spin_unlock_irq(&rq->lock); | 2819 | raw_spin_unlock_irq(&rq->lock); |
| 2840 | 2820 | ||
| @@ -4547,8 +4527,10 @@ void sched_show_task(struct task_struct *p) | |||
| 4547 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4527 | #ifdef CONFIG_DEBUG_STACK_USAGE |
| 4548 | free = stack_not_used(p); | 4528 | free = stack_not_used(p); |
| 4549 | #endif | 4529 | #endif |
| 4530 | ppid = 0; | ||
| 4550 | rcu_read_lock(); | 4531 | rcu_read_lock(); |
| 4551 | ppid = task_pid_nr(rcu_dereference(p->real_parent)); | 4532 | if (pid_alive(p)) |
| 4533 | ppid = task_pid_nr(rcu_dereference(p->real_parent)); | ||
| 4552 | rcu_read_unlock(); | 4534 | rcu_read_unlock(); |
| 4553 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, | 4535 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
| 4554 | task_pid_nr(p), ppid, | 4536 | task_pid_nr(p), ppid, |
| @@ -4653,6 +4635,81 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 4653 | #endif | 4635 | #endif |
| 4654 | } | 4636 | } |
| 4655 | 4637 | ||
| 4638 | int cpuset_cpumask_can_shrink(const struct cpumask *cur, | ||
| 4639 | const struct cpumask *trial) | ||
| 4640 | { | ||
| 4641 | int ret = 1, trial_cpus; | ||
| 4642 | struct dl_bw *cur_dl_b; | ||
| 4643 | unsigned long flags; | ||
| 4644 | |||
| 4645 | rcu_read_lock_sched(); | ||
| 4646 | cur_dl_b = dl_bw_of(cpumask_any(cur)); | ||
| 4647 | trial_cpus = cpumask_weight(trial); | ||
| 4648 | |||
| 4649 | raw_spin_lock_irqsave(&cur_dl_b->lock, flags); | ||
| 4650 | if (cur_dl_b->bw != -1 && | ||
| 4651 | cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) | ||
| 4652 | ret = 0; | ||
| 4653 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); | ||
| 4654 | rcu_read_unlock_sched(); | ||
| 4655 | |||
| 4656 | return ret; | ||
| 4657 | } | ||
| 4658 | |||
| 4659 | int task_can_attach(struct task_struct *p, | ||
| 4660 | const struct cpumask *cs_cpus_allowed) | ||
| 4661 | { | ||
| 4662 | int ret = 0; | ||
| 4663 | |||
| 4664 | /* | ||
| 4665 | * Kthreads which disallow setaffinity shouldn't be moved | ||
| 4666 | * to a new cpuset; we don't want to change their cpu | ||
| 4667 | * affinity and isolating such threads by their set of | ||
| 4668 | * allowed nodes is unnecessary. Thus, cpusets are not | ||
| 4669 | * applicable for such threads. This prevents checking for | ||
| 4670 | * success of set_cpus_allowed_ptr() on all attached tasks | ||
| 4671 | * before cpus_allowed may be changed. | ||
| 4672 | */ | ||
| 4673 | if (p->flags & PF_NO_SETAFFINITY) { | ||
| 4674 | ret = -EINVAL; | ||
| 4675 | goto out; | ||
| 4676 | } | ||
| 4677 | |||
| 4678 | #ifdef CONFIG_SMP | ||
| 4679 | if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, | ||
| 4680 | cs_cpus_allowed)) { | ||
| 4681 | unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, | ||
| 4682 | cs_cpus_allowed); | ||
| 4683 | struct dl_bw *dl_b; | ||
| 4684 | bool overflow; | ||
| 4685 | int cpus; | ||
| 4686 | unsigned long flags; | ||
| 4687 | |||
| 4688 | rcu_read_lock_sched(); | ||
| 4689 | dl_b = dl_bw_of(dest_cpu); | ||
| 4690 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 4691 | cpus = dl_bw_cpus(dest_cpu); | ||
| 4692 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); | ||
| 4693 | if (overflow) | ||
| 4694 | ret = -EBUSY; | ||
| 4695 | else { | ||
| 4696 | /* | ||
| 4697 | * We reserve space for this task in the destination | ||
| 4698 | * root_domain, as we can't fail after this point. | ||
| 4699 | * We will free resources in the source root_domain | ||
| 4700 | * later on (see set_cpus_allowed_dl()). | ||
| 4701 | */ | ||
| 4702 | __dl_add(dl_b, p->dl.dl_bw); | ||
| 4703 | } | ||
| 4704 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 4705 | rcu_read_unlock_sched(); | ||
| 4706 | |||
| 4707 | } | ||
| 4708 | #endif | ||
| 4709 | out: | ||
| 4710 | return ret; | ||
| 4711 | } | ||
| 4712 | |||
| 4656 | #ifdef CONFIG_SMP | 4713 | #ifdef CONFIG_SMP |
| 4657 | /* | 4714 | /* |
| 4658 | * move_queued_task - move a queued task to new rq. | 4715 | * move_queued_task - move a queued task to new rq. |
| @@ -6103,7 +6160,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
| 6103 | 6160 | ||
| 6104 | #ifdef CONFIG_NUMA | 6161 | #ifdef CONFIG_NUMA |
| 6105 | static int sched_domains_numa_levels; | 6162 | static int sched_domains_numa_levels; |
| 6163 | enum numa_topology_type sched_numa_topology_type; | ||
| 6106 | static int *sched_domains_numa_distance; | 6164 | static int *sched_domains_numa_distance; |
| 6165 | int sched_max_numa_distance; | ||
| 6107 | static struct cpumask ***sched_domains_numa_masks; | 6166 | static struct cpumask ***sched_domains_numa_masks; |
| 6108 | static int sched_domains_curr_level; | 6167 | static int sched_domains_curr_level; |
| 6109 | #endif | 6168 | #endif |
| @@ -6275,7 +6334,7 @@ static void sched_numa_warn(const char *str) | |||
| 6275 | printk(KERN_WARNING "\n"); | 6334 | printk(KERN_WARNING "\n"); |
| 6276 | } | 6335 | } |
| 6277 | 6336 | ||
| 6278 | static bool find_numa_distance(int distance) | 6337 | bool find_numa_distance(int distance) |
| 6279 | { | 6338 | { |
| 6280 | int i; | 6339 | int i; |
| 6281 | 6340 | ||
| @@ -6290,6 +6349,56 @@ static bool find_numa_distance(int distance) | |||
| 6290 | return false; | 6349 | return false; |
| 6291 | } | 6350 | } |
| 6292 | 6351 | ||
| 6352 | /* | ||
| 6353 | * A system can have three types of NUMA topology: | ||
| 6354 | * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system | ||
| 6355 | * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes | ||
| 6356 | * NUMA_BACKPLANE: nodes can reach other nodes through a backplane | ||
| 6357 | * | ||
| 6358 | * The difference between a glueless mesh topology and a backplane | ||
| 6359 | * topology lies in whether communication between nodes that are not | ||
| 6360 | * directly connected goes through intermediary nodes (where programs | ||
| 6361 | * could run), or through backplane controllers. This affects | ||
| 6362 | * placement of programs. | ||
| 6363 | * | ||
| 6364 | * The type of topology can be discerned with the following tests: | ||
| 6365 | * - If the maximum distance between any nodes is 1 hop, the system | ||
| 6366 | * is directly connected. | ||
| 6367 | * - If for two nodes A and B, located N > 1 hops away from each other, | ||
| 6368 | * there is an intermediary node C, which is < N hops away from both | ||
| 6369 | * nodes A and B, the system is a glueless mesh. | ||
| 6370 | */ | ||
| 6371 | static void init_numa_topology_type(void) | ||
| 6372 | { | ||
| 6373 | int a, b, c, n; | ||
| 6374 | |||
| 6375 | n = sched_max_numa_distance; | ||
| 6376 | |||
| 6377 | if (n <= 1) | ||
| 6378 | sched_numa_topology_type = NUMA_DIRECT; | ||
| 6379 | |||
| 6380 | for_each_online_node(a) { | ||
| 6381 | for_each_online_node(b) { | ||
| 6382 | /* Find two nodes furthest removed from each other. */ | ||
| 6383 | if (node_distance(a, b) < n) | ||
| 6384 | continue; | ||
| 6385 | |||
| 6386 | /* Is there an intermediary node between a and b? */ | ||
| 6387 | for_each_online_node(c) { | ||
| 6388 | if (node_distance(a, c) < n && | ||
| 6389 | node_distance(b, c) < n) { | ||
| 6390 | sched_numa_topology_type = | ||
| 6391 | NUMA_GLUELESS_MESH; | ||
| 6392 | return; | ||
| 6393 | } | ||
| 6394 | } | ||
| 6395 | |||
| 6396 | sched_numa_topology_type = NUMA_BACKPLANE; | ||
| 6397 | return; | ||
| 6398 | } | ||
| 6399 | } | ||
| 6400 | } | ||
| 6401 | |||
| 6293 | static void sched_init_numa(void) | 6402 | static void sched_init_numa(void) |
| 6294 | { | 6403 | { |
| 6295 | int next_distance, curr_distance = node_distance(0, 0); | 6404 | int next_distance, curr_distance = node_distance(0, 0); |
| @@ -6426,6 +6535,9 @@ static void sched_init_numa(void) | |||
| 6426 | sched_domain_topology = tl; | 6535 | sched_domain_topology = tl; |
| 6427 | 6536 | ||
| 6428 | sched_domains_numa_levels = level; | 6537 | sched_domains_numa_levels = level; |
| 6538 | sched_max_numa_distance = sched_domains_numa_distance[level - 1]; | ||
| 6539 | |||
| 6540 | init_numa_topology_type(); | ||
| 6429 | } | 6541 | } |
| 6430 | 6542 | ||
| 6431 | static void sched_domains_numa_masks_set(int cpu) | 6543 | static void sched_domains_numa_masks_set(int cpu) |
| @@ -7001,9 +7113,6 @@ void __init sched_init(void) | |||
| 7001 | #ifdef CONFIG_RT_GROUP_SCHED | 7113 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7002 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 7114 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
| 7003 | #endif | 7115 | #endif |
| 7004 | #ifdef CONFIG_CPUMASK_OFFSTACK | ||
| 7005 | alloc_size += num_possible_cpus() * cpumask_size(); | ||
| 7006 | #endif | ||
| 7007 | if (alloc_size) { | 7116 | if (alloc_size) { |
| 7008 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 7117 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
| 7009 | 7118 | ||
| @@ -7023,13 +7132,13 @@ void __init sched_init(void) | |||
| 7023 | ptr += nr_cpu_ids * sizeof(void **); | 7132 | ptr += nr_cpu_ids * sizeof(void **); |
| 7024 | 7133 | ||
| 7025 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7134 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 7135 | } | ||
| 7026 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7136 | #ifdef CONFIG_CPUMASK_OFFSTACK |
| 7027 | for_each_possible_cpu(i) { | 7137 | for_each_possible_cpu(i) { |
| 7028 | per_cpu(load_balance_mask, i) = (void *)ptr; | 7138 | per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( |
| 7029 | ptr += cpumask_size(); | 7139 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); |
| 7030 | } | ||
| 7031 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | ||
| 7032 | } | 7140 | } |
| 7141 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | ||
| 7033 | 7142 | ||
| 7034 | init_rt_bandwidth(&def_rt_bandwidth, | 7143 | init_rt_bandwidth(&def_rt_bandwidth, |
| 7035 | global_rt_period(), global_rt_runtime()); | 7144 | global_rt_period(), global_rt_runtime()); |
| @@ -7178,6 +7287,25 @@ static inline int preempt_count_equals(int preempt_offset) | |||
| 7178 | 7287 | ||
| 7179 | void __might_sleep(const char *file, int line, int preempt_offset) | 7288 | void __might_sleep(const char *file, int line, int preempt_offset) |
| 7180 | { | 7289 | { |
| 7290 | /* | ||
| 7291 | * Blocking primitives will set (and therefore destroy) current->state, | ||
| 7292 | * since we will exit with TASK_RUNNING, make sure we enter with it, | ||
| 7293 | * otherwise we will destroy state. | ||
| 7294 | */ | ||
| 7295 | if (WARN_ONCE(current->state != TASK_RUNNING, | ||
| 7296 | "do not call blocking ops when !TASK_RUNNING; " | ||
| 7297 | "state=%lx set at [<%p>] %pS\n", | ||
| 7298 | current->state, | ||
| 7299 | (void *)current->task_state_change, | ||
| 7300 | (void *)current->task_state_change)) | ||
| 7301 | __set_current_state(TASK_RUNNING); | ||
| 7302 | |||
| 7303 | ___might_sleep(file, line, preempt_offset); | ||
| 7304 | } | ||
| 7305 | EXPORT_SYMBOL(__might_sleep); | ||
| 7306 | |||
| 7307 | void ___might_sleep(const char *file, int line, int preempt_offset) | ||
| 7308 | { | ||
| 7181 | static unsigned long prev_jiffy; /* ratelimiting */ | 7309 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 7182 | 7310 | ||
| 7183 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | 7311 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ |
| @@ -7209,7 +7337,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) | |||
| 7209 | #endif | 7337 | #endif |
| 7210 | dump_stack(); | 7338 | dump_stack(); |
| 7211 | } | 7339 | } |
| 7212 | EXPORT_SYMBOL(__might_sleep); | 7340 | EXPORT_SYMBOL(___might_sleep); |
| 7213 | #endif | 7341 | #endif |
| 7214 | 7342 | ||
| 7215 | #ifdef CONFIG_MAGIC_SYSRQ | 7343 | #ifdef CONFIG_MAGIC_SYSRQ |
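init_numa_topology_type() above classifies the machine by taking the most distant pair of nodes and asking whether some third node sits strictly closer to both of them. A simplified user-space model of that test, using a made-up 4-node distance table (real kernels read these distances from the firmware SLIT; 10/20 as local/one-hop values are conventional, not mandated):

#include <stdio.h>

#define NR_NODES 4

/* Example distances: four nodes in a line, maximum distance 40. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 40 },
	{ 20, 10, 20, 30 },
	{ 30, 20, 10, 20 },
	{ 40, 30, 20, 10 },
};

enum numa_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, NUMA_BACKPLANE };

static enum numa_type classify(void)
{
	int a, b, c, max = 0;

	for (a = 0; a < NR_NODES; a++)
		for (b = 0; b < NR_NODES; b++)
			if (dist[a][b] > max)
				max = dist[a][b];

	if (max <= 20)	/* every pair is local or one hop away */
		return NUMA_DIRECT;

	for (a = 0; a < NR_NODES; a++)
		for (b = 0; b < NR_NODES; b++) {
			if (dist[a][b] < max)
				continue;
			/* a and b are maximally distant; is there a closer middleman? */
			for (c = 0; c < NR_NODES; c++)
				if (dist[a][c] < max && dist[b][c] < max)
					return NUMA_GLUELESS_MESH;
			return NUMA_BACKPLANE;
		}

	return NUMA_DIRECT;	/* not reached with a valid table */
}

int main(void)
{
	static const char * const name[] = {
		"direct", "glueless mesh", "backplane"
	};

	printf("topology: %s\n", name[classify()]);	/* "glueless mesh" here */
	return 0;
}

For a backplane interconnect the inner search finds no such middleman, so the first maximally distant pair already decides the answer.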
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 538c9796ad4a..020039bd1326 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
| @@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | 25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); |
| 26 | int cpudl_init(struct cpudl *cp); | 26 | int cpudl_init(struct cpudl *cp); |
| 27 | void cpudl_cleanup(struct cpudl *cp); | 27 | void cpudl_cleanup(struct cpudl *cp); |
| 28 | #else | ||
| 29 | #define cpudl_set(cp, cpu, dl) do { } while (0) | ||
| 30 | #define cpudl_init() do { } while (0) | ||
| 31 | #endif /* CONFIG_SMP */ | 28 | #endif /* CONFIG_SMP */ |
| 32 | 29 | ||
| 33 | #endif /* _LINUX_CPUDL_H */ | 30 | #endif /* _LINUX_CPUDL_H */ |
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 6b033347fdfd..63cbb9ca0496 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h | |||
| @@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp, | |||
| 26 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | 26 | void cpupri_set(struct cpupri *cp, int cpu, int pri); |
| 27 | int cpupri_init(struct cpupri *cp); | 27 | int cpupri_init(struct cpupri *cp); |
| 28 | void cpupri_cleanup(struct cpupri *cp); | 28 | void cpupri_cleanup(struct cpupri *cp); |
| 29 | #else | ||
| 30 | #define cpupri_set(cp, cpu, pri) do { } while (0) | ||
| 31 | #define cpupri_init() do { } while (0) | ||
| 32 | #endif | 29 | #endif |
| 33 | 30 | ||
| 34 | #endif /* _LINUX_CPUPRI_H */ | 31 | #endif /* _LINUX_CPUPRI_H */ |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 28fa9d9e9201..b52092f2636d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -563,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) | |||
| 563 | { | 563 | { |
| 564 | struct hrtimer *timer = &dl_se->dl_timer; | 564 | struct hrtimer *timer = &dl_se->dl_timer; |
| 565 | 565 | ||
| 566 | if (hrtimer_active(timer)) { | ||
| 567 | hrtimer_try_to_cancel(timer); | ||
| 568 | return; | ||
| 569 | } | ||
| 570 | |||
| 571 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 566 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 572 | timer->function = dl_task_timer; | 567 | timer->function = dl_task_timer; |
| 573 | } | 568 | } |
| @@ -575,24 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) | |||
| 575 | static | 570 | static |
| 576 | int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) | 571 | int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) |
| 577 | { | 572 | { |
| 578 | int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); | 573 | return (dl_se->runtime <= 0); |
| 579 | int rorun = dl_se->runtime <= 0; | ||
| 580 | |||
| 581 | if (!rorun && !dmiss) | ||
| 582 | return 0; | ||
| 583 | |||
| 584 | /* | ||
| 585 | * If we are beyond our current deadline and we are still | ||
| 586 | * executing, then we have already used some of the runtime of | ||
| 587 | * the next instance. Thus, if we do not account that, we are | ||
| 588 | * stealing bandwidth from the system at each deadline miss! | ||
| 589 | */ | ||
| 590 | if (dmiss) { | ||
| 591 | dl_se->runtime = rorun ? dl_se->runtime : 0; | ||
| 592 | dl_se->runtime -= rq_clock(rq) - dl_se->deadline; | ||
| 593 | } | ||
| 594 | |||
| 595 | return 1; | ||
| 596 | } | 574 | } |
| 597 | 575 | ||
| 598 | extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); | 576 | extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); |
| @@ -633,7 +611,7 @@ static void update_curr_dl(struct rq *rq) | |||
| 633 | 611 | ||
| 634 | sched_rt_avg_update(rq, delta_exec); | 612 | sched_rt_avg_update(rq, delta_exec); |
| 635 | 613 | ||
| 636 | dl_se->runtime -= delta_exec; | 614 | dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; |
| 637 | if (dl_runtime_exceeded(rq, dl_se)) { | 615 | if (dl_runtime_exceeded(rq, dl_se)) { |
| 638 | __dequeue_task_dl(rq, curr, 0); | 616 | __dequeue_task_dl(rq, curr, 0); |
| 639 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) | 617 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) |
| @@ -831,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, | |||
| 831 | * parameters of the task might need updating. Otherwise, | 809 | * parameters of the task might need updating. Otherwise, |
| 832 | * we want a replenishment of its runtime. | 810 | * we want a replenishment of its runtime. |
| 833 | */ | 811 | */ |
| 834 | if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) | 812 | if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) |
| 835 | replenish_dl_entity(dl_se, pi_se); | ||
| 836 | else | ||
| 837 | update_dl_entity(dl_se, pi_se); | 813 | update_dl_entity(dl_se, pi_se); |
| 814 | else if (flags & ENQUEUE_REPLENISH) | ||
| 815 | replenish_dl_entity(dl_se, pi_se); | ||
| 838 | 816 | ||
| 839 | __enqueue_dl_entity(dl_se); | 817 | __enqueue_dl_entity(dl_se); |
| 840 | } | 818 | } |
| @@ -933,7 +911,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
| 933 | struct task_struct *curr; | 911 | struct task_struct *curr; |
| 934 | struct rq *rq; | 912 | struct rq *rq; |
| 935 | 913 | ||
| 936 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | 914 | if (sd_flag != SD_BALANCE_WAKE) |
| 937 | goto out; | 915 | goto out; |
| 938 | 916 | ||
| 939 | rq = cpu_rq(cpu); | 917 | rq = cpu_rq(cpu); |
| @@ -1018,6 +996,10 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | |||
| 1018 | { | 996 | { |
| 1019 | hrtick_start(rq, p->dl.runtime); | 997 | hrtick_start(rq, p->dl.runtime); |
| 1020 | } | 998 | } |
| 999 | #else /* !CONFIG_SCHED_HRTICK */ | ||
| 1000 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | ||
| 1001 | { | ||
| 1002 | } | ||
| 1021 | #endif | 1003 | #endif |
| 1022 | 1004 | ||
| 1023 | static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, | 1005 | static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, |
| @@ -1071,10 +1053,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) | |||
| 1071 | /* Running task will never be pushed. */ | 1053 | /* Running task will never be pushed. */ |
| 1072 | dequeue_pushable_dl_task(rq, p); | 1054 | dequeue_pushable_dl_task(rq, p); |
| 1073 | 1055 | ||
| 1074 | #ifdef CONFIG_SCHED_HRTICK | ||
| 1075 | if (hrtick_enabled(rq)) | 1056 | if (hrtick_enabled(rq)) |
| 1076 | start_hrtick_dl(rq, p); | 1057 | start_hrtick_dl(rq, p); |
| 1077 | #endif | ||
| 1078 | 1058 | ||
| 1079 | set_post_schedule(rq); | 1059 | set_post_schedule(rq); |
| 1080 | 1060 | ||
| @@ -1093,10 +1073,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | |||
| 1093 | { | 1073 | { |
| 1094 | update_curr_dl(rq); | 1074 | update_curr_dl(rq); |
| 1095 | 1075 | ||
| 1096 | #ifdef CONFIG_SCHED_HRTICK | ||
| 1097 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) | 1076 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) |
| 1098 | start_hrtick_dl(rq, p); | 1077 | start_hrtick_dl(rq, p); |
| 1099 | #endif | ||
| 1100 | } | 1078 | } |
| 1101 | 1079 | ||
| 1102 | static void task_fork_dl(struct task_struct *p) | 1080 | static void task_fork_dl(struct task_struct *p) |
| @@ -1333,6 +1311,7 @@ static int push_dl_task(struct rq *rq) | |||
| 1333 | { | 1311 | { |
| 1334 | struct task_struct *next_task; | 1312 | struct task_struct *next_task; |
| 1335 | struct rq *later_rq; | 1313 | struct rq *later_rq; |
| 1314 | int ret = 0; | ||
| 1336 | 1315 | ||
| 1337 | if (!rq->dl.overloaded) | 1316 | if (!rq->dl.overloaded) |
| 1338 | return 0; | 1317 | return 0; |
| @@ -1378,7 +1357,6 @@ retry: | |||
| 1378 | * The task is still there. We don't try | 1357 | * The task is still there. We don't try |
| 1379 | * again, some other cpu will pull it when ready. | 1358 | * again, some other cpu will pull it when ready. |
| 1380 | */ | 1359 | */ |
| 1381 | dequeue_pushable_dl_task(rq, next_task); | ||
| 1382 | goto out; | 1360 | goto out; |
| 1383 | } | 1361 | } |
| 1384 | 1362 | ||
| @@ -1394,6 +1372,7 @@ retry: | |||
| 1394 | deactivate_task(rq, next_task, 0); | 1372 | deactivate_task(rq, next_task, 0); |
| 1395 | set_task_cpu(next_task, later_rq->cpu); | 1373 | set_task_cpu(next_task, later_rq->cpu); |
| 1396 | activate_task(later_rq, next_task, 0); | 1374 | activate_task(later_rq, next_task, 0); |
| 1375 | ret = 1; | ||
| 1397 | 1376 | ||
| 1398 | resched_curr(later_rq); | 1377 | resched_curr(later_rq); |
| 1399 | 1378 | ||
| @@ -1402,7 +1381,7 @@ retry: | |||
| 1402 | out: | 1381 | out: |
| 1403 | put_task_struct(next_task); | 1382 | put_task_struct(next_task); |
| 1404 | 1383 | ||
| 1405 | return 1; | 1384 | return ret; |
| 1406 | } | 1385 | } |
| 1407 | 1386 | ||
| 1408 | static void push_dl_tasks(struct rq *rq) | 1387 | static void push_dl_tasks(struct rq *rq) |
| @@ -1508,7 +1487,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) | |||
| 1508 | p->nr_cpus_allowed > 1 && | 1487 | p->nr_cpus_allowed > 1 && |
| 1509 | dl_task(rq->curr) && | 1488 | dl_task(rq->curr) && |
| 1510 | (rq->curr->nr_cpus_allowed < 2 || | 1489 | (rq->curr->nr_cpus_allowed < 2 || |
| 1511 | dl_entity_preempt(&rq->curr->dl, &p->dl))) { | 1490 | !dl_entity_preempt(&p->dl, &rq->curr->dl))) { |
| 1512 | push_dl_tasks(rq); | 1491 | push_dl_tasks(rq); |
| 1513 | } | 1492 | } |
| 1514 | } | 1493 | } |
| @@ -1517,10 +1496,33 @@ static void set_cpus_allowed_dl(struct task_struct *p, | |||
| 1517 | const struct cpumask *new_mask) | 1496 | const struct cpumask *new_mask) |
| 1518 | { | 1497 | { |
| 1519 | struct rq *rq; | 1498 | struct rq *rq; |
| 1499 | struct root_domain *src_rd; | ||
| 1520 | int weight; | 1500 | int weight; |
| 1521 | 1501 | ||
| 1522 | BUG_ON(!dl_task(p)); | 1502 | BUG_ON(!dl_task(p)); |
| 1523 | 1503 | ||
| 1504 | rq = task_rq(p); | ||
| 1505 | src_rd = rq->rd; | ||
| 1506 | /* | ||
| 1507 | * Migrating a SCHED_DEADLINE task between exclusive | ||
| 1508 | * cpusets (different root_domains) entails a bandwidth | ||
| 1509 | * update. We already made space for us in the destination | ||
| 1510 | * domain (see cpuset_can_attach()). | ||
| 1511 | */ | ||
| 1512 | if (!cpumask_intersects(src_rd->span, new_mask)) { | ||
| 1513 | struct dl_bw *src_dl_b; | ||
| 1514 | |||
| 1515 | src_dl_b = dl_bw_of(cpu_of(rq)); | ||
| 1516 | /* | ||
| 1517 | * We now free resources of the root_domain we are migrating | ||
| 1518 | * off. In the worst case, sched_setattr() may temporarily fail | ||
| 1519 | * until we complete the update. | ||
| 1520 | */ | ||
| 1521 | raw_spin_lock(&src_dl_b->lock); | ||
| 1522 | __dl_clear(src_dl_b, p->dl.dl_bw); | ||
| 1523 | raw_spin_unlock(&src_dl_b->lock); | ||
| 1524 | } | ||
| 1525 | |||
| 1524 | /* | 1526 | /* |
| 1525 | * Update only if the task is actually running (i.e., | 1527 | * Update only if the task is actually running (i.e., |
| 1526 | * it is on the rq AND it is not throttled). | 1528 | * it is on the rq AND it is not throttled). |
| @@ -1537,8 +1539,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, | |||
| 1537 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) | 1539 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) |
| 1538 | return; | 1540 | return; |
| 1539 | 1541 | ||
| 1540 | rq = task_rq(p); | ||
| 1541 | |||
| 1542 | /* | 1542 | /* |
| 1543 | * The process used to be able to migrate OR it can now migrate | 1543 | * The process used to be able to migrate OR it can now migrate |
| 1544 | */ | 1544 | */ |
| @@ -1586,22 +1586,48 @@ void init_sched_dl_class(void) | |||
| 1586 | 1586 | ||
| 1587 | #endif /* CONFIG_SMP */ | 1587 | #endif /* CONFIG_SMP */ |
| 1588 | 1588 | ||
| 1589 | /* | ||
| 1590 | * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. | ||
| 1591 | */ | ||
| 1592 | static void cancel_dl_timer(struct rq *rq, struct task_struct *p) | ||
| 1593 | { | ||
| 1594 | struct hrtimer *dl_timer = &p->dl.dl_timer; | ||
| 1595 | |||
| 1596 | /* Nobody will change task's class if pi_lock is held */ | ||
| 1597 | lockdep_assert_held(&p->pi_lock); | ||
| 1598 | |||
| 1599 | if (hrtimer_active(dl_timer)) { | ||
| 1600 | int ret = hrtimer_try_to_cancel(dl_timer); | ||
| 1601 | |||
| 1602 | if (unlikely(ret == -1)) { | ||
| 1603 | /* | ||
| 1604 | * Note, p may migrate OR new deadline tasks | ||
| 1605 | * may appear in rq when we are unlocking it. | ||
| 1606 | * Our callers must be fine with that. | ||
| 1607 | */ | ||
| 1608 | raw_spin_unlock(&rq->lock); | ||
| 1609 | hrtimer_cancel(dl_timer); | ||
| 1610 | raw_spin_lock(&rq->lock); | ||
| 1611 | } | ||
| 1612 | } | ||
| 1613 | } | ||
| 1614 | |||
| 1589 | static void switched_from_dl(struct rq *rq, struct task_struct *p) | 1615 | static void switched_from_dl(struct rq *rq, struct task_struct *p) |
| 1590 | { | 1616 | { |
| 1591 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) | 1617 | cancel_dl_timer(rq, p); |
| 1592 | hrtimer_try_to_cancel(&p->dl.dl_timer); | ||
| 1593 | 1618 | ||
| 1594 | __dl_clear_params(p); | 1619 | __dl_clear_params(p); |
| 1595 | 1620 | ||
| 1596 | #ifdef CONFIG_SMP | ||
| 1597 | /* | 1621 | /* |
| 1598 | * Since this might be the only -deadline task on the rq, | 1622 | * Since this might be the only -deadline task on the rq, |
| 1599 | * this is the right place to try to pull some other one | 1623 | * this is the right place to try to pull some other one |
| 1600 | * from an overloaded cpu, if any. | 1624 | * from an overloaded cpu, if any. |
| 1601 | */ | 1625 | */ |
| 1602 | if (!rq->dl.dl_nr_running) | 1626 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) |
| 1603 | pull_dl_task(rq); | 1627 | return; |
| 1604 | #endif | 1628 | |
| 1629 | if (pull_dl_task(rq)) | ||
| 1630 | resched_curr(rq); | ||
| 1605 | } | 1631 | } |
| 1606 | 1632 | ||
| 1607 | /* | 1633 | /* |
| @@ -1622,7 +1648,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 1622 | 1648 | ||
| 1623 | if (task_on_rq_queued(p) && rq->curr != p) { | 1649 | if (task_on_rq_queued(p) && rq->curr != p) { |
| 1624 | #ifdef CONFIG_SMP | 1650 | #ifdef CONFIG_SMP |
| 1625 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) | 1651 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && |
| 1652 | push_dl_task(rq) && rq != task_rq(p)) | ||
| 1626 | /* Only reschedule if pushing failed */ | 1653 | /* Only reschedule if pushing failed */ |
| 1627 | check_resched = 0; | 1654 | check_resched = 0; |
| 1628 | #endif /* CONFIG_SMP */ | 1655 | #endif /* CONFIG_SMP */ |
| @@ -1704,3 +1731,12 @@ const struct sched_class dl_sched_class = { | |||
| 1704 | 1731 | ||
| 1705 | .update_curr = update_curr_dl, | 1732 | .update_curr = update_curr_dl, |
| 1706 | }; | 1733 | }; |
| 1734 | |||
| 1735 | #ifdef CONFIG_SCHED_DEBUG | ||
| 1736 | extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); | ||
| 1737 | |||
| 1738 | void print_dl_stats(struct seq_file *m, int cpu) | ||
| 1739 | { | ||
| 1740 | print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); | ||
| 1741 | } | ||
| 1742 | #endif /* CONFIG_SCHED_DEBUG */ | ||
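The root-domain bandwidth juggling above (reserve with __dl_add() in task_can_attach(), release with __dl_clear() in set_cpus_allowed_dl()) is guarded by the __dl_overflow() admission test. A hedged user-space model of that test; the 2^20 fixed-point scaling mirrors the scheduler's to_ratio() convention, but the concrete numbers are made up:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT 20
#define to_ratio(period, runtime) (((uint64_t)(runtime) << BW_SHIFT) / (period))

struct dl_bw {
	int64_t  bw;		/* per-CPU cap; -1 means no limit */
	uint64_t total_bw;	/* bandwidth already admitted in this root_domain */
};

/* Admitting new_bw (while giving back old_bw) overflows the domain if the
 * resulting total would exceed bw * cpus. */
static int dl_overflow(const struct dl_bw *b, int cpus,
		       uint64_t old_bw, uint64_t new_bw)
{
	return b->bw != -1 && (uint64_t)b->bw * cpus < b->total_bw - old_bw + new_bw;
}

int main(void)
{
	/* Destination domain: 95% cap per CPU, 2 CPUs, 150% already admitted. */
	struct dl_bw dst = { .bw = to_ratio(100, 95), .total_bw = to_ratio(100, 150) };
	uint64_t task_bw = to_ratio(100000, 30000);	/* 30ms every 100ms */

	if (dl_overflow(&dst, 2, 0, task_bw)) {
		printf("attach rejected: -EBUSY\n");
	} else {
		dst.total_bw += task_bw;	/* mirrors __dl_add() */
		printf("attach admitted, total_bw now %llu\n",
		       (unsigned long long)dst.total_bw);
	}
	return 0;
}

On failure task_can_attach() returns -EBUSY before any state is touched; on success the reservation is made up front, so set_cpus_allowed_dl() later only has to release the matching amount from the source domain.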
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ce33780d8f20..92cc52001e74 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -261,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | |||
| 261 | #undef P | 261 | #undef P |
| 262 | } | 262 | } |
| 263 | 263 | ||
| 264 | void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) | ||
| 265 | { | ||
| 266 | SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); | ||
| 267 | SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); | ||
| 268 | } | ||
| 269 | |||
| 264 | extern __read_mostly int sched_clock_running; | 270 | extern __read_mostly int sched_clock_running; |
| 265 | 271 | ||
| 266 | static void print_cpu(struct seq_file *m, int cpu) | 272 | static void print_cpu(struct seq_file *m, int cpu) |
| @@ -329,6 +335,7 @@ do { \ | |||
| 329 | spin_lock_irqsave(&sched_debug_lock, flags); | 335 | spin_lock_irqsave(&sched_debug_lock, flags); |
| 330 | print_cfs_stats(m, cpu); | 336 | print_cfs_stats(m, cpu); |
| 331 | print_rt_stats(m, cpu); | 337 | print_rt_stats(m, cpu); |
| 338 | print_dl_stats(m, cpu); | ||
| 332 | 339 | ||
| 333 | print_rq(m, rq, cpu); | 340 | print_rq(m, rq, cpu); |
| 334 | spin_unlock_irqrestore(&sched_debug_lock, flags); | 341 | spin_unlock_irqrestore(&sched_debug_lock, flags); |
| @@ -528,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) | |||
| 528 | unsigned long nr_faults = -1; | 535 | unsigned long nr_faults = -1; |
| 529 | int cpu_current, home_node; | 536 | int cpu_current, home_node; |
| 530 | 537 | ||
| 531 | if (p->numa_faults_memory) | 538 | if (p->numa_faults) |
| 532 | nr_faults = p->numa_faults_memory[2*node + i]; | 539 | nr_faults = p->numa_faults[2*node + i]; |
| 533 | 540 | ||
| 534 | cpu_current = !i ? (task_node(p) == node) : | 541 | cpu_current = !i ? (task_node(p) == node) : |
| 535 | (pol && node_isset(node, pol->v.nodes)); | 542 | (pol && node_isset(node, pol->v.nodes)); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef2b104b254c..40667cbf371b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -873,7 +873,6 @@ struct numa_group { | |||
| 873 | spinlock_t lock; /* nr_tasks, tasks */ | 873 | spinlock_t lock; /* nr_tasks, tasks */ |
| 874 | int nr_tasks; | 874 | int nr_tasks; |
| 875 | pid_t gid; | 875 | pid_t gid; |
| 876 | struct list_head task_list; | ||
| 877 | 876 | ||
| 878 | struct rcu_head rcu; | 877 | struct rcu_head rcu; |
| 879 | nodemask_t active_nodes; | 878 | nodemask_t active_nodes; |
| @@ -901,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p) | |||
| 901 | return p->numa_group ? p->numa_group->gid : 0; | 900 | return p->numa_group ? p->numa_group->gid : 0; |
| 902 | } | 901 | } |
| 903 | 902 | ||
| 904 | static inline int task_faults_idx(int nid, int priv) | 903 | /* |
| 904 | * The averaged statistics, shared & private, memory & cpu, | ||
| 905 | * occupy the first half of the array. The second half of the | ||
| 906 | * array is for current counters, which are averaged into the | ||
| 907 | * first set by task_numa_placement. | ||
| 908 | */ | ||
| 909 | static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) | ||
| 905 | { | 910 | { |
| 906 | return NR_NUMA_HINT_FAULT_TYPES * nid + priv; | 911 | return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; |
| 907 | } | 912 | } |
| 908 | 913 | ||
| 909 | static inline unsigned long task_faults(struct task_struct *p, int nid) | 914 | static inline unsigned long task_faults(struct task_struct *p, int nid) |
| 910 | { | 915 | { |
| 911 | if (!p->numa_faults_memory) | 916 | if (!p->numa_faults) |
| 912 | return 0; | 917 | return 0; |
| 913 | 918 | ||
| 914 | return p->numa_faults_memory[task_faults_idx(nid, 0)] + | 919 | return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + |
| 915 | p->numa_faults_memory[task_faults_idx(nid, 1)]; | 920 | p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; |
| 916 | } | 921 | } |
| 917 | 922 | ||
| 918 | static inline unsigned long group_faults(struct task_struct *p, int nid) | 923 | static inline unsigned long group_faults(struct task_struct *p, int nid) |
| @@ -920,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) | |||
| 920 | if (!p->numa_group) | 925 | if (!p->numa_group) |
| 921 | return 0; | 926 | return 0; |
| 922 | 927 | ||
| 923 | return p->numa_group->faults[task_faults_idx(nid, 0)] + | 928 | return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + |
| 924 | p->numa_group->faults[task_faults_idx(nid, 1)]; | 929 | p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; |
| 925 | } | 930 | } |
| 926 | 931 | ||
| 927 | static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) | 932 | static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) |
| 928 | { | 933 | { |
| 929 | return group->faults_cpu[task_faults_idx(nid, 0)] + | 934 | return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + |
| 930 | group->faults_cpu[task_faults_idx(nid, 1)]; | 935 | group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; |
| 936 | } | ||
| 937 | |||
| 938 | /* Handle placement on systems where not all nodes are directly connected. */ | ||
| 939 | static unsigned long score_nearby_nodes(struct task_struct *p, int nid, | ||
| 940 | int maxdist, bool task) | ||
| 941 | { | ||
| 942 | unsigned long score = 0; | ||
| 943 | int node; | ||
| 944 | |||
| 945 | /* | ||
| 946 | * All nodes are directly connected, and the same distance | ||
| 947 | * from each other. No need for fancy placement algorithms. | ||
| 948 | */ | ||
| 949 | if (sched_numa_topology_type == NUMA_DIRECT) | ||
| 950 | return 0; | ||
| 951 | |||
| 952 | /* | ||
| 953 | * This code is called for each node, introducing N^2 complexity, | ||
| 954 | * which should be ok given the number of nodes rarely exceeds 8. | ||
| 955 | */ | ||
| 956 | for_each_online_node(node) { | ||
| 957 | unsigned long faults; | ||
| 958 | int dist = node_distance(nid, node); | ||
| 959 | |||
| 960 | /* | ||
| 961 | * The furthest away nodes in the system are not interesting | ||
| 962 | * for placement; nid was already counted. | ||
| 963 | */ | ||
| 964 | if (dist == sched_max_numa_distance || node == nid) | ||
| 965 | continue; | ||
| 966 | |||
| 967 | /* | ||
| 968 | * On systems with a backplane NUMA topology, compare groups | ||
| 969 | * of nodes, and move tasks towards the group with the most | ||
| 970 | * memory accesses. When comparing two nodes at distance | ||
| 971 | * "hoplimit", only nodes closer by than "hoplimit" are part | ||
| 972 | * of each group. Skip other nodes. | ||
| 973 | */ | ||
| 974 | if (sched_numa_topology_type == NUMA_BACKPLANE && | ||
| 975 | dist > maxdist) | ||
| 976 | continue; | ||
| 977 | |||
| 978 | /* Add up the faults from nearby nodes. */ | ||
| 979 | if (task) | ||
| 980 | faults = task_faults(p, node); | ||
| 981 | else | ||
| 982 | faults = group_faults(p, node); | ||
| 983 | |||
| 984 | /* | ||
| 985 | * On systems with a glueless mesh NUMA topology, there are | ||
| 986 | * no fixed "groups of nodes". Instead, nodes that are not | ||
| 987 | * directly connected bounce traffic through intermediate | ||
| 988 | * nodes; a numa_group can occupy any set of nodes. | ||
| 989 | * The further away a node is, the less the faults count. | ||
| 990 | * This seems to result in good task placement. | ||
| 991 | */ | ||
| 992 | if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { | ||
| 993 | faults *= (sched_max_numa_distance - dist); | ||
| 994 | faults /= (sched_max_numa_distance - LOCAL_DISTANCE); | ||
| 995 | } | ||
| 996 | |||
| 997 | score += faults; | ||
| 998 | } | ||
| 999 | |||
| 1000 | return score; | ||
| 931 | } | 1001 | } |
| 932 | 1002 | ||
| 933 | /* | 1003 | /* |
| @@ -936,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) | |||
| 936 | * larger multiplier, in order to group tasks together that are almost | 1006 | * larger multiplier, in order to group tasks together that are almost |
| 937 | * evenly spread out between numa nodes. | 1007 | * evenly spread out between numa nodes. |
| 938 | */ | 1008 | */ |
| 939 | static inline unsigned long task_weight(struct task_struct *p, int nid) | 1009 | static inline unsigned long task_weight(struct task_struct *p, int nid, |
| 1010 | int dist) | ||
| 940 | { | 1011 | { |
| 941 | unsigned long total_faults; | 1012 | unsigned long faults, total_faults; |
| 942 | 1013 | ||
| 943 | if (!p->numa_faults_memory) | 1014 | if (!p->numa_faults) |
| 944 | return 0; | 1015 | return 0; |
| 945 | 1016 | ||
| 946 | total_faults = p->total_numa_faults; | 1017 | total_faults = p->total_numa_faults; |
| @@ -948,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) | |||
| 948 | if (!total_faults) | 1019 | if (!total_faults) |
| 949 | return 0; | 1020 | return 0; |
| 950 | 1021 | ||
| 951 | return 1000 * task_faults(p, nid) / total_faults; | 1022 | faults = task_faults(p, nid); |
| 1023 | faults += score_nearby_nodes(p, nid, dist, true); | ||
| 1024 | |||
| 1025 | return 1000 * faults / total_faults; | ||
| 952 | } | 1026 | } |
| 953 | 1027 | ||
| 954 | static inline unsigned long group_weight(struct task_struct *p, int nid) | 1028 | static inline unsigned long group_weight(struct task_struct *p, int nid, |
| 1029 | int dist) | ||
| 955 | { | 1030 | { |
| 956 | if (!p->numa_group || !p->numa_group->total_faults) | 1031 | unsigned long faults, total_faults; |
| 1032 | |||
| 1033 | if (!p->numa_group) | ||
| 1034 | return 0; | ||
| 1035 | |||
| 1036 | total_faults = p->numa_group->total_faults; | ||
| 1037 | |||
| 1038 | if (!total_faults) | ||
| 957 | return 0; | 1039 | return 0; |
| 958 | 1040 | ||
| 959 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; | 1041 | faults = group_faults(p, nid); |
| 1042 | faults += score_nearby_nodes(p, nid, dist, false); | ||
| 1043 | |||
| 1044 | return 1000 * faults / total_faults; | ||
| 960 | } | 1045 | } |
| 961 | 1046 | ||
| 962 | bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | 1047 | bool should_numa_migrate_memory(struct task_struct *p, struct page * page, |
| @@ -1089,6 +1174,7 @@ struct task_numa_env { | |||
| 1089 | struct numa_stats src_stats, dst_stats; | 1174 | struct numa_stats src_stats, dst_stats; |
| 1090 | 1175 | ||
| 1091 | int imbalance_pct; | 1176 | int imbalance_pct; |
| 1177 | int dist; | ||
| 1092 | 1178 | ||
| 1093 | struct task_struct *best_task; | 1179 | struct task_struct *best_task; |
| 1094 | long best_imp; | 1180 | long best_imp; |
| @@ -1168,6 +1254,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1168 | long load; | 1254 | long load; |
| 1169 | long imp = env->p->numa_group ? groupimp : taskimp; | 1255 | long imp = env->p->numa_group ? groupimp : taskimp; |
| 1170 | long moveimp = imp; | 1256 | long moveimp = imp; |
| 1257 | int dist = env->dist; | ||
| 1171 | 1258 | ||
| 1172 | rcu_read_lock(); | 1259 | rcu_read_lock(); |
| 1173 | 1260 | ||
| @@ -1208,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1208 | * in any group then look only at task weights. | 1295 | * in any group then look only at task weights. |
| 1209 | */ | 1296 | */ |
| 1210 | if (cur->numa_group == env->p->numa_group) { | 1297 | if (cur->numa_group == env->p->numa_group) { |
| 1211 | imp = taskimp + task_weight(cur, env->src_nid) - | 1298 | imp = taskimp + task_weight(cur, env->src_nid, dist) - |
| 1212 | task_weight(cur, env->dst_nid); | 1299 | task_weight(cur, env->dst_nid, dist); |
| 1213 | /* | 1300 | /* |
| 1214 | * Add some hysteresis to prevent swapping the | 1301 | * Add some hysteresis to prevent swapping the |
| 1215 | * tasks within a group over tiny differences. | 1302 | * tasks within a group over tiny differences. |
| @@ -1223,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1223 | * instead. | 1310 | * instead. |
| 1224 | */ | 1311 | */ |
| 1225 | if (cur->numa_group) | 1312 | if (cur->numa_group) |
| 1226 | imp += group_weight(cur, env->src_nid) - | 1313 | imp += group_weight(cur, env->src_nid, dist) - |
| 1227 | group_weight(cur, env->dst_nid); | 1314 | group_weight(cur, env->dst_nid, dist); |
| 1228 | else | 1315 | else |
| 1229 | imp += task_weight(cur, env->src_nid) - | 1316 | imp += task_weight(cur, env->src_nid, dist) - |
| 1230 | task_weight(cur, env->dst_nid); | 1317 | task_weight(cur, env->dst_nid, dist); |
| 1231 | } | 1318 | } |
| 1232 | } | 1319 | } |
| 1233 | 1320 | ||
| @@ -1326,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1326 | }; | 1413 | }; |
| 1327 | struct sched_domain *sd; | 1414 | struct sched_domain *sd; |
| 1328 | unsigned long taskweight, groupweight; | 1415 | unsigned long taskweight, groupweight; |
| 1329 | int nid, ret; | 1416 | int nid, ret, dist; |
| 1330 | long taskimp, groupimp; | 1417 | long taskimp, groupimp; |
| 1331 | 1418 | ||
| 1332 | /* | 1419 | /* |
| @@ -1354,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1354 | return -EINVAL; | 1441 | return -EINVAL; |
| 1355 | } | 1442 | } |
| 1356 | 1443 | ||
| 1357 | taskweight = task_weight(p, env.src_nid); | ||
| 1358 | groupweight = group_weight(p, env.src_nid); | ||
| 1359 | update_numa_stats(&env.src_stats, env.src_nid); | ||
| 1360 | env.dst_nid = p->numa_preferred_nid; | 1444 | env.dst_nid = p->numa_preferred_nid; |
| 1361 | taskimp = task_weight(p, env.dst_nid) - taskweight; | 1445 | dist = env.dist = node_distance(env.src_nid, env.dst_nid); |
| 1362 | groupimp = group_weight(p, env.dst_nid) - groupweight; | 1446 | taskweight = task_weight(p, env.src_nid, dist); |
| 1447 | groupweight = group_weight(p, env.src_nid, dist); | ||
| 1448 | update_numa_stats(&env.src_stats, env.src_nid); | ||
| 1449 | taskimp = task_weight(p, env.dst_nid, dist) - taskweight; | ||
| 1450 | groupimp = group_weight(p, env.dst_nid, dist) - groupweight; | ||
| 1363 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1451 | update_numa_stats(&env.dst_stats, env.dst_nid); |
| 1364 | 1452 | ||
| 1365 | /* Try to find a spot on the preferred nid. */ | 1453 | /* Try to find a spot on the preferred nid. */ |
| 1366 | task_numa_find_cpu(&env, taskimp, groupimp); | 1454 | task_numa_find_cpu(&env, taskimp, groupimp); |
| 1367 | 1455 | ||
| 1368 | /* No space available on the preferred nid. Look elsewhere. */ | 1456 | /* |
| 1369 | if (env.best_cpu == -1) { | 1457 | * Look at other nodes in these cases: |
| 1458 | * - there is no space available on the preferred_nid | ||
| 1459 | * - the task is part of a numa_group that is interleaved across | ||
| 1460 | * multiple NUMA nodes; in order to better consolidate the group, | ||
| 1461 | * we need to check other locations. | ||
| 1462 | */ | ||
| 1463 | if (env.best_cpu == -1 || (p->numa_group && | ||
| 1464 | nodes_weight(p->numa_group->active_nodes) > 1)) { | ||
| 1370 | for_each_online_node(nid) { | 1465 | for_each_online_node(nid) { |
| 1371 | if (nid == env.src_nid || nid == p->numa_preferred_nid) | 1466 | if (nid == env.src_nid || nid == p->numa_preferred_nid) |
| 1372 | continue; | 1467 | continue; |
| 1373 | 1468 | ||
| 1469 | dist = node_distance(env.src_nid, env.dst_nid); | ||
| 1470 | if (sched_numa_topology_type == NUMA_BACKPLANE && | ||
| 1471 | dist != env.dist) { | ||
| 1472 | taskweight = task_weight(p, env.src_nid, dist); | ||
| 1473 | groupweight = group_weight(p, env.src_nid, dist); | ||
| 1474 | } | ||
| 1475 | |||
| 1374 | /* Only consider nodes where both task and groups benefit */ | 1476 | /* Only consider nodes where both task and groups benefit */ |
| 1375 | taskimp = task_weight(p, nid) - taskweight; | 1477 | taskimp = task_weight(p, nid, dist) - taskweight; |
| 1376 | groupimp = group_weight(p, nid) - groupweight; | 1478 | groupimp = group_weight(p, nid, dist) - groupweight; |
| 1377 | if (taskimp < 0 && groupimp < 0) | 1479 | if (taskimp < 0 && groupimp < 0) |
| 1378 | continue; | 1480 | continue; |
| 1379 | 1481 | ||
| 1482 | env.dist = dist; | ||
| 1380 | env.dst_nid = nid; | 1483 | env.dst_nid = nid; |
| 1381 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1484 | update_numa_stats(&env.dst_stats, env.dst_nid); |
| 1382 | task_numa_find_cpu(&env, taskimp, groupimp); | 1485 | task_numa_find_cpu(&env, taskimp, groupimp); |
| @@ -1431,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
| 1431 | unsigned long interval = HZ; | 1534 | unsigned long interval = HZ; |
| 1432 | 1535 | ||
| 1433 | /* This task has no NUMA fault statistics yet */ | 1536 | /* This task has no NUMA fault statistics yet */ |
| 1434 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) | 1537 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) |
| 1435 | return; | 1538 | return; |
| 1436 | 1539 | ||
| 1437 | /* Periodically retry migrating the task to the preferred node */ | 1540 | /* Periodically retry migrating the task to the preferred node */ |
| @@ -1580,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
| 1580 | return delta; | 1683 | return delta; |
| 1581 | } | 1684 | } |
| 1582 | 1685 | ||
| 1686 | /* | ||
| 1687 | * Determine the preferred nid for a task in a numa_group. This needs to | ||
| 1688 | * be done in a way that produces consistent results with group_weight, | ||
| 1689 | * otherwise workloads might not converge. | ||
| 1690 | */ | ||
| 1691 | static int preferred_group_nid(struct task_struct *p, int nid) | ||
| 1692 | { | ||
| 1693 | nodemask_t nodes; | ||
| 1694 | int dist; | ||
| 1695 | |||
| 1696 | /* Direct connections between all NUMA nodes. */ | ||
| 1697 | if (sched_numa_topology_type == NUMA_DIRECT) | ||
| 1698 | return nid; | ||
| 1699 | |||
| 1700 | /* | ||
| 1701 | * On a system with glueless mesh NUMA topology, group_weight | ||
| 1702 | * scores nodes according to the number of NUMA hinting faults on | ||
| 1703 | * both the node itself, and on nearby nodes. | ||
| 1704 | */ | ||
| 1705 | if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { | ||
| 1706 | unsigned long score, max_score = 0; | ||
| 1707 | int node, max_node = nid; | ||
| 1708 | |||
| 1709 | dist = sched_max_numa_distance; | ||
| 1710 | |||
| 1711 | for_each_online_node(node) { | ||
| 1712 | score = group_weight(p, node, dist); | ||
| 1713 | if (score > max_score) { | ||
| 1714 | max_score = score; | ||
| 1715 | max_node = node; | ||
| 1716 | } | ||
| 1717 | } | ||
| 1718 | return max_node; | ||
| 1719 | } | ||
| 1720 | |||
| 1721 | /* | ||
| 1722 | * Finding the preferred nid in a system with NUMA backplane | ||
| 1723 | * interconnect topology is more involved. The goal is to locate | ||
| 1724 | * tasks from numa_groups near each other in the system, and | ||
| 1725 | * untangle workloads from different sides of the system. This requires | ||
| 1726 | * searching down the hierarchy of node groups, recursively searching | ||
| 1727 | * inside the highest scoring group of nodes. The nodemask tricks | ||
| 1728 | * keep the complexity of the search down. | ||
| 1729 | */ | ||
| 1730 | nodes = node_online_map; | ||
| 1731 | for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { | ||
| 1732 | unsigned long max_faults = 0; | ||
| 1733 | nodemask_t max_group; | ||
| 1734 | int a, b; | ||
| 1735 | |||
| 1736 | /* Are there nodes at this distance from each other? */ | ||
| 1737 | if (!find_numa_distance(dist)) | ||
| 1738 | continue; | ||
| 1739 | |||
| 1740 | for_each_node_mask(a, nodes) { | ||
| 1741 | unsigned long faults = 0; | ||
| 1742 | nodemask_t this_group; | ||
| 1743 | nodes_clear(this_group); | ||
| 1744 | |||
| 1745 | /* Sum group's NUMA faults; includes a==b case. */ | ||
| 1746 | for_each_node_mask(b, nodes) { | ||
| 1747 | if (node_distance(a, b) < dist) { | ||
| 1748 | faults += group_faults(p, b); | ||
| 1749 | node_set(b, this_group); | ||
| 1750 | node_clear(b, nodes); | ||
| 1751 | } | ||
| 1752 | } | ||
| 1753 | |||
| 1754 | /* Remember the top group. */ | ||
| 1755 | if (faults > max_faults) { | ||
| 1756 | max_faults = faults; | ||
| 1757 | max_group = this_group; | ||
| 1758 | /* | ||
| 1759 | * subtle: at the smallest distance there is | ||
| 1760 | * just one node left in each "group", the | ||
| 1761 | * winner is the preferred nid. | ||
| 1762 | */ | ||
| 1763 | nid = a; | ||
| 1764 | } | ||
| 1765 | } | ||
| 1766 | /* Next round, evaluate the nodes within max_group. */ | ||
| 1767 | nodes = max_group; | ||
| 1768 | } | ||
| 1769 | return nid; | ||
| 1770 | } | ||
| 1771 | |||
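The preferred_group_nid() helper added above narrows the candidate node set one distance level at a time, keeping only the highest-faulting group of nodes at each level until a single node remains. Below is a minimal userspace sketch of that narrowing idea; the 4-node distance table, the fault counts and the bitmask node sets are invented for illustration and are not taken from this patch.

    /* Userspace model: repeatedly keep the group of nodes (those within
     * the current distance of some anchor node) with the most faults,
     * then shrink the distance until one node is left. */
    #include <stdio.h>

    #define NR_NODES 4

    /* Hypothetical distance table and per-node fault counts. */
    static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 40, 40 },
        { 20, 10, 40, 40 },
        { 40, 40, 10, 20 },
        { 40, 40, 20, 10 },
    };
    static const unsigned long faults[NR_NODES] = { 10, 30, 25, 20 };

    int main(void)
    {
        unsigned int nodes = (1u << NR_NODES) - 1;  /* candidate node mask */
        int nid = 0, d;

        for (d = 40; d > 10; d -= 10) {
            unsigned long max_faults = 0;
            unsigned int max_group = 0;
            int a, b;

            for (a = 0; a < NR_NODES; a++) {
                unsigned long f = 0;
                unsigned int group = 0;

                if (!(nodes & (1u << a)))
                    continue;
                for (b = 0; b < NR_NODES; b++) {
                    if ((nodes & (1u << b)) && dist[a][b] < d) {
                        f += faults[b];
                        group |= 1u << b;
                    }
                }
                if (f > max_faults) {
                    max_faults = f;
                    max_group = group;
                    /* At the smallest distance each group holds a single
                     * node, so this ends up being the preferred nid. */
                    nid = a;
                }
            }
            nodes = max_group;
        }
        printf("preferred node: %d\n", nid);  /* prints 2 for this data */
        return 0;
    }

Unlike the kernel code, this sketch does not remove nodes from the candidate mask while grouping them, so overlapping groups can be scored more than once; for illustrating how the search converges, that simplification is harmless.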
| 1583 | static void task_numa_placement(struct task_struct *p) | 1772 | static void task_numa_placement(struct task_struct *p) |
| 1584 | { | 1773 | { |
| 1585 | int seq, nid, max_nid = -1, max_group_nid = -1; | 1774 | int seq, nid, max_nid = -1, max_group_nid = -1; |
| @@ -1607,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1607 | 1796 | ||
| 1608 | /* Find the node with the highest number of faults */ | 1797 | /* Find the node with the highest number of faults */ |
| 1609 | for_each_online_node(nid) { | 1798 | for_each_online_node(nid) { |
| 1799 | /* Keep track of the offsets in numa_faults array */ | ||
| 1800 | int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; | ||
| 1610 | unsigned long faults = 0, group_faults = 0; | 1801 | unsigned long faults = 0, group_faults = 0; |
| 1611 | int priv, i; | 1802 | int priv; |
| 1612 | 1803 | ||
| 1613 | for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { | 1804 | for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { |
| 1614 | long diff, f_diff, f_weight; | 1805 | long diff, f_diff, f_weight; |
| 1615 | 1806 | ||
| 1616 | i = task_faults_idx(nid, priv); | 1807 | mem_idx = task_faults_idx(NUMA_MEM, nid, priv); |
| 1808 | membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); | ||
| 1809 | cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); | ||
| 1810 | cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); | ||
| 1617 | 1811 | ||
| 1618 | /* Decay existing window, copy faults since last scan */ | 1812 | /* Decay existing window, copy faults since last scan */ |
| 1619 | diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; | 1813 | diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; |
| 1620 | fault_types[priv] += p->numa_faults_buffer_memory[i]; | 1814 | fault_types[priv] += p->numa_faults[membuf_idx]; |
| 1621 | p->numa_faults_buffer_memory[i] = 0; | 1815 | p->numa_faults[membuf_idx] = 0; |
| 1622 | 1816 | ||
| 1623 | /* | 1817 | /* |
| 1624 | * Normalize the faults_from, so all tasks in a group | 1818 | * Normalize the faults_from, so all tasks in a group |
| @@ -1628,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1628 | * faults are less important. | 1822 | * faults are less important. |
| 1629 | */ | 1823 | */ |
| 1630 | f_weight = div64_u64(runtime << 16, period + 1); | 1824 | f_weight = div64_u64(runtime << 16, period + 1); |
| 1631 | f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / | 1825 | f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / |
| 1632 | (total_faults + 1); | 1826 | (total_faults + 1); |
| 1633 | f_diff = f_weight - p->numa_faults_cpu[i] / 2; | 1827 | f_diff = f_weight - p->numa_faults[cpu_idx] / 2; |
| 1634 | p->numa_faults_buffer_cpu[i] = 0; | 1828 | p->numa_faults[cpubuf_idx] = 0; |
| 1635 | 1829 | ||
| 1636 | p->numa_faults_memory[i] += diff; | 1830 | p->numa_faults[mem_idx] += diff; |
| 1637 | p->numa_faults_cpu[i] += f_diff; | 1831 | p->numa_faults[cpu_idx] += f_diff; |
| 1638 | faults += p->numa_faults_memory[i]; | 1832 | faults += p->numa_faults[mem_idx]; |
| 1639 | p->total_numa_faults += diff; | 1833 | p->total_numa_faults += diff; |
| 1640 | if (p->numa_group) { | 1834 | if (p->numa_group) { |
| 1641 | /* safe because we can only change our own group */ | 1835 | /* |
| 1642 | p->numa_group->faults[i] += diff; | 1836 | * safe because we can only change our own group |
| 1643 | p->numa_group->faults_cpu[i] += f_diff; | 1837 | * |
| 1838 | * mem_idx represents the offset for a given | ||
| 1839 | * nid and priv in a specific region because it | ||
| 1840 | * is at the beginning of the numa_faults array. | ||
| 1841 | */ | ||
| 1842 | p->numa_group->faults[mem_idx] += diff; | ||
| 1843 | p->numa_group->faults_cpu[mem_idx] += f_diff; | ||
| 1644 | p->numa_group->total_faults += diff; | 1844 | p->numa_group->total_faults += diff; |
| 1645 | group_faults += p->numa_group->faults[i]; | 1845 | group_faults += p->numa_group->faults[mem_idx]; |
| 1646 | } | 1846 | } |
| 1647 | } | 1847 | } |
| 1648 | 1848 | ||
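The decay step above folds the per-scan buffer into the long-term counter as mem = mem/2 + membuf, i.e. an exponentially decayed running count of faults. A small standalone C illustration of how that count converges, using a made-up constant fault rate:

    #include <stdio.h>

    int main(void)
    {
        unsigned long mem = 0;   /* decayed long-term fault count */
        unsigned long membuf;    /* faults recorded since the last scan */
        int scan;

        for (scan = 0; scan < 10; scan++) {
            long diff;

            membuf = 100;        /* pretend 100 faults per scan window */
            diff = membuf - mem / 2;
            mem += diff;         /* same as mem = mem/2 + membuf */
            printf("scan %2d: mem = %lu\n", scan, mem);
        }
        return 0;                /* converges towards 200 = 2 * rate */
    }

The running count settles at roughly twice the per-window fault rate, so recent scan windows dominate while old behaviour fades out.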
| @@ -1662,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1662 | if (p->numa_group) { | 1862 | if (p->numa_group) { |
| 1663 | update_numa_active_node_mask(p->numa_group); | 1863 | update_numa_active_node_mask(p->numa_group); |
| 1664 | spin_unlock_irq(group_lock); | 1864 | spin_unlock_irq(group_lock); |
| 1665 | max_nid = max_group_nid; | 1865 | max_nid = preferred_group_nid(p, max_group_nid); |
| 1666 | } | 1866 | } |
| 1667 | 1867 | ||
| 1668 | if (max_faults) { | 1868 | if (max_faults) { |
| @@ -1705,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1705 | 1905 | ||
| 1706 | atomic_set(&grp->refcount, 1); | 1906 | atomic_set(&grp->refcount, 1); |
| 1707 | spin_lock_init(&grp->lock); | 1907 | spin_lock_init(&grp->lock); |
| 1708 | INIT_LIST_HEAD(&grp->task_list); | ||
| 1709 | grp->gid = p->pid; | 1908 | grp->gid = p->pid; |
| 1710 | /* Second half of the array tracks nids where faults happen */ | 1909 | /* Second half of the array tracks nids where faults happen */ |
| 1711 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * | 1910 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * |
| @@ -1714,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1714 | node_set(task_node(current), grp->active_nodes); | 1913 | node_set(task_node(current), grp->active_nodes); |
| 1715 | 1914 | ||
| 1716 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) | 1915 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
| 1717 | grp->faults[i] = p->numa_faults_memory[i]; | 1916 | grp->faults[i] = p->numa_faults[i]; |
| 1718 | 1917 | ||
| 1719 | grp->total_faults = p->total_numa_faults; | 1918 | grp->total_faults = p->total_numa_faults; |
| 1720 | 1919 | ||
| 1721 | list_add(&p->numa_entry, &grp->task_list); | ||
| 1722 | grp->nr_tasks++; | 1920 | grp->nr_tasks++; |
| 1723 | rcu_assign_pointer(p->numa_group, grp); | 1921 | rcu_assign_pointer(p->numa_group, grp); |
| 1724 | } | 1922 | } |
| @@ -1773,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1773 | double_lock_irq(&my_grp->lock, &grp->lock); | 1971 | double_lock_irq(&my_grp->lock, &grp->lock); |
| 1774 | 1972 | ||
| 1775 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { | 1973 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { |
| 1776 | my_grp->faults[i] -= p->numa_faults_memory[i]; | 1974 | my_grp->faults[i] -= p->numa_faults[i]; |
| 1777 | grp->faults[i] += p->numa_faults_memory[i]; | 1975 | grp->faults[i] += p->numa_faults[i]; |
| 1778 | } | 1976 | } |
| 1779 | my_grp->total_faults -= p->total_numa_faults; | 1977 | my_grp->total_faults -= p->total_numa_faults; |
| 1780 | grp->total_faults += p->total_numa_faults; | 1978 | grp->total_faults += p->total_numa_faults; |
| 1781 | 1979 | ||
| 1782 | list_move(&p->numa_entry, &grp->task_list); | ||
| 1783 | my_grp->nr_tasks--; | 1980 | my_grp->nr_tasks--; |
| 1784 | grp->nr_tasks++; | 1981 | grp->nr_tasks++; |
| 1785 | 1982 | ||
| @@ -1799,27 +1996,23 @@ no_join: | |||
| 1799 | void task_numa_free(struct task_struct *p) | 1996 | void task_numa_free(struct task_struct *p) |
| 1800 | { | 1997 | { |
| 1801 | struct numa_group *grp = p->numa_group; | 1998 | struct numa_group *grp = p->numa_group; |
| 1802 | void *numa_faults = p->numa_faults_memory; | 1999 | void *numa_faults = p->numa_faults; |
| 1803 | unsigned long flags; | 2000 | unsigned long flags; |
| 1804 | int i; | 2001 | int i; |
| 1805 | 2002 | ||
| 1806 | if (grp) { | 2003 | if (grp) { |
| 1807 | spin_lock_irqsave(&grp->lock, flags); | 2004 | spin_lock_irqsave(&grp->lock, flags); |
| 1808 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) | 2005 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
| 1809 | grp->faults[i] -= p->numa_faults_memory[i]; | 2006 | grp->faults[i] -= p->numa_faults[i]; |
| 1810 | grp->total_faults -= p->total_numa_faults; | 2007 | grp->total_faults -= p->total_numa_faults; |
| 1811 | 2008 | ||
| 1812 | list_del(&p->numa_entry); | ||
| 1813 | grp->nr_tasks--; | 2009 | grp->nr_tasks--; |
| 1814 | spin_unlock_irqrestore(&grp->lock, flags); | 2010 | spin_unlock_irqrestore(&grp->lock, flags); |
| 1815 | RCU_INIT_POINTER(p->numa_group, NULL); | 2011 | RCU_INIT_POINTER(p->numa_group, NULL); |
| 1816 | put_numa_group(grp); | 2012 | put_numa_group(grp); |
| 1817 | } | 2013 | } |
| 1818 | 2014 | ||
| 1819 | p->numa_faults_memory = NULL; | 2015 | p->numa_faults = NULL; |
| 1820 | p->numa_faults_buffer_memory = NULL; | ||
| 1821 | p->numa_faults_cpu= NULL; | ||
| 1822 | p->numa_faults_buffer_cpu = NULL; | ||
| 1823 | kfree(numa_faults); | 2016 | kfree(numa_faults); |
| 1824 | } | 2017 | } |
| 1825 | 2018 | ||
| @@ -1842,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 1842 | return; | 2035 | return; |
| 1843 | 2036 | ||
| 1844 | /* Allocate buffer to track faults on a per-node basis */ | 2037 | /* Allocate buffer to track faults on a per-node basis */ |
| 1845 | if (unlikely(!p->numa_faults_memory)) { | 2038 | if (unlikely(!p->numa_faults)) { |
| 1846 | int size = sizeof(*p->numa_faults_memory) * | 2039 | int size = sizeof(*p->numa_faults) * |
| 1847 | NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; | 2040 | NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; |
| 1848 | 2041 | ||
| 1849 | p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); | 2042 | p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); |
| 1850 | if (!p->numa_faults_memory) | 2043 | if (!p->numa_faults) |
| 1851 | return; | 2044 | return; |
| 1852 | 2045 | ||
| 1853 | BUG_ON(p->numa_faults_buffer_memory); | ||
| 1854 | /* | ||
| 1855 | * The averaged statistics, shared & private, memory & cpu, | ||
| 1856 | * occupy the first half of the array. The second half of the | ||
| 1857 | * array is for current counters, which are averaged into the | ||
| 1858 | * first set by task_numa_placement. | ||
| 1859 | */ | ||
| 1860 | p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); | ||
| 1861 | p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); | ||
| 1862 | p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); | ||
| 1863 | p->total_numa_faults = 0; | 2046 | p->total_numa_faults = 0; |
| 1864 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | 2047 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); |
| 1865 | } | 2048 | } |
| @@ -1899,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 1899 | if (migrated) | 2082 | if (migrated) |
| 1900 | p->numa_pages_migrated += pages; | 2083 | p->numa_pages_migrated += pages; |
| 1901 | 2084 | ||
| 1902 | p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; | 2085 | p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; |
| 1903 | p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; | 2086 | p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; |
| 1904 | p->numa_faults_locality[local] += pages; | 2087 | p->numa_faults_locality[local] += pages; |
| 1905 | } | 2088 | } |
| 1906 | 2089 | ||
| @@ -3822,6 +4005,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) | |||
| 3822 | 4005 | ||
| 3823 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | 4006 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) |
| 3824 | { | 4007 | { |
| 4008 | /* init_cfs_bandwidth() was not called */ | ||
| 4009 | if (!cfs_b->throttled_cfs_rq.next) | ||
| 4010 | return; | ||
| 4011 | |||
| 3825 | hrtimer_cancel(&cfs_b->period_timer); | 4012 | hrtimer_cancel(&cfs_b->period_timer); |
| 3826 | hrtimer_cancel(&cfs_b->slack_timer); | 4013 | hrtimer_cancel(&cfs_b->slack_timer); |
| 3827 | } | 4014 | } |
| @@ -4241,7 +4428,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 4241 | * wl = S * s'_i; see (2) | 4428 | * wl = S * s'_i; see (2) |
| 4242 | */ | 4429 | */ |
| 4243 | if (W > 0 && w < W) | 4430 | if (W > 0 && w < W) |
| 4244 | wl = (w * tg->shares) / W; | 4431 | wl = (w * (long)tg->shares) / W; |
| 4245 | else | 4432 | else |
| 4246 | wl = tg->shares; | 4433 | wl = tg->shares; |
| 4247 | 4434 | ||
| @@ -4469,7 +4656,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 4469 | latest_idle_timestamp = rq->idle_stamp; | 4656 | latest_idle_timestamp = rq->idle_stamp; |
| 4470 | shallowest_idle_cpu = i; | 4657 | shallowest_idle_cpu = i; |
| 4471 | } | 4658 | } |
| 4472 | } else { | 4659 | } else if (shallowest_idle_cpu == -1) { |
| 4473 | load = weighted_cpuload(i); | 4660 | load = weighted_cpuload(i); |
| 4474 | if (load < min_load || (load == min_load && i == this_cpu)) { | 4661 | if (load < min_load || (load == min_load && i == this_cpu)) { |
| 4475 | min_load = load; | 4662 | min_load = load; |
| @@ -4547,9 +4734,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 4547 | int want_affine = 0; | 4734 | int want_affine = 0; |
| 4548 | int sync = wake_flags & WF_SYNC; | 4735 | int sync = wake_flags & WF_SYNC; |
| 4549 | 4736 | ||
| 4550 | if (p->nr_cpus_allowed == 1) | ||
| 4551 | return prev_cpu; | ||
| 4552 | |||
| 4553 | if (sd_flag & SD_BALANCE_WAKE) | 4737 | if (sd_flag & SD_BALANCE_WAKE) |
| 4554 | want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | 4738 | want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); |
| 4555 | 4739 | ||
| @@ -5189,7 +5373,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | |||
| 5189 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | 5373 | struct numa_group *numa_group = rcu_dereference(p->numa_group); |
| 5190 | int src_nid, dst_nid; | 5374 | int src_nid, dst_nid; |
| 5191 | 5375 | ||
| 5192 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || | 5376 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || |
| 5193 | !(env->sd->flags & SD_NUMA)) { | 5377 | !(env->sd->flags & SD_NUMA)) { |
| 5194 | return false; | 5378 | return false; |
| 5195 | } | 5379 | } |
| @@ -5228,7 +5412,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
| 5228 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | 5412 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) |
| 5229 | return false; | 5413 | return false; |
| 5230 | 5414 | ||
| 5231 | if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) | 5415 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) |
| 5232 | return false; | 5416 | return false; |
| 5233 | 5417 | ||
| 5234 | src_nid = cpu_to_node(env->src_cpu); | 5418 | src_nid = cpu_to_node(env->src_cpu); |
| @@ -6172,8 +6356,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
| 6172 | * with a large weight task outweighs the tasks on the system). | 6356 | * with a large weight task outweighs the tasks on the system). |
| 6173 | */ | 6357 | */ |
| 6174 | if (prefer_sibling && sds->local && | 6358 | if (prefer_sibling && sds->local && |
| 6175 | sds->local_stat.group_has_free_capacity) | 6359 | sds->local_stat.group_has_free_capacity) { |
| 6176 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); | 6360 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); |
| 6361 | sgs->group_type = group_classify(sg, sgs); | ||
| 6362 | } | ||
| 6177 | 6363 | ||
| 6178 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6364 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
| 6179 | sds->busiest = sg; | 6365 | sds->busiest = sg; |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 20bca398084a..ee15f5a0d1c1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
| 1301 | struct task_struct *curr; | 1301 | struct task_struct *curr; |
| 1302 | struct rq *rq; | 1302 | struct rq *rq; |
| 1303 | 1303 | ||
| 1304 | if (p->nr_cpus_allowed == 1) | ||
| 1305 | goto out; | ||
| 1306 | |||
| 1307 | /* For anything but wake ups, just return the task_cpu */ | 1304 | /* For anything but wake ups, just return the task_cpu */ |
| 1308 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | 1305 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) |
| 1309 | goto out; | 1306 | goto out; |
| @@ -1351,16 +1348,22 @@ out: | |||
| 1351 | 1348 | ||
| 1352 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1349 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
| 1353 | { | 1350 | { |
| 1354 | if (rq->curr->nr_cpus_allowed == 1) | 1351 | /* |
| 1352 | * Current can't be migrated, useless to reschedule, | ||
| 1353 | * let's hope p can move out. | ||
| 1354 | */ | ||
| 1355 | if (rq->curr->nr_cpus_allowed == 1 || | ||
| 1356 | !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) | ||
| 1355 | return; | 1357 | return; |
| 1356 | 1358 | ||
| 1359 | /* | ||
| 1360 | * p is migratable, so let's not schedule it and | ||
| 1361 | * see if it is pushed or pulled somewhere else. | ||
| 1362 | */ | ||
| 1357 | if (p->nr_cpus_allowed != 1 | 1363 | if (p->nr_cpus_allowed != 1 |
| 1358 | && cpupri_find(&rq->rd->cpupri, p, NULL)) | 1364 | && cpupri_find(&rq->rd->cpupri, p, NULL)) |
| 1359 | return; | 1365 | return; |
| 1360 | 1366 | ||
| 1361 | if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) | ||
| 1362 | return; | ||
| 1363 | |||
| 1364 | /* | 1367 | /* |
| 1365 | * There appears to be other cpus that can accept | 1368 | * There appears to be other cpus that can accept |
| 1366 | * current and none to run 'p', so lets reschedule | 1369 | * current and none to run 'p', so lets reschedule |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2df8ef067cc5..9a2a45c970e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -176,6 +176,25 @@ struct dl_bw { | |||
| 176 | u64 bw, total_bw; | 176 | u64 bw, total_bw; |
| 177 | }; | 177 | }; |
| 178 | 178 | ||
| 179 | static inline | ||
| 180 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
| 181 | { | ||
| 182 | dl_b->total_bw -= tsk_bw; | ||
| 183 | } | ||
| 184 | |||
| 185 | static inline | ||
| 186 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
| 187 | { | ||
| 188 | dl_b->total_bw += tsk_bw; | ||
| 189 | } | ||
| 190 | |||
| 191 | static inline | ||
| 192 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
| 193 | { | ||
| 194 | return dl_b->bw != -1 && | ||
| 195 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
| 196 | } | ||
| 197 | |||
| 179 | extern struct mutex sched_domains_mutex; | 198 | extern struct mutex sched_domains_mutex; |
| 180 | 199 | ||
| 181 | #ifdef CONFIG_CGROUP_SCHED | 200 | #ifdef CONFIG_CGROUP_SCHED |
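The __dl_add(), __dl_clear() and __dl_overflow() helpers added above implement the SCHED_DEADLINE admission bookkeeping: after swapping old_bw for new_bw, the summed task bandwidth must still fit under bw * cpus, with bw == -1 meaning "no limit". A userspace sketch of that check follows; the 1 << 20 fixed-point scale and the 95% limit are assumptions chosen for the example, not values quoted from this patch.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Bandwidths as fixed-point fractions of one CPU. */
    #define BW_SHIFT 20
    #define BW_UNIT  (1ULL << BW_SHIFT)

    struct dl_bw { int64_t bw; uint64_t total_bw; };

    static bool dl_overflow(struct dl_bw *dl_b, int cpus,
                            uint64_t old_bw, uint64_t new_bw)
    {
        /* bw == -1 means unlimited; otherwise the new total must fit. */
        return dl_b->bw != -1 &&
               (uint64_t)dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
    }

    int main(void)
    {
        /* Assume a 95% per-CPU limit and 50% already admitted. */
        struct dl_bw dl_b = {
            .bw       = BW_UNIT * 95 / 100,
            .total_bw = BW_UNIT / 2,
        };

        /* Admitting another 50% task on a 1-CPU domain exceeds 95%. */
        printf("overflow: %d\n", dl_overflow(&dl_b, 1, 0, BW_UNIT / 2));
        return 0;
    }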
| @@ -678,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq) | |||
| 678 | return rq->clock_task; | 697 | return rq->clock_task; |
| 679 | } | 698 | } |
| 680 | 699 | ||
| 700 | #ifdef CONFIG_NUMA | ||
| 701 | enum numa_topology_type { | ||
| 702 | NUMA_DIRECT, | ||
| 703 | NUMA_GLUELESS_MESH, | ||
| 704 | NUMA_BACKPLANE, | ||
| 705 | }; | ||
| 706 | extern enum numa_topology_type sched_numa_topology_type; | ||
| 707 | extern int sched_max_numa_distance; | ||
| 708 | extern bool find_numa_distance(int distance); | ||
| 709 | #endif | ||
| 710 | |||
| 681 | #ifdef CONFIG_NUMA_BALANCING | 711 | #ifdef CONFIG_NUMA_BALANCING |
| 712 | /* The regions in numa_faults array from task_struct */ | ||
| 713 | enum numa_faults_stats { | ||
| 714 | NUMA_MEM = 0, | ||
| 715 | NUMA_CPU, | ||
| 716 | NUMA_MEMBUF, | ||
| 717 | NUMA_CPUBUF | ||
| 718 | }; | ||
| 682 | extern void sched_setnuma(struct task_struct *p, int node); | 719 | extern void sched_setnuma(struct task_struct *p, int node); |
| 683 | extern int migrate_task_to(struct task_struct *p, int cpu); | 720 | extern int migrate_task_to(struct task_struct *p, int cpu); |
| 684 | extern int migrate_swap(struct task_struct *, struct task_struct *); | 721 | extern int migrate_swap(struct task_struct *, struct task_struct *); |
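The enum above names the four regions that now share a single task numa_faults[] allocation: the long-term memory and CPU counters plus their per-scan buffers. A userspace sketch of one plausible flat indexing scheme (region, then node, then private/shared slot) is below; nr_node_ids and the exact index formula used by fair.c are stand-ins for illustration, not quoted from the patch.

    #include <stdio.h>

    enum numa_faults_stats { NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };

    #define NR_NUMA_HINT_FAULT_TYPES 2   /* private and shared counters */
    #define NR_REGIONS               4

    static int nr_node_ids = 4;          /* stand-in for the real value */

    /* Flat index: pick the region, then the node, then the fault type. */
    static int faults_idx(enum numa_faults_stats s, int nid, int priv)
    {
        return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
    }

    int main(void)
    {
        int slots = NR_NUMA_HINT_FAULT_TYPES * NR_REGIONS * nr_node_ids;

        printf("slots in one allocation: %d\n", slots);
        printf("NUMA_MEMBUF, node 2, private -> index %d\n",
               faults_idx(NUMA_MEMBUF, 2, 1));
        return 0;
    }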
| @@ -1127,6 +1164,11 @@ struct sched_class { | |||
| 1127 | void (*task_fork) (struct task_struct *p); | 1164 | void (*task_fork) (struct task_struct *p); |
| 1128 | void (*task_dead) (struct task_struct *p); | 1165 | void (*task_dead) (struct task_struct *p); |
| 1129 | 1166 | ||
| 1167 | /* | ||
| 1168 | * The switched_from() call is allowed to drop rq->lock, therefore we | ||
| 1169 | * cannot assume the switched_from/switched_to pair is serialized by | ||
| 1170 | * rq->lock. They are however serialized by p->pi_lock. | ||
| 1171 | */ | ||
| 1130 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1172 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); |
| 1131 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1173 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
| 1132 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | 1174 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, |
| @@ -1504,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); | |||
| 1504 | extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); | 1546 | extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); |
| 1505 | extern void print_cfs_stats(struct seq_file *m, int cpu); | 1547 | extern void print_cfs_stats(struct seq_file *m, int cpu); |
| 1506 | extern void print_rt_stats(struct seq_file *m, int cpu); | 1548 | extern void print_rt_stats(struct seq_file *m, int cpu); |
| 1549 | extern void print_dl_stats(struct seq_file *m, int cpu); | ||
| 1507 | 1550 | ||
| 1508 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1551 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
| 1509 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1552 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 5a62915f47a8..852143a79f36 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
| 10 | #include <linux/wait.h> | 10 | #include <linux/wait.h> |
| 11 | #include <linux/hash.h> | 11 | #include <linux/hash.h> |
| 12 | #include <linux/kthread.h> | ||
| 12 | 13 | ||
| 13 | void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) | 14 | void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) |
| 14 | { | 15 | { |
| @@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * | |||
| 297 | } | 298 | } |
| 298 | EXPORT_SYMBOL(autoremove_wake_function); | 299 | EXPORT_SYMBOL(autoremove_wake_function); |
| 299 | 300 | ||
| 301 | static inline bool is_kthread_should_stop(void) | ||
| 302 | { | ||
| 303 | return (current->flags & PF_KTHREAD) && kthread_should_stop(); | ||
| 304 | } | ||
| 305 | |||
| 306 | /* | ||
| 307 | * DEFINE_WAIT_FUNC(wait, woken_wake_func); | ||
| 308 | * | ||
| 309 | * add_wait_queue(&wq, &wait); | ||
| 310 | * for (;;) { | ||
| 311 | * if (condition) | ||
| 312 | * break; | ||
| 313 | * | ||
| 314 | * p->state = mode; condition = true; | ||
| 315 | * smp_mb(); // A smp_wmb(); // C | ||
| 316 | * if (!(wait->flags & WQ_FLAG_WOKEN)) wait->flags |= WQ_FLAG_WOKEN; | ||
| 317 | * schedule() try_to_wake_up(); | ||
| 318 | * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ | ||
| 319 | * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; | ||
| 320 | * smp_mb() // B smp_wmb(); // C | ||
| 321 | * wait->flags |= WQ_FLAG_WOKEN; | ||
| 322 | * } | ||
| 323 | * remove_wait_queue(&wq, &wait); | ||
| 324 | * | ||
| 325 | */ | ||
| 326 | long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) | ||
| 327 | { | ||
| 328 | set_current_state(mode); /* A */ | ||
| 329 | /* | ||
| 330 | * The above implies an smp_mb(), which matches with the smp_wmb() from | ||
| 331 | * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must | ||
| 332 | * also observe all state before the wakeup. | ||
| 333 | */ | ||
| 334 | if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) | ||
| 335 | timeout = schedule_timeout(timeout); | ||
| 336 | __set_current_state(TASK_RUNNING); | ||
| 337 | |||
| 338 | /* | ||
| 339 | * The below implies an smp_mb(), it too pairs with the smp_wmb() from | ||
| 340 | * woken_wake_function() such that we must either observe the wait | ||
| 341 | * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss | ||
| 342 | * an event. | ||
| 343 | */ | ||
| 344 | set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ | ||
| 345 | |||
| 346 | return timeout; | ||
| 347 | } | ||
| 348 | EXPORT_SYMBOL(wait_woken); | ||
| 349 | |||
| 350 | int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) | ||
| 351 | { | ||
| 352 | /* | ||
| 353 | * Although this function is called under waitqueue lock, LOCK | ||
| 354 | * doesn't imply a write barrier and its users expect write | ||
| 355 | * barrier semantics on wakeup functions. The following | ||
| 356 | * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() | ||
| 357 | * and is paired with set_mb() in wait_woken(). | ||
| 358 | */ | ||
| 359 | smp_wmb(); /* C */ | ||
| 360 | wait->flags |= WQ_FLAG_WOKEN; | ||
| 361 | |||
| 362 | return default_wake_function(wait, mode, sync, key); | ||
| 363 | } | ||
| 364 | EXPORT_SYMBOL(woken_wake_function); | ||
| 365 | |||
| 300 | int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) | 366 | int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) |
| 301 | { | 367 | { |
| 302 | struct wait_bit_key *key = arg; | 368 | struct wait_bit_key *key = arg; |
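The wait_woken()/woken_wake_function() pair added above closes the classic wakeup race by tracking WQ_FLAG_WOKEN across the condition check and the sleep. The following is a hedged kernel-context sketch of a waiter using the pair; it only compiles inside a kernel tree, and the wait queue, flag and timeout handling are invented for the example (signal handling is omitted for brevity).

    #include <linux/wait.h>
    #include <linux/sched.h>
    #include <linux/types.h>
    #include <linux/errno.h>

    static DECLARE_WAIT_QUEUE_HEAD(example_wq);
    static bool example_ready;

    /* Waker side: example_ready = true; wake_up(&example_wq); which in
     * turn calls woken_wake_function() on the queued waiter. */

    static int example_wait(long timeout)
    {
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        add_wait_queue(&example_wq, &wait);
        while (!example_ready && timeout) {
            /* A wakeup that lands between the check above and the sleep
             * below sets WQ_FLAG_WOKEN, so wait_woken() returns at once
             * instead of sleeping through the event. */
            timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
        }
        remove_wait_queue(&example_wq, &wait);

        return example_ready ? 0 : -ETIMEDOUT;
    }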
diff --git a/kernel/signal.c b/kernel/signal.c index 8f0876f9f6dd..16a305295256 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -1275,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, | |||
| 1275 | local_irq_restore(*flags); | 1275 | local_irq_restore(*flags); |
| 1276 | break; | 1276 | break; |
| 1277 | } | 1277 | } |
| 1278 | 1278 | /* | |
| 1279 | * This sighand can be already freed and even reused, but | ||
| 1280 | * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which | ||
| 1281 | * initializes ->siglock: this slab can't go away, it has | ||
| 1282 | * the same object type, ->siglock can't be reinitialized. | ||
| 1283 | * | ||
| 1284 | * We need to ensure that tsk->sighand is still the same | ||
| 1285 | * after we take the lock; we can race with de_thread() or | ||
| 1286 | * __exit_signal(). In the latter case the next iteration | ||
| 1287 | * must see ->sighand == NULL. | ||
| 1288 | */ | ||
| 1279 | spin_lock(&sighand->siglock); | 1289 | spin_lock(&sighand->siglock); |
| 1280 | if (likely(sighand == tsk->sighand)) { | 1290 | if (likely(sighand == tsk->sighand)) { |
| 1281 | rcu_read_unlock(); | 1291 | rcu_read_unlock(); |
| @@ -1331,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) | |||
| 1331 | int error = -ESRCH; | 1341 | int error = -ESRCH; |
| 1332 | struct task_struct *p; | 1342 | struct task_struct *p; |
| 1333 | 1343 | ||
| 1334 | rcu_read_lock(); | 1344 | for (;;) { |
| 1335 | retry: | 1345 | rcu_read_lock(); |
| 1336 | p = pid_task(pid, PIDTYPE_PID); | 1346 | p = pid_task(pid, PIDTYPE_PID); |
| 1337 | if (p) { | 1347 | if (p) |
| 1338 | error = group_send_sig_info(sig, info, p); | 1348 | error = group_send_sig_info(sig, info, p); |
| 1339 | if (unlikely(error == -ESRCH)) | 1349 | rcu_read_unlock(); |
| 1340 | /* | 1350 | if (likely(!p || error != -ESRCH)) |
| 1341 | * The task was unhashed in between, try again. | 1351 | return error; |
| 1342 | * If it is dead, pid_task() will return NULL, | ||
| 1343 | * if we race with de_thread() it will find the | ||
| 1344 | * new leader. | ||
| 1345 | */ | ||
| 1346 | goto retry; | ||
| 1347 | } | ||
| 1348 | rcu_read_unlock(); | ||
| 1349 | 1352 | ||
| 1350 | return error; | 1353 | /* |
| 1354 | * The task was unhashed in between, try again. If it | ||
| 1355 | * is dead, pid_task() will return NULL, if we race with | ||
| 1356 | * de_thread() it will find the new leader. | ||
| 1357 | */ | ||
| 1358 | } | ||
| 1351 | } | 1359 | } |
| 1352 | 1360 | ||
| 1353 | int kill_proc_info(int sig, struct siginfo *info, pid_t pid) | 1361 | int kill_proc_info(int sig, struct siginfo *info, pid_t pid) |
| @@ -2748,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) | |||
| 2748 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) | 2756 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) |
| 2749 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); | 2757 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); |
| 2750 | #endif | 2758 | #endif |
| 2759 | #ifdef SEGV_BNDERR | ||
| 2760 | err |= __put_user(from->si_lower, &to->si_lower); | ||
| 2761 | err |= __put_user(from->si_upper, &to->si_upper); | ||
| 2762 | #endif | ||
| 2751 | break; | 2763 | break; |
| 2752 | case __SI_CHLD: | 2764 | case __SI_CHLD: |
| 2753 | err |= __put_user(from->si_pid, &to->si_pid); | 2765 | err |= __put_user(from->si_pid, &to->si_pid); |
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index eb89e1807408..f032fb5284e3 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
| @@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data) | |||
| 110 | set_current_state(TASK_INTERRUPTIBLE); | 110 | set_current_state(TASK_INTERRUPTIBLE); |
| 111 | preempt_disable(); | 111 | preempt_disable(); |
| 112 | if (kthread_should_stop()) { | 112 | if (kthread_should_stop()) { |
| 113 | set_current_state(TASK_RUNNING); | 113 | __set_current_state(TASK_RUNNING); |
| 114 | preempt_enable(); | 114 | preempt_enable(); |
| 115 | if (ht->cleanup) | 115 | if (ht->cleanup) |
| 116 | ht->cleanup(td->cpu, cpu_online(td->cpu)); | 116 | ht->cleanup(td->cpu, cpu_online(td->cpu)); |
| @@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data) | |||
| 136 | /* Check for state change setup */ | 136 | /* Check for state change setup */ |
| 137 | switch (td->status) { | 137 | switch (td->status) { |
| 138 | case HP_THREAD_NONE: | 138 | case HP_THREAD_NONE: |
| 139 | __set_current_state(TASK_RUNNING); | ||
| 139 | preempt_enable(); | 140 | preempt_enable(); |
| 140 | if (ht->setup) | 141 | if (ht->setup) |
| 141 | ht->setup(td->cpu); | 142 | ht->setup(td->cpu); |
| 142 | td->status = HP_THREAD_ACTIVE; | 143 | td->status = HP_THREAD_ACTIVE; |
| 143 | preempt_disable(); | 144 | continue; |
| 144 | break; | 145 | |
| 145 | case HP_THREAD_PARKED: | 146 | case HP_THREAD_PARKED: |
| 147 | __set_current_state(TASK_RUNNING); | ||
| 146 | preempt_enable(); | 148 | preempt_enable(); |
| 147 | if (ht->unpark) | 149 | if (ht->unpark) |
| 148 | ht->unpark(td->cpu); | 150 | ht->unpark(td->cpu); |
| 149 | td->status = HP_THREAD_ACTIVE; | 151 | td->status = HP_THREAD_ACTIVE; |
| 150 | preempt_disable(); | 152 | continue; |
| 151 | break; | ||
| 152 | } | 153 | } |
| 153 | 154 | ||
| 154 | if (!ht->thread_should_run(td->cpu)) { | 155 | if (!ht->thread_should_run(td->cpu)) { |
| 155 | preempt_enable(); | 156 | preempt_enable_no_resched(); |
| 156 | schedule(); | 157 | schedule(); |
| 157 | } else { | 158 | } else { |
| 158 | set_current_state(TASK_RUNNING); | 159 | __set_current_state(TASK_RUNNING); |
| 159 | preempt_enable(); | 160 | preempt_enable(); |
| 160 | ht->thread_fn(td->cpu); | 161 | ht->thread_fn(td->cpu); |
| 161 | } | 162 | } |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 0699add19164..501baa9ac1be 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -656,7 +656,7 @@ static void run_ksoftirqd(unsigned int cpu) | |||
| 656 | * in the task stack here. | 656 | * in the task stack here. |
| 657 | */ | 657 | */ |
| 658 | __do_softirq(); | 658 | __do_softirq(); |
| 659 | rcu_note_context_switch(cpu); | 659 | rcu_note_context_switch(); |
| 660 | local_irq_enable(); | 660 | local_irq_enable(); |
| 661 | cond_resched(); | 661 | cond_resched(); |
| 662 | return; | 662 | return; |
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 00fe55cc5a82..b6e4c16377c7 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c | |||
| @@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces) | |||
| 25 | } | 25 | } |
| 26 | EXPORT_SYMBOL_GPL(print_stack_trace); | 26 | EXPORT_SYMBOL_GPL(print_stack_trace); |
| 27 | 27 | ||
| 28 | int snprint_stack_trace(char *buf, size_t size, | ||
| 29 | struct stack_trace *trace, int spaces) | ||
| 30 | { | ||
| 31 | int i; | ||
| 32 | unsigned long ip; | ||
| 33 | int generated; | ||
| 34 | int total = 0; | ||
| 35 | |||
| 36 | if (WARN_ON(!trace->entries)) | ||
| 37 | return 0; | ||
| 38 | |||
| 39 | for (i = 0; i < trace->nr_entries; i++) { | ||
| 40 | ip = trace->entries[i]; | ||
| 41 | generated = snprintf(buf, size, "%*c[<%p>] %pS\n", | ||
| 42 | 1 + spaces, ' ', (void *) ip, (void *) ip); | ||
| 43 | |||
| 44 | total += generated; | ||
| 45 | |||
| 46 | /* Assume that generated isn't a negative number */ | ||
| 47 | if (generated >= size) { | ||
| 48 | buf += size; | ||
| 49 | size = 0; | ||
| 50 | } else { | ||
| 51 | buf += generated; | ||
| 52 | size -= generated; | ||
| 53 | } | ||
| 54 | } | ||
| 55 | |||
| 56 | return total; | ||
| 57 | } | ||
| 58 | EXPORT_SYMBOL_GPL(snprint_stack_trace); | ||
| 59 | |||
| 28 | /* | 60 | /* |
| 29 | * Architectures that do not implement save_stack_trace_tsk or | 61 | * Architectures that do not implement save_stack_trace_tsk or |
| 30 | * save_stack_trace_regs get this weak alias and a once-per-bootup warning | 62 | * save_stack_trace_regs get this weak alias and a once-per-bootup warning |
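snprint_stack_trace() above is the buffer-writing counterpart of print_stack_trace(), with snprintf()-style truncation semantics: the return value can exceed the buffer size when output is cut short. A kernel-context usage sketch follows; the entry count and skip depth are chosen arbitrarily for the example.

    #include <linux/stacktrace.h>
    #include <linux/kernel.h>

    static void example_stack_to_buf(char *buf, size_t size)
    {
        unsigned long entries[16];
        struct stack_trace trace = {
            .entries     = entries,
            .max_entries = ARRAY_SIZE(entries),
            .skip        = 0,      /* do not drop any leading frames */
        };

        save_stack_trace(&trace);
        /* A little indentation on each line, like print_stack_trace(). */
        snprint_stack_trace(buf, size, &trace, 1);
    }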
diff --git a/kernel/sys.c b/kernel/sys.c index 1eaa2f0b0246..ea9c88109894 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -91,6 +91,12 @@ | |||
| 91 | #ifndef SET_TSC_CTL | 91 | #ifndef SET_TSC_CTL |
| 92 | # define SET_TSC_CTL(a) (-EINVAL) | 92 | # define SET_TSC_CTL(a) (-EINVAL) |
| 93 | #endif | 93 | #endif |
| 94 | #ifndef MPX_ENABLE_MANAGEMENT | ||
| 95 | # define MPX_ENABLE_MANAGEMENT(a) (-EINVAL) | ||
| 96 | #endif | ||
| 97 | #ifndef MPX_DISABLE_MANAGEMENT | ||
| 98 | # define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) | ||
| 99 | #endif | ||
| 94 | 100 | ||
| 95 | /* | 101 | /* |
| 96 | * this is where the system-wide overflow UID and GID are defined, for | 102 | * this is where the system-wide overflow UID and GID are defined, for |
| @@ -2203,6 +2209,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
| 2203 | me->mm->def_flags &= ~VM_NOHUGEPAGE; | 2209 | me->mm->def_flags &= ~VM_NOHUGEPAGE; |
| 2204 | up_write(&me->mm->mmap_sem); | 2210 | up_write(&me->mm->mmap_sem); |
| 2205 | break; | 2211 | break; |
| 2212 | case PR_MPX_ENABLE_MANAGEMENT: | ||
| 2213 | if (arg2 || arg3 || arg4 || arg5) | ||
| 2214 | return -EINVAL; | ||
| 2215 | error = MPX_ENABLE_MANAGEMENT(me); | ||
| 2216 | break; | ||
| 2217 | case PR_MPX_DISABLE_MANAGEMENT: | ||
| 2218 | if (arg2 || arg3 || arg4 || arg5) | ||
| 2219 | return -EINVAL; | ||
| 2220 | error = MPX_DISABLE_MANAGEMENT(me); | ||
| 2221 | break; | ||
| 2206 | default: | 2222 | default: |
| 2207 | error = -EINVAL; | 2223 | error = -EINVAL; |
| 2208 | break; | 2224 | break; |
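The two new prctl() options added above let a process ask the kernel to manage its MPX bounds tables; all of the extra arguments must be zero. A userspace sketch is below; the numeric fallback values are an assumption taken from the uapi headers of this era and should be checked against your own headers, and on kernels without MPX support the calls simply return -EINVAL.

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_MPX_ENABLE_MANAGEMENT
    #define PR_MPX_ENABLE_MANAGEMENT  43  /* assumed value, verify locally */
    #define PR_MPX_DISABLE_MANAGEMENT 44  /* assumed value, verify locally */
    #endif

    int main(void)
    {
        /* arg2..arg5 must be zero or the kernel rejects the request. */
        if (prctl(PR_MPX_ENABLE_MANAGEMENT, 0, 0, 0, 0))
            perror("PR_MPX_ENABLE_MANAGEMENT");

        /* ... run MPX-instrumented code here ... */

        if (prctl(PR_MPX_DISABLE_MANAGEMENT, 0, 0, 0, 0))
            perror("PR_MPX_DISABLE_MANAGEMENT");
        return 0;
    }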
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 02aa4185b17e..5adcb0ae3a58 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -169,6 +169,8 @@ cond_syscall(ppc_rtas); | |||
| 169 | cond_syscall(sys_spu_run); | 169 | cond_syscall(sys_spu_run); |
| 170 | cond_syscall(sys_spu_create); | 170 | cond_syscall(sys_spu_create); |
| 171 | cond_syscall(sys_subpage_prot); | 171 | cond_syscall(sys_subpage_prot); |
| 172 | cond_syscall(sys_s390_pci_mmio_read); | ||
| 173 | cond_syscall(sys_s390_pci_mmio_write); | ||
| 172 | 174 | ||
| 173 | /* mmu depending weak syscall entries */ | 175 | /* mmu depending weak syscall entries */ |
| 174 | cond_syscall(sys_mprotect); | 176 | cond_syscall(sys_mprotect); |
| @@ -224,3 +226,6 @@ cond_syscall(sys_seccomp); | |||
| 224 | 226 | ||
| 225 | /* access BPF programs and maps */ | 227 | /* access BPF programs and maps */ |
| 226 | cond_syscall(sys_bpf); | 228 | cond_syscall(sys_bpf); |
| 229 | |||
| 230 | /* execveat */ | ||
| 231 | cond_syscall(sys_execveat); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 15f2511a1b7c..137c7f69b264 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -623,6 +623,13 @@ static struct ctl_table kern_table[] = { | |||
| 623 | .mode = 0644, | 623 | .mode = 0644, |
| 624 | .proc_handler = proc_dointvec, | 624 | .proc_handler = proc_dointvec, |
| 625 | }, | 625 | }, |
| 626 | { | ||
| 627 | .procname = "tracepoint_printk", | ||
| 628 | .data = &tracepoint_printk, | ||
| 629 | .maxlen = sizeof(tracepoint_printk), | ||
| 630 | .mode = 0644, | ||
| 631 | .proc_handler = proc_dointvec, | ||
| 632 | }, | ||
| 626 | #endif | 633 | #endif |
| 627 | #ifdef CONFIG_KEXEC | 634 | #ifdef CONFIG_KEXEC |
| 628 | { | 635 | { |
| @@ -1104,6 +1111,15 @@ static struct ctl_table kern_table[] = { | |||
| 1104 | .proc_handler = proc_dointvec, | 1111 | .proc_handler = proc_dointvec, |
| 1105 | }, | 1112 | }, |
| 1106 | #endif | 1113 | #endif |
| 1114 | { | ||
| 1115 | .procname = "panic_on_warn", | ||
| 1116 | .data = &panic_on_warn, | ||
| 1117 | .maxlen = sizeof(int), | ||
| 1118 | .mode = 0644, | ||
| 1119 | .proc_handler = proc_dointvec_minmax, | ||
| 1120 | .extra1 = &zero, | ||
| 1121 | .extra2 = &one, | ||
| 1122 | }, | ||
| 1107 | { } | 1123 | { } |
| 1108 | }; | 1124 | }; |
| 1109 | 1125 | ||
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 9a4f750a2963..7e7746a42a62 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
| @@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = { | |||
| 137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, | 137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, |
| 138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, | 138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, |
| 139 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, | 139 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, |
| 140 | { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, | ||
| 140 | {} | 141 | {} |
| 141 | }; | 142 | }; |
| 142 | 143 | ||
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b312fcc73024..670fff88a961 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
| 459 | stats = nla_data(na); | 459 | stats = nla_data(na); |
| 460 | memset(stats, 0, sizeof(*stats)); | 460 | memset(stats, 0, sizeof(*stats)); |
| 461 | 461 | ||
| 462 | rc = cgroupstats_build(stats, f.file->f_dentry); | 462 | rc = cgroupstats_build(stats, f.file->f_path.dentry); |
| 463 | if (rc < 0) { | 463 | if (rc < 0) { |
| 464 | nlmsg_free(rep_skb); | 464 | nlmsg_free(rep_skb); |
| 465 | goto err; | 465 | goto err; |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 7347426fa68d..f622cf28628a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
| @@ -13,7 +13,7 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | |||
| 13 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o | 13 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o |
| 14 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | 14 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o |
| 15 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | 15 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o |
| 16 | obj-$(CONFIG_TEST_UDELAY) += udelay_test.o | 16 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o |
| 17 | 17 | ||
| 18 | $(obj)/time.o: $(obj)/timeconst.h | 18 | $(obj)/time.o: $(obj)/timeconst.h |
| 19 | 19 | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 2e949cc9c9f1..b79f39bda7e1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
| 792 | /* Initialize mult/shift and max_idle_ns */ | 792 | /* Initialize mult/shift and max_idle_ns */ |
| 793 | __clocksource_updatefreq_scale(cs, scale, freq); | 793 | __clocksource_updatefreq_scale(cs, scale, freq); |
| 794 | 794 | ||
| 795 | /* Add clocksource to the clcoksource list */ | 795 | /* Add clocksource to the clocksource list */ |
| 796 | mutex_lock(&clocksource_mutex); | 796 | mutex_lock(&clocksource_mutex); |
| 797 | clocksource_enqueue(cs); | 797 | clocksource_enqueue(cs); |
| 798 | clocksource_enqueue_watchdog(cs); | 798 | clocksource_enqueue_watchdog(cs); |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87a346fd6d61..28bf91c60a0b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -633,6 +633,13 @@ int ntp_validate_timex(struct timex *txc) | |||
| 633 | if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) | 633 | if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) |
| 634 | return -EPERM; | 634 | return -EPERM; |
| 635 | 635 | ||
| 636 | if (txc->modes & ADJ_FREQUENCY) { | ||
| 637 | if (LONG_MIN / PPM_SCALE > txc->freq) | ||
| 638 | return -EINVAL; | ||
| 639 | if (LONG_MAX / PPM_SCALE < txc->freq) | ||
| 640 | return -EINVAL; | ||
| 641 | } | ||
| 642 | |||
| 636 | return 0; | 643 | return 0; |
| 637 | } | 644 | } |
| 638 | 645 | ||
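The ADJ_FREQUENCY check added above rejects txc->freq values that would overflow when later multiplied by PPM_SCALE, by dividing the limits instead of multiplying the input. The same overflow-safe range-check pattern in standalone C, using a stand-in scale factor rather than the kernel's actual PPM_SCALE:

    #include <limits.h>
    #include <stdio.h>

    #define SCALE (1000L << 16)  /* stand-in scale, not the kernel's PPM_SCALE */

    static int freq_in_range(long freq)
    {
        /* Divide the limits rather than multiply freq: multiplying first
         * is exactly the overflow this check exists to prevent. */
        if (LONG_MIN / SCALE > freq)
            return 0;
        if (LONG_MAX / SCALE < freq)
            return 0;
        return 1;
    }

    int main(void)
    {
        printf("largest accepted value: %ld\n", LONG_MAX / SCALE);
        printf("LONG_MAX accepted? %d\n", freq_in_range(LONG_MAX));
        return 0;
    }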
diff --git a/kernel/time/udelay_test.c b/kernel/time/test_udelay.c index e622ba365a13..e622ba365a13 100644 --- a/kernel/time/udelay_test.c +++ b/kernel/time/test_udelay.c | |||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7b5741fc4110..1363d58f07e9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -235,7 +235,7 @@ void tick_nohz_full_kick(void) | |||
| 235 | if (!tick_nohz_full_cpu(smp_processor_id())) | 235 | if (!tick_nohz_full_cpu(smp_processor_id())) |
| 236 | return; | 236 | return; |
| 237 | 237 | ||
| 238 | irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); | 238 | irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); |
| 239 | } | 239 | } |
| 240 | 240 | ||
| 241 | /* | 241 | /* |
| @@ -585,7 +585,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 585 | last_jiffies = jiffies; | 585 | last_jiffies = jiffies; |
| 586 | } while (read_seqretry(&jiffies_lock, seq)); | 586 | } while (read_seqretry(&jiffies_lock, seq)); |
| 587 | 587 | ||
| 588 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || | 588 | if (rcu_needs_cpu(&rcu_delta_jiffies) || |
| 589 | arch_needs_cpu() || irq_work_needs_cpu()) { | 589 | arch_needs_cpu() || irq_work_needs_cpu()) { |
| 590 | next_jiffies = last_jiffies + 1; | 590 | next_jiffies = last_jiffies + 1; |
| 591 | delta_jiffies = 1; | 591 | delta_jiffies = 1; |
| @@ -847,7 +847,6 @@ void tick_nohz_idle_enter(void) | |||
| 847 | 847 | ||
| 848 | local_irq_enable(); | 848 | local_irq_enable(); |
| 849 | } | 849 | } |
| 850 | EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); | ||
| 851 | 850 | ||
| 852 | /** | 851 | /** |
| 853 | * tick_nohz_irq_exit - update next tick event from interrupt exit | 852 | * tick_nohz_irq_exit - update next tick event from interrupt exit |
| @@ -974,7 +973,6 @@ void tick_nohz_idle_exit(void) | |||
| 974 | 973 | ||
| 975 | local_irq_enable(); | 974 | local_irq_enable(); |
| 976 | } | 975 | } |
| 977 | EXPORT_SYMBOL_GPL(tick_nohz_idle_exit); | ||
| 978 | 976 | ||
| 979 | static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) | 977 | static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) |
| 980 | { | 978 | { |
diff --git a/kernel/time/time.c b/kernel/time/time.c index a9ae20fb0b11..2c85b7724af4 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
| @@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, | |||
| 196 | if (tv) { | 196 | if (tv) { |
| 197 | if (copy_from_user(&user_tv, tv, sizeof(*tv))) | 197 | if (copy_from_user(&user_tv, tv, sizeof(*tv))) |
| 198 | return -EFAULT; | 198 | return -EFAULT; |
| 199 | |||
| 200 | if (!timeval_valid(&user_tv)) | ||
| 201 | return -EINVAL; | ||
| 202 | |||
| 199 | new_ts.tv_sec = user_tv.tv_sec; | 203 | new_ts.tv_sec = user_tv.tv_sec; |
| 200 | new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; | 204 | new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; |
| 201 | } | 205 | } |
| @@ -304,7 +308,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) | |||
| 304 | } | 308 | } |
| 305 | EXPORT_SYMBOL(timespec_trunc); | 309 | EXPORT_SYMBOL(timespec_trunc); |
| 306 | 310 | ||
| 307 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. | 311 | /* |
| 312 | * mktime64 - Converts date to seconds. | ||
| 313 | * Converts Gregorian date to seconds since 1970-01-01 00:00:00. | ||
| 308 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | 314 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
| 309 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. | 315 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. |
| 310 | * | 316 | * |
| @@ -314,15 +320,10 @@ EXPORT_SYMBOL(timespec_trunc); | |||
| 314 | * -year/100+year/400 terms, and add 10.] | 320 | * -year/100+year/400 terms, and add 10.] |
| 315 | * | 321 | * |
| 316 | * This algorithm was first published by Gauss (I think). | 322 | * This algorithm was first published by Gauss (I think). |
| 317 | * | ||
| 318 | * WARNING: this function will overflow on 2106-02-07 06:28:16 on | ||
| 319 | * machines where long is 32-bit! (However, as time_t is signed, we | ||
| 320 | * will already get problems at other places on 2038-01-19 03:14:08) | ||
| 321 | */ | 323 | */ |
| 322 | unsigned long | 324 | time64_t mktime64(const unsigned int year0, const unsigned int mon0, |
| 323 | mktime(const unsigned int year0, const unsigned int mon0, | 325 | const unsigned int day, const unsigned int hour, |
| 324 | const unsigned int day, const unsigned int hour, | 326 | const unsigned int min, const unsigned int sec) |
| 325 | const unsigned int min, const unsigned int sec) | ||
| 326 | { | 327 | { |
| 327 | unsigned int mon = mon0, year = year0; | 328 | unsigned int mon = mon0, year = year0; |
| 328 | 329 | ||
| @@ -332,15 +333,14 @@ mktime(const unsigned int year0, const unsigned int mon0, | |||
| 332 | year -= 1; | 333 | year -= 1; |
| 333 | } | 334 | } |
| 334 | 335 | ||
| 335 | return ((((unsigned long) | 336 | return ((((time64_t) |
| 336 | (year/4 - year/100 + year/400 + 367*mon/12 + day) + | 337 | (year/4 - year/100 + year/400 + 367*mon/12 + day) + |
| 337 | year*365 - 719499 | 338 | year*365 - 719499 |
| 338 | )*24 + hour /* now have hours */ | 339 | )*24 + hour /* now have hours */ |
| 339 | )*60 + min /* now have minutes */ | 340 | )*60 + min /* now have minutes */ |
| 340 | )*60 + sec; /* finally seconds */ | 341 | )*60 + sec; /* finally seconds */ |
| 341 | } | 342 | } |
| 342 | 343 | EXPORT_SYMBOL(mktime64); | |
| 343 | EXPORT_SYMBOL(mktime); | ||
| 344 | 344 | ||
| 345 | /** | 345 | /** |
| 346 | * set_normalized_timespec - set timespec sec and nsec parts and normalize | 346 | * set_normalized_timespec - set timespec sec and nsec parts and normalize |
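The mktime64() change above keeps the Gauss-style date formula but returns time64_t, so the result no longer wraps in 2106 the way the old unsigned long mktime() did on 32-bit machines. A standalone copy of the formula, evaluated for the date one second past the signed 32-bit limit:

    #include <stdio.h>
    #include <stdint.h>

    /* Userspace copy of the Gauss-style formula, with the result held in
     * a 64-bit type so it survives both 2038 and 2106. */
    static int64_t my_mktime64(unsigned int year, unsigned int mon,
                               unsigned int day, unsigned int hour,
                               unsigned int min, unsigned int sec)
    {
        /* Shift January and February to the end of the previous year so
         * the leap day falls at the end of the counted "year". */
        if ((int)(mon -= 2) <= 0) {
            mon += 12;
            year -= 1;
        }
        return ((((int64_t)(year / 4 - year / 100 + year / 400 +
                            367 * mon / 12 + day) +
                  year * 365 - 719499) * 24 + hour) * 60 + min) * 60 + sec;
    }

    int main(void)
    {
        /* 2038-01-19 03:14:08 is one second past the signed 32-bit limit. */
        printf("%lld\n", (long long)my_mktime64(2038, 1, 19, 3, 14, 8));
        return 0;   /* prints 2147483648 */
    }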
| @@ -745,6 +745,7 @@ u64 nsecs_to_jiffies64(u64 n) | |||
| 745 | return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); | 745 | return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); |
| 746 | #endif | 746 | #endif |
| 747 | } | 747 | } |
| 748 | EXPORT_SYMBOL(nsecs_to_jiffies64); | ||
| 748 | 749 | ||
| 749 | /** | 750 | /** |
| 750 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies | 751 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ec1791fae965..6a931852082f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -417,7 +417,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); | |||
| 417 | */ | 417 | */ |
| 418 | static inline void tk_update_ktime_data(struct timekeeper *tk) | 418 | static inline void tk_update_ktime_data(struct timekeeper *tk) |
| 419 | { | 419 | { |
| 420 | s64 nsec; | 420 | u64 seconds; |
| 421 | u32 nsec; | ||
| 421 | 422 | ||
| 422 | /* | 423 | /* |
| 423 | * The xtime based monotonic readout is: | 424 | * The xtime based monotonic readout is: |
| @@ -426,13 +427,22 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) | |||
| 426 | * nsec = base_mono + now(); | 427 | * nsec = base_mono + now(); |
| 427 | * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec | 428 | * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec |
| 428 | */ | 429 | */ |
| 429 | nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); | 430 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); |
| 430 | nsec *= NSEC_PER_SEC; | 431 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; |
| 431 | nsec += tk->wall_to_monotonic.tv_nsec; | 432 | tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); |
| 432 | tk->tkr.base_mono = ns_to_ktime(nsec); | ||
| 433 | 433 | ||
| 434 | /* Update the monotonic raw base */ | 434 | /* Update the monotonic raw base */ |
| 435 | tk->base_raw = timespec64_to_ktime(tk->raw_time); | 435 | tk->base_raw = timespec64_to_ktime(tk->raw_time); |
| 436 | |||
| 437 | /* | ||
| 438 | * The sum of the nanoseconds portions of xtime and | ||
| 439 | * wall_to_monotonic can be greater/equal one second. Take | ||
| 440 | * this into account before updating tk->ktime_sec. | ||
| 441 | */ | ||
| 442 | nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); | ||
| 443 | if (nsec >= NSEC_PER_SEC) | ||
| 444 | seconds++; | ||
| 445 | tk->ktime_sec = seconds; | ||
| 436 | } | 446 | } |
| 437 | 447 | ||
| 438 | /* must hold timekeeper_lock */ | 448 | /* must hold timekeeper_lock */ |
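The new tk->ktime_sec cache above has to account for the nanosecond parts of xtime and wall_to_monotonic summing to a full second. A toy standalone illustration of that carry, with made-up timekeeper values:

    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
        /* Made-up state: 0.9s of xtime nanoseconds plus 0.3s of
         * wall_to_monotonic nanoseconds is more than one second. */
        long long xtime_sec = 1000, wtm_sec = -400;
        unsigned long long xtime_nsec = 900000000, wtm_nsec = 300000000;

        unsigned long long seconds = (unsigned long long)(xtime_sec + wtm_sec);
        unsigned long long nsec = wtm_nsec + xtime_nsec;

        if (nsec >= NSEC_PER_SEC)
            seconds++;               /* the carry added by this hunk */

        printf("ktime_sec = %llu\n", seconds);  /* 601, not 600 */
        return 0;
    }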
| @@ -519,9 +529,9 @@ EXPORT_SYMBOL(__getnstimeofday64); | |||
| 519 | 529 | ||
| 520 | /** | 530 | /** |
| 521 | * getnstimeofday64 - Returns the time of day in a timespec64. | 531 | * getnstimeofday64 - Returns the time of day in a timespec64. |
| 522 | * @ts: pointer to the timespec to be set | 532 | * @ts: pointer to the timespec64 to be set |
| 523 | * | 533 | * |
| 524 | * Returns the time of day in a timespec (WARN if suspended). | 534 | * Returns the time of day in a timespec64 (WARN if suspended). |
| 525 | */ | 535 | */ |
| 526 | void getnstimeofday64(struct timespec64 *ts) | 536 | void getnstimeofday64(struct timespec64 *ts) |
| 527 | { | 537 | { |
| @@ -623,7 +633,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw); | |||
| 623 | * | 633 | * |
| 624 | * The function calculates the monotonic clock from the realtime | 634 | * The function calculates the monotonic clock from the realtime |
| 625 | * clock and the wall_to_monotonic offset and stores the result | 635 | * clock and the wall_to_monotonic offset and stores the result |
| 626 | * in normalized timespec format in the variable pointed to by @ts. | 636 | * in normalized timespec64 format in the variable pointed to by @ts. |
| 627 | */ | 637 | */ |
| 628 | void ktime_get_ts64(struct timespec64 *ts) | 638 | void ktime_get_ts64(struct timespec64 *ts) |
| 629 | { | 639 | { |
| @@ -648,6 +658,54 @@ void ktime_get_ts64(struct timespec64 *ts) | |||
| 648 | } | 658 | } |
| 649 | EXPORT_SYMBOL_GPL(ktime_get_ts64); | 659 | EXPORT_SYMBOL_GPL(ktime_get_ts64); |
| 650 | 660 | ||
| 661 | /** | ||
| 662 | * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC | ||
| 663 | * | ||
| 664 | * Returns the seconds portion of CLOCK_MONOTONIC with a single non | ||
| 665 | * serialized read. tk->ktime_sec is of type 'unsigned long' so this | ||
| 666 | * works on both 32 and 64 bit systems. On 32 bit systems the readout | ||
| 667 | * covers ~136 years of uptime which should be enough to prevent | ||
| 668 | * premature wrap arounds. | ||
| 669 | */ | ||
| 670 | time64_t ktime_get_seconds(void) | ||
| 671 | { | ||
| 672 | struct timekeeper *tk = &tk_core.timekeeper; | ||
| 673 | |||
| 674 | WARN_ON(timekeeping_suspended); | ||
| 675 | return tk->ktime_sec; | ||
| 676 | } | ||
| 677 | EXPORT_SYMBOL_GPL(ktime_get_seconds); | ||
| 678 | |||
| 679 | /** | ||
| 680 | * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME | ||
| 681 | * | ||
| 682 | * Returns the wall clock seconds since 1970. This replaces the | ||
| 683 | * get_seconds() interface which is not y2038 safe on 32bit systems. | ||
| 684 | * | ||
| 685 | * For 64bit systems the fast access to tk->xtime_sec is preserved. On | ||
| 686 | * 32bit systems the access must be protected with the sequence | ||
| 687 | * counter to provide "atomic" access to the 64bit tk->xtime_sec | ||
| 688 | * value. | ||
| 689 | */ | ||
| 690 | time64_t ktime_get_real_seconds(void) | ||
| 691 | { | ||
| 692 | struct timekeeper *tk = &tk_core.timekeeper; | ||
| 693 | time64_t seconds; | ||
| 694 | unsigned int seq; | ||
| 695 | |||
| 696 | if (IS_ENABLED(CONFIG_64BIT)) | ||
| 697 | return tk->xtime_sec; | ||
| 698 | |||
| 699 | do { | ||
| 700 | seq = read_seqcount_begin(&tk_core.seq); | ||
| 701 | seconds = tk->xtime_sec; | ||
| 702 | |||
| 703 | } while (read_seqcount_retry(&tk_core.seq, seq)); | ||
| 704 | |||
| 705 | return seconds; | ||
| 706 | } | ||
| 707 | EXPORT_SYMBOL_GPL(ktime_get_real_seconds); | ||
| 708 | |||
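For callers the conversion away from get_seconds() is mechanical: store a time64_t and read it through the new accessor. A sketch with a hypothetical record structure:

	struct my_record {
		time64_t	timestamp;	/* was unsigned long via get_seconds() */
	};

	static void my_stamp(struct my_record *rec)
	{
		rec->timestamp = ktime_get_real_seconds();
	}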
| 651 | #ifdef CONFIG_NTP_PPS | 709 | #ifdef CONFIG_NTP_PPS |
| 652 | 710 | ||
| 653 | /** | 711 | /** |
| @@ -703,18 +761,18 @@ void do_gettimeofday(struct timeval *tv) | |||
| 703 | EXPORT_SYMBOL(do_gettimeofday); | 761 | EXPORT_SYMBOL(do_gettimeofday); |
| 704 | 762 | ||
| 705 | /** | 763 | /** |
| 706 | * do_settimeofday - Sets the time of day | 764 | * do_settimeofday64 - Sets the time of day. |
| 707 | * @tv: pointer to the timespec variable containing the new time | 765 | * @ts: pointer to the timespec64 variable containing the new time |
| 708 | * | 766 | * |
| 709 | * Sets the time of day to the new time and update NTP and notify hrtimers | 767 | * Sets the time of day to the new time and update NTP and notify hrtimers |
| 710 | */ | 768 | */ |
| 711 | int do_settimeofday(const struct timespec *tv) | 769 | int do_settimeofday64(const struct timespec64 *ts) |
| 712 | { | 770 | { |
| 713 | struct timekeeper *tk = &tk_core.timekeeper; | 771 | struct timekeeper *tk = &tk_core.timekeeper; |
| 714 | struct timespec64 ts_delta, xt, tmp; | 772 | struct timespec64 ts_delta, xt; |
| 715 | unsigned long flags; | 773 | unsigned long flags; |
| 716 | 774 | ||
| 717 | if (!timespec_valid_strict(tv)) | 775 | if (!timespec64_valid_strict(ts)) |
| 718 | return -EINVAL; | 776 | return -EINVAL; |
| 719 | 777 | ||
| 720 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 778 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
| @@ -723,13 +781,12 @@ int do_settimeofday(const struct timespec *tv) | |||
| 723 | timekeeping_forward_now(tk); | 781 | timekeeping_forward_now(tk); |
| 724 | 782 | ||
| 725 | xt = tk_xtime(tk); | 783 | xt = tk_xtime(tk); |
| 726 | ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; | 784 | ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; |
| 727 | ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; | 785 | ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; |
| 728 | 786 | ||
| 729 | tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); | 787 | tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); |
| 730 | 788 | ||
| 731 | tmp = timespec_to_timespec64(*tv); | 789 | tk_set_xtime(tk, ts); |
| 732 | tk_set_xtime(tk, &tmp); | ||
| 733 | 790 | ||
| 734 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); | 791 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
| 735 | 792 | ||
| @@ -741,7 +798,7 @@ int do_settimeofday(const struct timespec *tv) | |||
| 741 | 798 | ||
| 742 | return 0; | 799 | return 0; |
| 743 | } | 800 | } |
| 744 | EXPORT_SYMBOL(do_settimeofday); | 801 | EXPORT_SYMBOL(do_settimeofday64); |
| 745 | 802 | ||
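Callers now hand a struct timespec64 straight to the setter instead of building a struct timespec first. An illustrative (made-up) example:

	struct timespec64 ts = {
		.tv_sec	 = 1420070400,	/* 2015-01-01 00:00:00 UTC */
		.tv_nsec = 0,
	};
	int err = do_settimeofday64(&ts);

	if (err)
		pr_warn("failed to set time of day: %d\n", err);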
| 746 | /** | 803 | /** |
| 747 | * timekeeping_inject_offset - Adds or subtracts from the current time. | 804 | * timekeeping_inject_offset - Adds or subtracts from the current time. |
| @@ -895,12 +952,12 @@ int timekeeping_notify(struct clocksource *clock) | |||
| 895 | } | 952 | } |
| 896 | 953 | ||
| 897 | /** | 954 | /** |
| 898 | * getrawmonotonic - Returns the raw monotonic time in a timespec | 955 | * getrawmonotonic64 - Returns the raw monotonic time in a timespec |
| 899 | * @ts: pointer to the timespec to be set | 956 | * @ts: pointer to the timespec64 to be set |
| 900 | * | 957 | * |
| 901 | * Returns the raw monotonic time (completely un-modified by ntp) | 958 | * Returns the raw monotonic time (completely un-modified by ntp) |
| 902 | */ | 959 | */ |
| 903 | void getrawmonotonic(struct timespec *ts) | 960 | void getrawmonotonic64(struct timespec64 *ts) |
| 904 | { | 961 | { |
| 905 | struct timekeeper *tk = &tk_core.timekeeper; | 962 | struct timekeeper *tk = &tk_core.timekeeper; |
| 906 | struct timespec64 ts64; | 963 | struct timespec64 ts64; |
| @@ -915,9 +972,10 @@ void getrawmonotonic(struct timespec *ts) | |||
| 915 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 972 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
| 916 | 973 | ||
| 917 | timespec64_add_ns(&ts64, nsecs); | 974 | timespec64_add_ns(&ts64, nsecs); |
| 918 | *ts = timespec64_to_timespec(ts64); | 975 | *ts = ts64; |
| 919 | } | 976 | } |
| 920 | EXPORT_SYMBOL(getrawmonotonic); | 977 | EXPORT_SYMBOL(getrawmonotonic64); |
| 978 | |||
| 921 | 979 | ||
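The raw clock is meant for interval measurements that must not be steered by NTP. A small usage sketch, where do_work() stands in for an arbitrary workload:

	struct timespec64 start, end, delta;

	getrawmonotonic64(&start);
	do_work();
	getrawmonotonic64(&end);
	delta = timespec64_sub(end, start);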
| 922 | /** | 980 | /** |
| 923 | * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres | 981 | * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres |
| @@ -1068,8 +1126,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
| 1068 | } | 1126 | } |
| 1069 | 1127 | ||
| 1070 | /** | 1128 | /** |
| 1071 | * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values | 1129 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values |

| 1072 | * @delta: pointer to a timespec delta value | 1130 | * @delta: pointer to a timespec64 delta value |
| 1073 | * | 1131 | * |
| 1074 | * This hook is for architectures that cannot support read_persistent_clock | 1132 | * This hook is for architectures that cannot support read_persistent_clock |
| 1075 | * because their RTC/persistent clock is only accessible when irqs are enabled. | 1133 | * because their RTC/persistent clock is only accessible when irqs are enabled. |
| @@ -1077,10 +1135,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
| 1077 | * This function should only be called by rtc_resume(), and allows | 1135 | * This function should only be called by rtc_resume(), and allows |
| 1078 | * a suspend offset to be injected into the timekeeping values. | 1136 | * a suspend offset to be injected into the timekeeping values. |
| 1079 | */ | 1137 | */ |
| 1080 | void timekeeping_inject_sleeptime(struct timespec *delta) | 1138 | void timekeeping_inject_sleeptime64(struct timespec64 *delta) |
| 1081 | { | 1139 | { |
| 1082 | struct timekeeper *tk = &tk_core.timekeeper; | 1140 | struct timekeeper *tk = &tk_core.timekeeper; |
| 1083 | struct timespec64 tmp; | ||
| 1084 | unsigned long flags; | 1141 | unsigned long flags; |
| 1085 | 1142 | ||
| 1086 | /* | 1143 | /* |
| @@ -1095,8 +1152,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
| 1095 | 1152 | ||
| 1096 | timekeeping_forward_now(tk); | 1153 | timekeeping_forward_now(tk); |
| 1097 | 1154 | ||
| 1098 | tmp = timespec_to_timespec64(*delta); | 1155 | __timekeeping_inject_sleeptime(tk, delta); |
| 1099 | __timekeeping_inject_sleeptime(tk, &tmp); | ||
| 1100 | 1156 | ||
| 1101 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); | 1157 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
| 1102 | 1158 | ||
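As the comment above notes, rtc_resume() is the intended caller: it reads the RTC before suspend and after resume and injects the difference. A rough sketch of that pattern, where old_rtc and new_rtc are hypothetical timespec64 values read from the RTC:

	struct timespec64 sleep_time = timespec64_sub(new_rtc, old_rtc);

	/* only inject a positive interval */
	if (sleep_time.tv_sec > 0 ||
	    (sleep_time.tv_sec == 0 && sleep_time.tv_nsec > 0))
		timekeeping_inject_sleeptime64(&sleep_time);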
| @@ -1332,6 +1388,12 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, | |||
| 1332 | * | 1388 | * |
| 1333 | * XXX - TODO: Doc ntp_error calculation. | 1389 | * XXX - TODO: Doc ntp_error calculation. |
| 1334 | */ | 1390 | */ |
| 1391 | if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { | ||
| 1392 | /* NTP adjustment caused clocksource mult overflow */ | ||
| 1393 | WARN_ON_ONCE(1); | ||
| 1394 | return; | ||
| 1395 | } | ||
| 1396 | |||
| 1335 | tk->tkr.mult += mult_adj; | 1397 | tk->tkr.mult += mult_adj; |
| 1336 | tk->xtime_interval += interval; | 1398 | tk->xtime_interval += interval; |
| 1337 | tk->tkr.xtime_nsec -= offset; | 1399 | tk->tkr.xtime_nsec -= offset; |
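The new guard is the standard unsigned-wraparound test: for unsigned values and a positive adjustment, a + b compares less than b exactly when the addition wrapped. A standalone illustration with arbitrary values:

	u32 mult = 0xfffffff0;
	u32 adj  = 0x20;

	if (mult + adj < adj)	/* true here: the u32 sum wrapped around */
		pr_warn("mult adjustment would overflow\n");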
| @@ -1397,7 +1459,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
| 1397 | } | 1459 | } |
| 1398 | 1460 | ||
| 1399 | if (unlikely(tk->tkr.clock->maxadj && | 1461 | if (unlikely(tk->tkr.clock->maxadj && |
| 1400 | (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { | 1462 | (abs(tk->tkr.mult - tk->tkr.clock->mult) |
| 1463 | > tk->tkr.clock->maxadj))) { | ||
| 1401 | printk_once(KERN_WARNING | 1464 | printk_once(KERN_WARNING |
| 1402 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 1465 | "Adjusting %s more than 11%% (%ld vs %ld)\n", |
| 1403 | tk->tkr.clock->name, (long)tk->tkr.mult, | 1466 | tk->tkr.clock->name, (long)tk->tkr.mult, |
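The abs() form matters because the old test only caught adjustments that pushed mult above clock->mult + maxadj; an equally large downward drift passed silently. A toy comparison with invented numbers:

	u32 base = 1000, maxadj = 110, cur = 850;	/* 15% below base */

	WARN_ON(cur > base + maxadj);			/* old test: no warning */
	WARN_ON(abs((int)(cur - base)) > maxadj);	/* new test: warns */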
| @@ -1646,7 +1709,7 @@ struct timespec current_kernel_time(void) | |||
| 1646 | } | 1709 | } |
| 1647 | EXPORT_SYMBOL(current_kernel_time); | 1710 | EXPORT_SYMBOL(current_kernel_time); |
| 1648 | 1711 | ||
| 1649 | struct timespec get_monotonic_coarse(void) | 1712 | struct timespec64 get_monotonic_coarse64(void) |
| 1650 | { | 1713 | { |
| 1651 | struct timekeeper *tk = &tk_core.timekeeper; | 1714 | struct timekeeper *tk = &tk_core.timekeeper; |
| 1652 | struct timespec64 now, mono; | 1715 | struct timespec64 now, mono; |
| @@ -1662,7 +1725,7 @@ struct timespec get_monotonic_coarse(void) | |||
| 1662 | set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, | 1725 | set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, |
| 1663 | now.tv_nsec + mono.tv_nsec); | 1726 | now.tv_nsec + mono.tv_nsec); |
| 1664 | 1727 | ||
| 1665 | return timespec64_to_timespec(now); | 1728 | return now; |
| 1666 | } | 1729 | } |
| 1667 | 1730 | ||
| 1668 | /* | 1731 | /* |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3260ffdb368f..2d3f5c504939 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
| @@ -1377,12 +1377,11 @@ unsigned long get_next_timer_interrupt(unsigned long now) | |||
| 1377 | void update_process_times(int user_tick) | 1377 | void update_process_times(int user_tick) |
| 1378 | { | 1378 | { |
| 1379 | struct task_struct *p = current; | 1379 | struct task_struct *p = current; |
| 1380 | int cpu = smp_processor_id(); | ||
| 1381 | 1380 | ||
| 1382 | /* Note: this timer irq context must be accounted for as well. */ | 1381 | /* Note: this timer irq context must be accounted for as well. */ |
| 1383 | account_process_tick(p, user_tick); | 1382 | account_process_tick(p, user_tick); |
| 1384 | run_local_timers(); | 1383 | run_local_timers(); |
| 1385 | rcu_check_callbacks(cpu, user_tick); | 1384 | rcu_check_callbacks(user_tick); |
| 1386 | #ifdef CONFIG_IRQ_WORK | 1385 | #ifdef CONFIG_IRQ_WORK |
| 1387 | if (in_irq()) | 1386 | if (in_irq()) |
| 1388 | irq_work_tick(); | 1387 | irq_work_tick(); |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 67d6369ddf83..979ccde26720 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | |||
| 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o | 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o |
| 56 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 56 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
| 57 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o | 57 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o |
| 58 | ifeq ($(CONFIG_PM_RUNTIME),y) | 58 | ifeq ($(CONFIG_PM),y) |
| 59 | obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o | 59 | obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o |
| 60 | endif | 60 | endif |
| 61 | ifeq ($(CONFIG_TRACING),y) | 61 | ifeq ($(CONFIG_TRACING),y) |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c1bd4ada2a04..483cecfa5c17 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent, | |||
| 1142 | r->sector_from = be64_to_cpu(sector_from); | 1142 | r->sector_from = be64_to_cpu(sector_from); |
| 1143 | } | 1143 | } |
| 1144 | 1144 | ||
| 1145 | typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); | 1145 | typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); |
| 1146 | 1146 | ||
| 1147 | static int blk_log_action_classic(struct trace_iterator *iter, const char *act) | 1147 | static void blk_log_action_classic(struct trace_iterator *iter, const char *act) |
| 1148 | { | 1148 | { |
| 1149 | char rwbs[RWBS_LEN]; | 1149 | char rwbs[RWBS_LEN]; |
| 1150 | unsigned long long ts = iter->ts; | 1150 | unsigned long long ts = iter->ts; |
| @@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act) | |||
| 1154 | 1154 | ||
| 1155 | fill_rwbs(rwbs, t); | 1155 | fill_rwbs(rwbs, t); |
| 1156 | 1156 | ||
| 1157 | return trace_seq_printf(&iter->seq, | 1157 | trace_seq_printf(&iter->seq, |
| 1158 | "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", | 1158 | "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", |
| 1159 | MAJOR(t->device), MINOR(t->device), iter->cpu, | 1159 | MAJOR(t->device), MINOR(t->device), iter->cpu, |
| 1160 | secs, nsec_rem, iter->ent->pid, act, rwbs); | 1160 | secs, nsec_rem, iter->ent->pid, act, rwbs); |
| 1161 | } | 1161 | } |
| 1162 | 1162 | ||
| 1163 | static int blk_log_action(struct trace_iterator *iter, const char *act) | 1163 | static void blk_log_action(struct trace_iterator *iter, const char *act) |
| 1164 | { | 1164 | { |
| 1165 | char rwbs[RWBS_LEN]; | 1165 | char rwbs[RWBS_LEN]; |
| 1166 | const struct blk_io_trace *t = te_blk_io_trace(iter->ent); | 1166 | const struct blk_io_trace *t = te_blk_io_trace(iter->ent); |
| 1167 | 1167 | ||
| 1168 | fill_rwbs(rwbs, t); | 1168 | fill_rwbs(rwbs, t); |
| 1169 | return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", | 1169 | trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", |
| 1170 | MAJOR(t->device), MINOR(t->device), act, rwbs); | 1170 | MAJOR(t->device), MINOR(t->device), act, rwbs); |
| 1171 | } | 1171 | } |
| 1172 | 1172 | ||
| 1173 | static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) | 1173 | static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) |
| 1174 | { | 1174 | { |
| 1175 | const unsigned char *pdu_buf; | 1175 | const unsigned char *pdu_buf; |
| 1176 | int pdu_len; | 1176 | int pdu_len; |
| 1177 | int i, end, ret; | 1177 | int i, end; |
| 1178 | 1178 | ||
| 1179 | pdu_buf = pdu_start(ent); | 1179 | pdu_buf = pdu_start(ent); |
| 1180 | pdu_len = te_blk_io_trace(ent)->pdu_len; | 1180 | pdu_len = te_blk_io_trace(ent)->pdu_len; |
| 1181 | 1181 | ||
| 1182 | if (!pdu_len) | 1182 | if (!pdu_len) |
| 1183 | return 1; | 1183 | return; |
| 1184 | 1184 | ||
| 1185 | /* find the last zero that needs to be printed */ | 1185 | /* find the last zero that needs to be printed */ |
| 1186 | for (end = pdu_len - 1; end >= 0; end--) | 1186 | for (end = pdu_len - 1; end >= 0; end--) |
| @@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) | |||
| 1188 | break; | 1188 | break; |
| 1189 | end++; | 1189 | end++; |
| 1190 | 1190 | ||
| 1191 | if (!trace_seq_putc(s, '(')) | 1191 | trace_seq_putc(s, '('); |
| 1192 | return 0; | ||
| 1193 | 1192 | ||
| 1194 | for (i = 0; i < pdu_len; i++) { | 1193 | for (i = 0; i < pdu_len; i++) { |
| 1195 | 1194 | ||
| 1196 | ret = trace_seq_printf(s, "%s%02x", | 1195 | trace_seq_printf(s, "%s%02x", |
| 1197 | i == 0 ? "" : " ", pdu_buf[i]); | 1196 | i == 0 ? "" : " ", pdu_buf[i]); |
| 1198 | if (!ret) | ||
| 1199 | return ret; | ||
| 1200 | 1197 | ||
| 1201 | /* | 1198 | /* |
| 1202 | * stop when the rest is just zeroes and indicate so | 1199 | * stop when the rest is just zeroes and indicate so |
| 1203 | * with a ".." appended | 1200 | * with a ".." appended |
| 1204 | */ | 1201 | */ |
| 1205 | if (i == end && end != pdu_len - 1) | 1202 | if (i == end && end != pdu_len - 1) { |
| 1206 | return trace_seq_puts(s, " ..) "); | 1203 | trace_seq_puts(s, " ..) "); |
| 1204 | return; | ||
| 1205 | } | ||
| 1207 | } | 1206 | } |
| 1208 | 1207 | ||
| 1209 | return trace_seq_puts(s, ") "); | 1208 | trace_seq_puts(s, ") "); |
| 1210 | } | 1209 | } |
| 1211 | 1210 | ||
| 1212 | static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) | 1211 | static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) |
| 1213 | { | 1212 | { |
| 1214 | char cmd[TASK_COMM_LEN]; | 1213 | char cmd[TASK_COMM_LEN]; |
| 1215 | 1214 | ||
| 1216 | trace_find_cmdline(ent->pid, cmd); | 1215 | trace_find_cmdline(ent->pid, cmd); |
| 1217 | 1216 | ||
| 1218 | if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { | 1217 | if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { |
| 1219 | int ret; | 1218 | trace_seq_printf(s, "%u ", t_bytes(ent)); |
| 1220 | 1219 | blk_log_dump_pdu(s, ent); | |
| 1221 | ret = trace_seq_printf(s, "%u ", t_bytes(ent)); | 1220 | trace_seq_printf(s, "[%s]\n", cmd); |
| 1222 | if (!ret) | ||
| 1223 | return 0; | ||
| 1224 | ret = blk_log_dump_pdu(s, ent); | ||
| 1225 | if (!ret) | ||
| 1226 | return 0; | ||
| 1227 | return trace_seq_printf(s, "[%s]\n", cmd); | ||
| 1228 | } else { | 1221 | } else { |
| 1229 | if (t_sec(ent)) | 1222 | if (t_sec(ent)) |
| 1230 | return trace_seq_printf(s, "%llu + %u [%s]\n", | 1223 | trace_seq_printf(s, "%llu + %u [%s]\n", |
| 1231 | t_sector(ent), t_sec(ent), cmd); | 1224 | t_sector(ent), t_sec(ent), cmd); |
| 1232 | return trace_seq_printf(s, "[%s]\n", cmd); | 1225 | else |
| 1226 | trace_seq_printf(s, "[%s]\n", cmd); | ||
| 1233 | } | 1227 | } |
| 1234 | } | 1228 | } |
| 1235 | 1229 | ||
| 1236 | static int blk_log_with_error(struct trace_seq *s, | 1230 | static void blk_log_with_error(struct trace_seq *s, |
| 1237 | const struct trace_entry *ent) | 1231 | const struct trace_entry *ent) |
| 1238 | { | 1232 | { |
| 1239 | if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { | 1233 | if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { |
| 1240 | int ret; | 1234 | blk_log_dump_pdu(s, ent); |
| 1241 | 1235 | trace_seq_printf(s, "[%d]\n", t_error(ent)); | |
| 1242 | ret = blk_log_dump_pdu(s, ent); | ||
| 1243 | if (ret) | ||
| 1244 | return trace_seq_printf(s, "[%d]\n", t_error(ent)); | ||
| 1245 | return 0; | ||
| 1246 | } else { | 1236 | } else { |
| 1247 | if (t_sec(ent)) | 1237 | if (t_sec(ent)) |
| 1248 | return trace_seq_printf(s, "%llu + %u [%d]\n", | 1238 | trace_seq_printf(s, "%llu + %u [%d]\n", |
| 1249 | t_sector(ent), | 1239 | t_sector(ent), |
| 1250 | t_sec(ent), t_error(ent)); | 1240 | t_sec(ent), t_error(ent)); |
| 1251 | return trace_seq_printf(s, "%llu [%d]\n", | 1241 | else |
| 1252 | t_sector(ent), t_error(ent)); | 1242 | trace_seq_printf(s, "%llu [%d]\n", |
| 1243 | t_sector(ent), t_error(ent)); | ||
| 1253 | } | 1244 | } |
| 1254 | } | 1245 | } |
| 1255 | 1246 | ||
| 1256 | static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) | 1247 | static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) |
| 1257 | { | 1248 | { |
| 1258 | struct blk_io_trace_remap r = { .device_from = 0, }; | 1249 | struct blk_io_trace_remap r = { .device_from = 0, }; |
| 1259 | 1250 | ||
| 1260 | get_pdu_remap(ent, &r); | 1251 | get_pdu_remap(ent, &r); |
| 1261 | return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", | 1252 | trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", |
| 1262 | t_sector(ent), t_sec(ent), | 1253 | t_sector(ent), t_sec(ent), |
| 1263 | MAJOR(r.device_from), MINOR(r.device_from), | 1254 | MAJOR(r.device_from), MINOR(r.device_from), |
| 1264 | (unsigned long long)r.sector_from); | 1255 | (unsigned long long)r.sector_from); |
| 1265 | } | 1256 | } |
| 1266 | 1257 | ||
| 1267 | static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) | 1258 | static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) |
| 1268 | { | 1259 | { |
| 1269 | char cmd[TASK_COMM_LEN]; | 1260 | char cmd[TASK_COMM_LEN]; |
| 1270 | 1261 | ||
| 1271 | trace_find_cmdline(ent->pid, cmd); | 1262 | trace_find_cmdline(ent->pid, cmd); |
| 1272 | 1263 | ||
| 1273 | return trace_seq_printf(s, "[%s]\n", cmd); | 1264 | trace_seq_printf(s, "[%s]\n", cmd); |
| 1274 | } | 1265 | } |
| 1275 | 1266 | ||
| 1276 | static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) | 1267 | static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) |
| 1277 | { | 1268 | { |
| 1278 | char cmd[TASK_COMM_LEN]; | 1269 | char cmd[TASK_COMM_LEN]; |
| 1279 | 1270 | ||
| 1280 | trace_find_cmdline(ent->pid, cmd); | 1271 | trace_find_cmdline(ent->pid, cmd); |
| 1281 | 1272 | ||
| 1282 | return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); | 1273 | trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); |
| 1283 | } | 1274 | } |
| 1284 | 1275 | ||
| 1285 | static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) | 1276 | static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) |
| 1286 | { | 1277 | { |
| 1287 | char cmd[TASK_COMM_LEN]; | 1278 | char cmd[TASK_COMM_LEN]; |
| 1288 | 1279 | ||
| 1289 | trace_find_cmdline(ent->pid, cmd); | 1280 | trace_find_cmdline(ent->pid, cmd); |
| 1290 | 1281 | ||
| 1291 | return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), | 1282 | trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), |
| 1292 | get_pdu_int(ent), cmd); | 1283 | get_pdu_int(ent), cmd); |
| 1293 | } | 1284 | } |
| 1294 | 1285 | ||
| 1295 | static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) | 1286 | static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) |
| 1296 | { | 1287 | { |
| 1297 | int ret; | ||
| 1298 | const struct blk_io_trace *t = te_blk_io_trace(ent); | 1288 | const struct blk_io_trace *t = te_blk_io_trace(ent); |
| 1299 | 1289 | ||
| 1300 | ret = trace_seq_putmem(s, t + 1, t->pdu_len); | 1290 | trace_seq_putmem(s, t + 1, t->pdu_len); |
| 1301 | if (ret) | 1291 | trace_seq_putc(s, '\n'); |
| 1302 | return trace_seq_putc(s, '\n'); | ||
| 1303 | return ret; | ||
| 1304 | } | 1292 | } |
| 1305 | 1293 | ||
| 1306 | /* | 1294 | /* |
| @@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr) | |||
| 1339 | 1327 | ||
| 1340 | static const struct { | 1328 | static const struct { |
| 1341 | const char *act[2]; | 1329 | const char *act[2]; |
| 1342 | int (*print)(struct trace_seq *s, const struct trace_entry *ent); | 1330 | void (*print)(struct trace_seq *s, const struct trace_entry *ent); |
| 1343 | } what2act[] = { | 1331 | } what2act[] = { |
| 1344 | [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, | 1332 | [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, |
| 1345 | [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, | 1333 | [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, |
| @@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, | |||
| 1364 | struct trace_seq *s = &iter->seq; | 1352 | struct trace_seq *s = &iter->seq; |
| 1365 | const struct blk_io_trace *t; | 1353 | const struct blk_io_trace *t; |
| 1366 | u16 what; | 1354 | u16 what; |
| 1367 | int ret; | ||
| 1368 | bool long_act; | 1355 | bool long_act; |
| 1369 | blk_log_action_t *log_action; | 1356 | blk_log_action_t *log_action; |
| 1370 | 1357 | ||
| @@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, | |||
| 1374 | log_action = classic ? &blk_log_action_classic : &blk_log_action; | 1361 | log_action = classic ? &blk_log_action_classic : &blk_log_action; |
| 1375 | 1362 | ||
| 1376 | if (t->action == BLK_TN_MESSAGE) { | 1363 | if (t->action == BLK_TN_MESSAGE) { |
| 1377 | ret = log_action(iter, long_act ? "message" : "m"); | 1364 | log_action(iter, long_act ? "message" : "m"); |
| 1378 | if (ret) | 1365 | blk_log_msg(s, iter->ent); |
| 1379 | ret = blk_log_msg(s, iter->ent); | ||
| 1380 | goto out; | ||
| 1381 | } | 1366 | } |
| 1382 | 1367 | ||
| 1383 | if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) | 1368 | if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) |
| 1384 | ret = trace_seq_printf(s, "Unknown action %x\n", what); | 1369 | trace_seq_printf(s, "Unknown action %x\n", what); |
| 1385 | else { | 1370 | else { |
| 1386 | ret = log_action(iter, what2act[what].act[long_act]); | 1371 | log_action(iter, what2act[what].act[long_act]); |
| 1387 | if (ret) | 1372 | what2act[what].print(s, iter->ent); |
| 1388 | ret = what2act[what].print(s, iter->ent); | ||
| 1389 | } | 1373 | } |
| 1390 | out: | 1374 | |
| 1391 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 1375 | return trace_handle_return(s); |
| 1392 | } | 1376 | } |
| 1393 | 1377 | ||
| 1394 | static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, | 1378 | static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, |
| @@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, | |||
| 1397 | return print_one_line(iter, false); | 1381 | return print_one_line(iter, false); |
| 1398 | } | 1382 | } |
| 1399 | 1383 | ||
| 1400 | static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) | 1384 | static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) |
| 1401 | { | 1385 | { |
| 1402 | struct trace_seq *s = &iter->seq; | 1386 | struct trace_seq *s = &iter->seq; |
| 1403 | struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; | 1387 | struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; |
| @@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) | |||
| 1407 | .time = iter->ts, | 1391 | .time = iter->ts, |
| 1408 | }; | 1392 | }; |
| 1409 | 1393 | ||
| 1410 | if (!trace_seq_putmem(s, &old, offset)) | 1394 | trace_seq_putmem(s, &old, offset); |
| 1411 | return 0; | 1395 | trace_seq_putmem(s, &t->sector, |
| 1412 | return trace_seq_putmem(s, &t->sector, | 1396 | sizeof(old) - offset + t->pdu_len); |
| 1413 | sizeof(old) - offset + t->pdu_len); | ||
| 1414 | } | 1397 | } |
| 1415 | 1398 | ||
| 1416 | static enum print_line_t | 1399 | static enum print_line_t |
| 1417 | blk_trace_event_print_binary(struct trace_iterator *iter, int flags, | 1400 | blk_trace_event_print_binary(struct trace_iterator *iter, int flags, |
| 1418 | struct trace_event *event) | 1401 | struct trace_event *event) |
| 1419 | { | 1402 | { |
| 1420 | return blk_trace_synthesize_old_trace(iter) ? | 1403 | blk_trace_synthesize_old_trace(iter); |
| 1421 | TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 1404 | |
| 1405 | return trace_handle_return(&iter->seq); | ||
| 1422 | } | 1406 | } |
| 1423 | 1407 | ||
| 1424 | static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) | 1408 | static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) |
| @@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q) | |||
| 1493 | if (atomic_dec_and_test(&blk_probes_ref)) | 1477 | if (atomic_dec_and_test(&blk_probes_ref)) |
| 1494 | blk_unregister_tracepoints(); | 1478 | blk_unregister_tracepoints(); |
| 1495 | 1479 | ||
| 1496 | spin_lock_irq(&running_trace_lock); | ||
| 1497 | list_del(&bt->running_list); | ||
| 1498 | spin_unlock_irq(&running_trace_lock); | ||
| 1499 | blk_trace_free(bt); | 1480 | blk_trace_free(bt); |
| 1500 | return 0; | 1481 | return 0; |
| 1501 | } | 1482 | } |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 31c90fec4158..224e768bdc73 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -387,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, | |||
| 387 | return ret; | 387 | return ret; |
| 388 | } | 388 | } |
| 389 | 389 | ||
| 390 | static void ftrace_update_trampoline(struct ftrace_ops *ops); | ||
| 391 | |||
| 390 | static int __register_ftrace_function(struct ftrace_ops *ops) | 392 | static int __register_ftrace_function(struct ftrace_ops *ops) |
| 391 | { | 393 | { |
| 392 | if (ops->flags & FTRACE_OPS_FL_DELETED) | 394 | if (ops->flags & FTRACE_OPS_FL_DELETED) |
| @@ -416,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
| 416 | if (control_ops_alloc(ops)) | 418 | if (control_ops_alloc(ops)) |
| 417 | return -ENOMEM; | 419 | return -ENOMEM; |
| 418 | add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); | 420 | add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); |
| 421 | /* The control_ops needs the trampoline update */ | ||
| 422 | ops = &control_ops; | ||
| 419 | } else | 423 | } else |
| 420 | add_ftrace_ops(&ftrace_ops_list, ops); | 424 | add_ftrace_ops(&ftrace_ops_list, ops); |
| 421 | 425 | ||
| 426 | ftrace_update_trampoline(ops); | ||
| 427 | |||
| 422 | if (ftrace_enabled) | 428 | if (ftrace_enabled) |
| 423 | update_ftrace_function(); | 429 | update_ftrace_function(); |
| 424 | 430 | ||
| @@ -565,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2) | |||
| 565 | static int function_stat_headers(struct seq_file *m) | 571 | static int function_stat_headers(struct seq_file *m) |
| 566 | { | 572 | { |
| 567 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 573 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 568 | seq_printf(m, " Function " | 574 | seq_puts(m, " Function " |
| 569 | "Hit Time Avg s^2\n" | 575 | "Hit Time Avg s^2\n" |
| 570 | " -------- " | 576 | " -------- " |
| 571 | "--- ---- --- ---\n"); | 577 | "--- ---- --- ---\n"); |
| 572 | #else | 578 | #else |
| 573 | seq_printf(m, " Function Hit\n" | 579 | seq_puts(m, " Function Hit\n" |
| 574 | " -------- ---\n"); | 580 | " -------- ---\n"); |
| 575 | #endif | 581 | #endif |
| 576 | return 0; | 582 | return 0; |
| 577 | } | 583 | } |
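The same substitution recurs throughout this patch: seq_puts() (and seq_putc() for a single character) replaces seq_printf() wherever the string is constant, skipping format parsing entirely. Roughly:

	seq_puts(m, "#### all functions enabled ####\n");	/* constant string */
	seq_printf(m, " (%ld)", ftrace_rec_count(rec));		/* still needs a format */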
| @@ -598,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
| 598 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); | 604 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); |
| 599 | 605 | ||
| 600 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 606 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 601 | seq_printf(m, " "); | 607 | seq_puts(m, " "); |
| 602 | avg = rec->time; | 608 | avg = rec->time; |
| 603 | do_div(avg, rec->counter); | 609 | do_div(avg, rec->counter); |
| 604 | 610 | ||
| @@ -1111,6 +1117,43 @@ static struct ftrace_ops global_ops = { | |||
| 1111 | FTRACE_OPS_FL_INITIALIZED, | 1117 | FTRACE_OPS_FL_INITIALIZED, |
| 1112 | }; | 1118 | }; |
| 1113 | 1119 | ||
| 1120 | /* | ||
| 1121 | * This is used by __kernel_text_address() to return true if the | ||
| 1122 | * address is on a dynamically allocated trampoline that would | ||
| 1123 | * not return true for either core_kernel_text() or | ||
| 1124 | * is_module_text_address(). | ||
| 1125 | */ | ||
| 1126 | bool is_ftrace_trampoline(unsigned long addr) | ||
| 1127 | { | ||
| 1128 | struct ftrace_ops *op; | ||
| 1129 | bool ret = false; | ||
| 1130 | |||
| 1131 | /* | ||
| 1132 | * Some of the ops may be dynamically allocated; | ||
| 1133 | * they are freed after a synchronize_sched(). | ||
| 1134 | */ | ||
| 1135 | preempt_disable_notrace(); | ||
| 1136 | |||
| 1137 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 1138 | /* | ||
| 1139 | * This is to check for dynamically allocated trampolines. | ||
| 1140 | * Trampolines that are in kernel text will have | ||
| 1141 | * core_kernel_text() return true. | ||
| 1142 | */ | ||
| 1143 | if (op->trampoline && op->trampoline_size) | ||
| 1144 | if (addr >= op->trampoline && | ||
| 1145 | addr < op->trampoline + op->trampoline_size) { | ||
| 1146 | ret = true; | ||
| 1147 | goto out; | ||
| 1148 | } | ||
| 1149 | } while_for_each_ftrace_op(op); | ||
| 1150 | |||
| 1151 | out: | ||
| 1152 | preempt_enable_notrace(); | ||
| 1153 | |||
| 1154 | return ret; | ||
| 1155 | } | ||
| 1156 | |||
| 1114 | struct ftrace_page { | 1157 | struct ftrace_page { |
| 1115 | struct ftrace_page *next; | 1158 | struct ftrace_page *next; |
| 1116 | struct dyn_ftrace *records; | 1159 | struct dyn_ftrace *records; |
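The comment above names the intended consumer: __kernel_text_address() gains one more check so that stack unwinding while executing on a dynamically allocated trampoline still resolves the address as kernel text. The caller side lives in kernel/extable.c and is expected to look roughly like:

	int __kernel_text_address(unsigned long addr)
	{
		if (core_kernel_text(addr))
			return 1;
		if (is_module_text_address(addr))
			return 1;
		/* dynamically allocated ftrace trampolines */
		if (is_ftrace_trampoline(addr))
			return 1;
		return 0;
	}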
| @@ -1315,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); | |||
| 1315 | static void | 1358 | static void |
| 1316 | ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); | 1359 | ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); |
| 1317 | 1360 | ||
| 1361 | static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, | ||
| 1362 | struct ftrace_hash *new_hash); | ||
| 1363 | |||
| 1318 | static int | 1364 | static int |
| 1319 | ftrace_hash_move(struct ftrace_ops *ops, int enable, | 1365 | ftrace_hash_move(struct ftrace_ops *ops, int enable, |
| 1320 | struct ftrace_hash **dst, struct ftrace_hash *src) | 1366 | struct ftrace_hash **dst, struct ftrace_hash *src) |
| @@ -1325,8 +1371,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
| 1325 | struct ftrace_hash *new_hash; | 1371 | struct ftrace_hash *new_hash; |
| 1326 | int size = src->count; | 1372 | int size = src->count; |
| 1327 | int bits = 0; | 1373 | int bits = 0; |
| 1374 | int ret; | ||
| 1328 | int i; | 1375 | int i; |
| 1329 | 1376 | ||
| 1377 | /* Reject setting notrace hash on IPMODIFY ftrace_ops */ | ||
| 1378 | if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) | ||
| 1379 | return -EINVAL; | ||
| 1380 | |||
| 1330 | /* | 1381 | /* |
| 1331 | * If the new source is empty, just free dst and assign it | 1382 | * If the new source is empty, just free dst and assign it |
| 1332 | * the empty_hash. | 1383 | * the empty_hash. |
| @@ -1360,6 +1411,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
| 1360 | } | 1411 | } |
| 1361 | 1412 | ||
| 1362 | update: | 1413 | update: |
| 1414 | /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ | ||
| 1415 | if (enable) { | ||
| 1416 | /* IPMODIFY should be updated only when filter_hash updating */ | ||
| 1417 | ret = ftrace_hash_ipmodify_update(ops, new_hash); | ||
| 1418 | if (ret < 0) { | ||
| 1419 | free_ftrace_hash(new_hash); | ||
| 1420 | return ret; | ||
| 1421 | } | ||
| 1422 | } | ||
| 1423 | |||
| 1363 | /* | 1424 | /* |
| 1364 | * Remove the current set, update the hash and add | 1425 | * Remove the current set, update the hash and add |
| 1365 | * them back. | 1426 | * them back. |
| @@ -1724,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, | |||
| 1724 | ftrace_hash_rec_update_modify(ops, filter_hash, 1); | 1785 | ftrace_hash_rec_update_modify(ops, filter_hash, 1); |
| 1725 | } | 1786 | } |
| 1726 | 1787 | ||
| 1788 | /* | ||
| 1789 | * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK | ||
| 1790 | * or no-needed to update, -EBUSY if it detects a conflict of the flag | ||
| 1791 | * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs. | ||
| 1792 | * Note that old_hash and new_hash has below meanings | ||
| 1793 | * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected) | ||
| 1794 | * - If the hash is EMPTY_HASH, it hits nothing | ||
| 1795 | * - Anything else hits the recs which match the hash entries. | ||
| 1796 | */ | ||
| 1797 | static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, | ||
| 1798 | struct ftrace_hash *old_hash, | ||
| 1799 | struct ftrace_hash *new_hash) | ||
| 1800 | { | ||
| 1801 | struct ftrace_page *pg; | ||
| 1802 | struct dyn_ftrace *rec, *end = NULL; | ||
| 1803 | int in_old, in_new; | ||
| 1804 | |||
| 1805 | /* Only update if the ops has been registered */ | ||
| 1806 | if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) | ||
| 1807 | return 0; | ||
| 1808 | |||
| 1809 | if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) | ||
| 1810 | return 0; | ||
| 1811 | |||
| 1812 | /* | ||
| 1813 | * Since IPMODIFY is a very address-sensitive action, we do not | ||
| 1814 | * allow ftrace_ops to set all functions to a new hash. | ||
| 1815 | */ | ||
| 1816 | if (!new_hash || !old_hash) | ||
| 1817 | return -EINVAL; | ||
| 1818 | |||
| 1819 | /* Update rec->flags */ | ||
| 1820 | do_for_each_ftrace_rec(pg, rec) { | ||
| 1821 | /* We need to update only differences of filter_hash */ | ||
| 1822 | in_old = !!ftrace_lookup_ip(old_hash, rec->ip); | ||
| 1823 | in_new = !!ftrace_lookup_ip(new_hash, rec->ip); | ||
| 1824 | if (in_old == in_new) | ||
| 1825 | continue; | ||
| 1826 | |||
| 1827 | if (in_new) { | ||
| 1828 | /* New entries must ensure no others are using it */ | ||
| 1829 | if (rec->flags & FTRACE_FL_IPMODIFY) | ||
| 1830 | goto rollback; | ||
| 1831 | rec->flags |= FTRACE_FL_IPMODIFY; | ||
| 1832 | } else /* Removed entry */ | ||
| 1833 | rec->flags &= ~FTRACE_FL_IPMODIFY; | ||
| 1834 | } while_for_each_ftrace_rec(); | ||
| 1835 | |||
| 1836 | return 0; | ||
| 1837 | |||
| 1838 | rollback: | ||
| 1839 | end = rec; | ||
| 1840 | |||
| 1841 | /* Roll back what we did above */ | ||
| 1842 | do_for_each_ftrace_rec(pg, rec) { | ||
| 1843 | if (rec == end) | ||
| 1844 | goto err_out; | ||
| 1845 | |||
| 1846 | in_old = !!ftrace_lookup_ip(old_hash, rec->ip); | ||
| 1847 | in_new = !!ftrace_lookup_ip(new_hash, rec->ip); | ||
| 1848 | if (in_old == in_new) | ||
| 1849 | continue; | ||
| 1850 | |||
| 1851 | if (in_new) | ||
| 1852 | rec->flags &= ~FTRACE_FL_IPMODIFY; | ||
| 1853 | else | ||
| 1854 | rec->flags |= FTRACE_FL_IPMODIFY; | ||
| 1855 | } while_for_each_ftrace_rec(); | ||
| 1856 | |||
| 1857 | err_out: | ||
| 1858 | return -EBUSY; | ||
| 1859 | } | ||
| 1860 | |||
| 1861 | static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) | ||
| 1862 | { | ||
| 1863 | struct ftrace_hash *hash = ops->func_hash->filter_hash; | ||
| 1864 | |||
| 1865 | if (ftrace_hash_empty(hash)) | ||
| 1866 | hash = NULL; | ||
| 1867 | |||
| 1868 | return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); | ||
| 1869 | } | ||
| 1870 | |||
| 1871 | /* Disabling always succeeds */ | ||
| 1872 | static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) | ||
| 1873 | { | ||
| 1874 | struct ftrace_hash *hash = ops->func_hash->filter_hash; | ||
| 1875 | |||
| 1876 | if (ftrace_hash_empty(hash)) | ||
| 1877 | hash = NULL; | ||
| 1878 | |||
| 1879 | __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); | ||
| 1880 | } | ||
| 1881 | |||
| 1882 | static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, | ||
| 1883 | struct ftrace_hash *new_hash) | ||
| 1884 | { | ||
| 1885 | struct ftrace_hash *old_hash = ops->func_hash->filter_hash; | ||
| 1886 | |||
| 1887 | if (ftrace_hash_empty(old_hash)) | ||
| 1888 | old_hash = NULL; | ||
| 1889 | |||
| 1890 | if (ftrace_hash_empty(new_hash)) | ||
| 1891 | new_hash = NULL; | ||
| 1892 | |||
| 1893 | return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); | ||
| 1894 | } | ||
| 1895 | |||
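For context, IPMODIFY is aimed at callbacks (live-patching style tools, for instance) that rewrite regs->ip, so at most one such ops may claim any given function. A hedged registration sketch — my_handler, my_ops and target_ip are invented names, and the flag itself is what this patch introduces:

	static void notrace my_handler(unsigned long ip, unsigned long parent_ip,
				       struct ftrace_ops *op, struct pt_regs *regs)
	{
		/* an IPMODIFY callback may redirect execution via regs->ip */
	}

	static struct ftrace_ops my_ops = {
		.func  = my_handler,
		.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
	};

	/* restrict to one target, then register; a second IPMODIFY user on the
	 * same function is now rejected with -EBUSY */
	ftrace_set_filter_ip(&my_ops, target_ip, 0, 0);
	register_ftrace_function(&my_ops);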
| 1727 | static void print_ip_ins(const char *fmt, unsigned char *p) | 1896 | static void print_ip_ins(const char *fmt, unsigned char *p) |
| 1728 | { | 1897 | { |
| 1729 | int i; | 1898 | int i; |
| @@ -1734,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p) | |||
| 1734 | printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); | 1903 | printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); |
| 1735 | } | 1904 | } |
| 1736 | 1905 | ||
| 1906 | static struct ftrace_ops * | ||
| 1907 | ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); | ||
| 1908 | |||
| 1737 | /** | 1909 | /** |
| 1738 | * ftrace_bug - report and shutdown function tracer | 1910 | * ftrace_bug - report and shutdown function tracer |
| 1739 | * @failed: The failed type (EFAULT, EINVAL, EPERM) | 1911 | * @failed: The failed type (EFAULT, EINVAL, EPERM) |
| 1740 | * @ip: The address that failed | 1912 | * @rec: The record that failed |
| 1741 | * | 1913 | * |
| 1742 | * The arch code that enables or disables the function tracing | 1914 | * The arch code that enables or disables the function tracing |
| 1743 | * can call ftrace_bug() when it has detected a problem in | 1915 | * can call ftrace_bug() when it has detected a problem in |
| @@ -1746,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p) | |||
| 1746 | * EINVAL - if what is read at @ip is not what was expected | 1918 | * EINVAL - if what is read at @ip is not what was expected |
| 1746 | * EPERM - if the problem happens on writing to the @ip address | 1919 | * EPERM - if the problem happens on writing to the @ip address |
| 1748 | */ | 1920 | */ |
| 1749 | void ftrace_bug(int failed, unsigned long ip) | 1921 | void ftrace_bug(int failed, struct dyn_ftrace *rec) |
| 1750 | { | 1922 | { |
| 1923 | unsigned long ip = rec ? rec->ip : 0; | ||
| 1924 | |||
| 1751 | switch (failed) { | 1925 | switch (failed) { |
| 1752 | case -EFAULT: | 1926 | case -EFAULT: |
| 1753 | FTRACE_WARN_ON_ONCE(1); | 1927 | FTRACE_WARN_ON_ONCE(1); |
| @@ -1759,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip) | |||
| 1759 | pr_info("ftrace failed to modify "); | 1933 | pr_info("ftrace failed to modify "); |
| 1760 | print_ip_sym(ip); | 1934 | print_ip_sym(ip); |
| 1761 | print_ip_ins(" actual: ", (unsigned char *)ip); | 1935 | print_ip_ins(" actual: ", (unsigned char *)ip); |
| 1762 | printk(KERN_CONT "\n"); | 1936 | pr_cont("\n"); |
| 1763 | break; | 1937 | break; |
| 1764 | case -EPERM: | 1938 | case -EPERM: |
| 1765 | FTRACE_WARN_ON_ONCE(1); | 1939 | FTRACE_WARN_ON_ONCE(1); |
| @@ -1771,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip) | |||
| 1771 | pr_info("ftrace faulted on unknown error "); | 1945 | pr_info("ftrace faulted on unknown error "); |
| 1772 | print_ip_sym(ip); | 1946 | print_ip_sym(ip); |
| 1773 | } | 1947 | } |
| 1948 | if (rec) { | ||
| 1949 | struct ftrace_ops *ops = NULL; | ||
| 1950 | |||
| 1951 | pr_info("ftrace record flags: %lx\n", rec->flags); | ||
| 1952 | pr_cont(" (%ld)%s", ftrace_rec_count(rec), | ||
| 1953 | rec->flags & FTRACE_FL_REGS ? " R" : " "); | ||
| 1954 | if (rec->flags & FTRACE_FL_TRAMP_EN) { | ||
| 1955 | ops = ftrace_find_tramp_ops_any(rec); | ||
| 1956 | if (ops) | ||
| 1957 | pr_cont("\ttramp: %pS", | ||
| 1958 | (void *)ops->trampoline); | ||
| 1959 | else | ||
| 1960 | pr_cont("\ttramp: ERROR!"); | ||
| 1961 | |||
| 1962 | } | ||
| 1963 | ip = ftrace_get_addr_curr(rec); | ||
| 1964 | pr_cont(" expected tramp: %lx\n", ip); | ||
| 1965 | } | ||
| 1774 | } | 1966 | } |
| 1775 | 1967 | ||
| 1776 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | 1968 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) |
| @@ -2093,7 +2285,7 @@ void __weak ftrace_replace_code(int enable) | |||
| 2093 | do_for_each_ftrace_rec(pg, rec) { | 2285 | do_for_each_ftrace_rec(pg, rec) { |
| 2094 | failed = __ftrace_replace_code(rec, enable); | 2286 | failed = __ftrace_replace_code(rec, enable); |
| 2095 | if (failed) { | 2287 | if (failed) { |
| 2096 | ftrace_bug(failed, rec->ip); | 2288 | ftrace_bug(failed, rec); |
| 2097 | /* Stop processing */ | 2289 | /* Stop processing */ |
| 2098 | return; | 2290 | return; |
| 2099 | } | 2291 | } |
| @@ -2175,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) | |||
| 2175 | static int | 2367 | static int |
| 2176 | ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) | 2368 | ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) |
| 2177 | { | 2369 | { |
| 2178 | unsigned long ip; | ||
| 2179 | int ret; | 2370 | int ret; |
| 2180 | 2371 | ||
| 2181 | ip = rec->ip; | ||
| 2182 | |||
| 2183 | if (unlikely(ftrace_disabled)) | 2372 | if (unlikely(ftrace_disabled)) |
| 2184 | return 0; | 2373 | return 0; |
| 2185 | 2374 | ||
| 2186 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); | 2375 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); |
| 2187 | if (ret) { | 2376 | if (ret) { |
| 2188 | ftrace_bug(ret, ip); | 2377 | ftrace_bug(ret, rec); |
| 2189 | return 0; | 2378 | return 0; |
| 2190 | } | 2379 | } |
| 2191 | return 1; | 2380 | return 1; |
| @@ -2308,18 +2497,24 @@ static void ftrace_run_update_code(int command) | |||
| 2308 | } | 2497 | } |
| 2309 | 2498 | ||
| 2310 | static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, | 2499 | static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, |
| 2311 | struct ftrace_hash *old_hash) | 2500 | struct ftrace_ops_hash *old_hash) |
| 2312 | { | 2501 | { |
| 2313 | ops->flags |= FTRACE_OPS_FL_MODIFYING; | 2502 | ops->flags |= FTRACE_OPS_FL_MODIFYING; |
| 2314 | ops->old_hash.filter_hash = old_hash; | 2503 | ops->old_hash.filter_hash = old_hash->filter_hash; |
| 2504 | ops->old_hash.notrace_hash = old_hash->notrace_hash; | ||
| 2315 | ftrace_run_update_code(command); | 2505 | ftrace_run_update_code(command); |
| 2316 | ops->old_hash.filter_hash = NULL; | 2506 | ops->old_hash.filter_hash = NULL; |
| 2507 | ops->old_hash.notrace_hash = NULL; | ||
| 2317 | ops->flags &= ~FTRACE_OPS_FL_MODIFYING; | 2508 | ops->flags &= ~FTRACE_OPS_FL_MODIFYING; |
| 2318 | } | 2509 | } |
| 2319 | 2510 | ||
| 2320 | static ftrace_func_t saved_ftrace_func; | 2511 | static ftrace_func_t saved_ftrace_func; |
| 2321 | static int ftrace_start_up; | 2512 | static int ftrace_start_up; |
| 2322 | 2513 | ||
| 2514 | void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) | ||
| 2515 | { | ||
| 2516 | } | ||
| 2517 | |||
| 2323 | static void control_ops_free(struct ftrace_ops *ops) | 2518 | static void control_ops_free(struct ftrace_ops *ops) |
| 2324 | { | 2519 | { |
| 2325 | free_percpu(ops->disabled); | 2520 | free_percpu(ops->disabled); |
| @@ -2369,6 +2564,15 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) | |||
| 2369 | */ | 2564 | */ |
| 2370 | ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; | 2565 | ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; |
| 2371 | 2566 | ||
| 2567 | ret = ftrace_hash_ipmodify_enable(ops); | ||
| 2568 | if (ret < 0) { | ||
| 2569 | /* Rollback registration process */ | ||
| 2570 | __unregister_ftrace_function(ops); | ||
| 2571 | ftrace_start_up--; | ||
| 2572 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
| 2573 | return ret; | ||
| 2574 | } | ||
| 2575 | |||
| 2372 | ftrace_hash_rec_enable(ops, 1); | 2576 | ftrace_hash_rec_enable(ops, 1); |
| 2373 | 2577 | ||
| 2374 | ftrace_startup_enable(command); | 2578 | ftrace_startup_enable(command); |
| @@ -2397,6 +2601,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
| 2397 | */ | 2601 | */ |
| 2398 | WARN_ON_ONCE(ftrace_start_up < 0); | 2602 | WARN_ON_ONCE(ftrace_start_up < 0); |
| 2399 | 2603 | ||
| 2604 | /* Disabling ipmodify never fails */ | ||
| 2605 | ftrace_hash_ipmodify_disable(ops); | ||
| 2400 | ftrace_hash_rec_disable(ops, 1); | 2606 | ftrace_hash_rec_disable(ops, 1); |
| 2401 | 2607 | ||
| 2402 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | 2608 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; |
| @@ -2471,6 +2677,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
| 2471 | if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { | 2677 | if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { |
| 2472 | schedule_on_each_cpu(ftrace_sync); | 2678 | schedule_on_each_cpu(ftrace_sync); |
| 2473 | 2679 | ||
| 2680 | arch_ftrace_trampoline_free(ops); | ||
| 2681 | |||
| 2474 | if (ops->flags & FTRACE_OPS_FL_CONTROL) | 2682 | if (ops->flags & FTRACE_OPS_FL_CONTROL) |
| 2475 | control_ops_free(ops); | 2683 | control_ops_free(ops); |
| 2476 | } | 2684 | } |
| @@ -2623,7 +2831,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) | |||
| 2623 | if (ftrace_start_up && cnt) { | 2831 | if (ftrace_start_up && cnt) { |
| 2624 | int failed = __ftrace_replace_code(p, 1); | 2832 | int failed = __ftrace_replace_code(p, 1); |
| 2625 | if (failed) | 2833 | if (failed) |
| 2626 | ftrace_bug(failed, p->ip); | 2834 | ftrace_bug(failed, p); |
| 2627 | } | 2835 | } |
| 2628 | } | 2836 | } |
| 2629 | } | 2837 | } |
| @@ -2948,6 +3156,22 @@ static void t_stop(struct seq_file *m, void *p) | |||
| 2948 | mutex_unlock(&ftrace_lock); | 3156 | mutex_unlock(&ftrace_lock); |
| 2949 | } | 3157 | } |
| 2950 | 3158 | ||
| 3159 | void * __weak | ||
| 3160 | arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) | ||
| 3161 | { | ||
| 3162 | return NULL; | ||
| 3163 | } | ||
| 3164 | |||
| 3165 | static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, | ||
| 3166 | struct dyn_ftrace *rec) | ||
| 3167 | { | ||
| 3168 | void *ptr; | ||
| 3169 | |||
| 3170 | ptr = arch_ftrace_trampoline_func(ops, rec); | ||
| 3171 | if (ptr) | ||
| 3172 | seq_printf(m, " ->%pS", ptr); | ||
| 3173 | } | ||
| 3174 | |||
| 2951 | static int t_show(struct seq_file *m, void *v) | 3175 | static int t_show(struct seq_file *m, void *v) |
| 2952 | { | 3176 | { |
| 2953 | struct ftrace_iterator *iter = m->private; | 3177 | struct ftrace_iterator *iter = m->private; |
| @@ -2958,9 +3182,9 @@ static int t_show(struct seq_file *m, void *v) | |||
| 2958 | 3182 | ||
| 2959 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 3183 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
| 2960 | if (iter->flags & FTRACE_ITER_NOTRACE) | 3184 | if (iter->flags & FTRACE_ITER_NOTRACE) |
| 2961 | seq_printf(m, "#### no functions disabled ####\n"); | 3185 | seq_puts(m, "#### no functions disabled ####\n"); |
| 2962 | else | 3186 | else |
| 2963 | seq_printf(m, "#### all functions enabled ####\n"); | 3187 | seq_puts(m, "#### all functions enabled ####\n"); |
| 2964 | return 0; | 3188 | return 0; |
| 2965 | } | 3189 | } |
| 2966 | 3190 | ||
| @@ -2971,22 +3195,25 @@ static int t_show(struct seq_file *m, void *v) | |||
| 2971 | 3195 | ||
| 2972 | seq_printf(m, "%ps", (void *)rec->ip); | 3196 | seq_printf(m, "%ps", (void *)rec->ip); |
| 2973 | if (iter->flags & FTRACE_ITER_ENABLED) { | 3197 | if (iter->flags & FTRACE_ITER_ENABLED) { |
| 2974 | seq_printf(m, " (%ld)%s", | 3198 | struct ftrace_ops *ops = NULL; |
| 3199 | |||
| 3200 | seq_printf(m, " (%ld)%s%s", | ||
| 2975 | ftrace_rec_count(rec), | 3201 | ftrace_rec_count(rec), |
| 2976 | rec->flags & FTRACE_FL_REGS ? " R" : " "); | 3202 | rec->flags & FTRACE_FL_REGS ? " R" : " ", |
| 3203 | rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); | ||
| 2977 | if (rec->flags & FTRACE_FL_TRAMP_EN) { | 3204 | if (rec->flags & FTRACE_FL_TRAMP_EN) { |
| 2978 | struct ftrace_ops *ops; | ||
| 2979 | |||
| 2980 | ops = ftrace_find_tramp_ops_any(rec); | 3205 | ops = ftrace_find_tramp_ops_any(rec); |
| 2981 | if (ops) | 3206 | if (ops) |
| 2982 | seq_printf(m, "\ttramp: %pS", | 3207 | seq_printf(m, "\ttramp: %pS", |
| 2983 | (void *)ops->trampoline); | 3208 | (void *)ops->trampoline); |
| 2984 | else | 3209 | else |
| 2985 | seq_printf(m, "\ttramp: ERROR!"); | 3210 | seq_puts(m, "\ttramp: ERROR!"); |
| 3211 | |||
| 2986 | } | 3212 | } |
| 3213 | add_trampoline_func(m, ops, rec); | ||
| 2987 | } | 3214 | } |
| 2988 | 3215 | ||
| 2989 | seq_printf(m, "\n"); | 3216 | seq_putc(m, '\n'); |
| 2990 | 3217 | ||
| 2991 | return 0; | 3218 | return 0; |
| 2992 | } | 3219 | } |
| @@ -3020,9 +3247,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file) | |||
| 3020 | { | 3247 | { |
| 3021 | struct ftrace_iterator *iter; | 3248 | struct ftrace_iterator *iter; |
| 3022 | 3249 | ||
| 3023 | if (unlikely(ftrace_disabled)) | ||
| 3024 | return -ENODEV; | ||
| 3025 | |||
| 3026 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); | 3250 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
| 3027 | if (iter) { | 3251 | if (iter) { |
| 3028 | iter->pg = ftrace_pages_start; | 3252 | iter->pg = ftrace_pages_start; |
| @@ -3357,7 +3581,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = | |||
| 3357 | 3581 | ||
| 3358 | static int ftrace_probe_registered; | 3582 | static int ftrace_probe_registered; |
| 3359 | 3583 | ||
| 3360 | static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) | 3584 | static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) |
| 3361 | { | 3585 | { |
| 3362 | int ret; | 3586 | int ret; |
| 3363 | int i; | 3587 | int i; |
| @@ -3415,6 +3639,7 @@ int | |||
| 3415 | register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | 3639 | register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, |
| 3416 | void *data) | 3640 | void *data) |
| 3417 | { | 3641 | { |
| 3642 | struct ftrace_ops_hash old_hash_ops; | ||
| 3418 | struct ftrace_func_probe *entry; | 3643 | struct ftrace_func_probe *entry; |
| 3419 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; | 3644 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; |
| 3420 | struct ftrace_hash *old_hash = *orig_hash; | 3645 | struct ftrace_hash *old_hash = *orig_hash; |
| @@ -3436,6 +3661,10 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3436 | 3661 | ||
| 3437 | mutex_lock(&trace_probe_ops.func_hash->regex_lock); | 3662 | mutex_lock(&trace_probe_ops.func_hash->regex_lock); |
| 3438 | 3663 | ||
| 3664 | old_hash_ops.filter_hash = old_hash; | ||
| 3665 | /* Probes only have filters */ | ||
| 3666 | old_hash_ops.notrace_hash = NULL; | ||
| 3667 | |||
| 3439 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); | 3668 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); |
| 3440 | if (!hash) { | 3669 | if (!hash) { |
| 3441 | count = -ENOMEM; | 3670 | count = -ENOMEM; |
| @@ -3496,7 +3725,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3496 | 3725 | ||
| 3497 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); | 3726 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); |
| 3498 | 3727 | ||
| 3499 | __enable_ftrace_function_probe(old_hash); | 3728 | __enable_ftrace_function_probe(&old_hash_ops); |
| 3500 | 3729 | ||
| 3501 | if (!ret) | 3730 | if (!ret) |
| 3502 | free_ftrace_hash_rcu(old_hash); | 3731 | free_ftrace_hash_rcu(old_hash); |
| @@ -3784,10 +4013,34 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) | |||
| 3784 | } | 4013 | } |
| 3785 | 4014 | ||
| 3786 | static void ftrace_ops_update_code(struct ftrace_ops *ops, | 4015 | static void ftrace_ops_update_code(struct ftrace_ops *ops, |
| 3787 | struct ftrace_hash *old_hash) | 4016 | struct ftrace_ops_hash *old_hash) |
| 3788 | { | 4017 | { |
| 3789 | if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) | 4018 | struct ftrace_ops *op; |
| 4019 | |||
| 4020 | if (!ftrace_enabled) | ||
| 4021 | return; | ||
| 4022 | |||
| 4023 | if (ops->flags & FTRACE_OPS_FL_ENABLED) { | ||
| 3790 | ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); | 4024 | ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); |
| 4025 | return; | ||
| 4026 | } | ||
| 4027 | |||
| 4028 | /* | ||
| 4029 | * If this is the shared global_ops filter, then we need to | ||
| 4030 | * check if there is another ops that shares it, is enabled. | ||
| 4031 | * If so, we still need to run the modify code. | ||
| 4032 | */ | ||
| 4033 | if (ops->func_hash != &global_ops.local_hash) | ||
| 4034 | return; | ||
| 4035 | |||
| 4036 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 4037 | if (op->func_hash == &global_ops.local_hash && | ||
| 4038 | op->flags & FTRACE_OPS_FL_ENABLED) { | ||
| 4039 | ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); | ||
| 4040 | /* Only need to do this once */ | ||
| 4041 | return; | ||
| 4042 | } | ||
| 4043 | } while_for_each_ftrace_op(op); | ||
| 3791 | } | 4044 | } |
| 3792 | 4045 | ||
| 3793 | static int | 4046 | static int |
| @@ -3795,6 +4048,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
| 3795 | unsigned long ip, int remove, int reset, int enable) | 4048 | unsigned long ip, int remove, int reset, int enable) |
| 3796 | { | 4049 | { |
| 3797 | struct ftrace_hash **orig_hash; | 4050 | struct ftrace_hash **orig_hash; |
| 4051 | struct ftrace_ops_hash old_hash_ops; | ||
| 3798 | struct ftrace_hash *old_hash; | 4052 | struct ftrace_hash *old_hash; |
| 3799 | struct ftrace_hash *hash; | 4053 | struct ftrace_hash *hash; |
| 3800 | int ret; | 4054 | int ret; |
| @@ -3831,9 +4085,11 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
| 3831 | 4085 | ||
| 3832 | mutex_lock(&ftrace_lock); | 4086 | mutex_lock(&ftrace_lock); |
| 3833 | old_hash = *orig_hash; | 4087 | old_hash = *orig_hash; |
| 4088 | old_hash_ops.filter_hash = ops->func_hash->filter_hash; | ||
| 4089 | old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; | ||
| 3834 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | 4090 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); |
| 3835 | if (!ret) { | 4091 | if (!ret) { |
| 3836 | ftrace_ops_update_code(ops, old_hash); | 4092 | ftrace_ops_update_code(ops, &old_hash_ops); |
| 3837 | free_ftrace_hash_rcu(old_hash); | 4093 | free_ftrace_hash_rcu(old_hash); |
| 3838 | } | 4094 | } |
| 3839 | mutex_unlock(&ftrace_lock); | 4095 | mutex_unlock(&ftrace_lock); |
| @@ -3975,6 +4231,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; | |||
| 3975 | static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; | 4231 | static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; |
| 3976 | static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); | 4232 | static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); |
| 3977 | 4233 | ||
| 4234 | static unsigned long save_global_trampoline; | ||
| 4235 | static unsigned long save_global_flags; | ||
| 4236 | |||
| 3978 | static int __init set_graph_function(char *str) | 4237 | static int __init set_graph_function(char *str) |
| 3979 | { | 4238 | { |
| 3980 | strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); | 4239 | strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); |
| @@ -4042,6 +4301,7 @@ static void __init set_ftrace_early_filters(void) | |||
| 4042 | int ftrace_regex_release(struct inode *inode, struct file *file) | 4301 | int ftrace_regex_release(struct inode *inode, struct file *file) |
| 4043 | { | 4302 | { |
| 4044 | struct seq_file *m = (struct seq_file *)file->private_data; | 4303 | struct seq_file *m = (struct seq_file *)file->private_data; |
| 4304 | struct ftrace_ops_hash old_hash_ops; | ||
| 4045 | struct ftrace_iterator *iter; | 4305 | struct ftrace_iterator *iter; |
| 4046 | struct ftrace_hash **orig_hash; | 4306 | struct ftrace_hash **orig_hash; |
| 4047 | struct ftrace_hash *old_hash; | 4307 | struct ftrace_hash *old_hash; |
| @@ -4075,10 +4335,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
| 4075 | 4335 | ||
| 4076 | mutex_lock(&ftrace_lock); | 4336 | mutex_lock(&ftrace_lock); |
| 4077 | old_hash = *orig_hash; | 4337 | old_hash = *orig_hash; |
| 4338 | old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; | ||
| 4339 | old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash; | ||
| 4078 | ret = ftrace_hash_move(iter->ops, filter_hash, | 4340 | ret = ftrace_hash_move(iter->ops, filter_hash, |
| 4079 | orig_hash, iter->hash); | 4341 | orig_hash, iter->hash); |
| 4080 | if (!ret) { | 4342 | if (!ret) { |
| 4081 | ftrace_ops_update_code(iter->ops, old_hash); | 4343 | ftrace_ops_update_code(iter->ops, &old_hash_ops); |
| 4082 | free_ftrace_hash_rcu(old_hash); | 4344 | free_ftrace_hash_rcu(old_hash); |
| 4083 | } | 4345 | } |
| 4084 | mutex_unlock(&ftrace_lock); | 4346 | mutex_unlock(&ftrace_lock); |
| @@ -4183,9 +4445,9 @@ static int g_show(struct seq_file *m, void *v) | |||
| 4183 | struct ftrace_graph_data *fgd = m->private; | 4445 | struct ftrace_graph_data *fgd = m->private; |
| 4184 | 4446 | ||
| 4185 | if (fgd->table == ftrace_graph_funcs) | 4447 | if (fgd->table == ftrace_graph_funcs) |
| 4186 | seq_printf(m, "#### all functions enabled ####\n"); | 4448 | seq_puts(m, "#### all functions enabled ####\n"); |
| 4187 | else | 4449 | else |
| 4188 | seq_printf(m, "#### no functions disabled ####\n"); | 4450 | seq_puts(m, "#### no functions disabled ####\n"); |
| 4189 | return 0; | 4451 | return 0; |
| 4190 | } | 4452 | } |
| 4191 | 4453 | ||
| @@ -4696,6 +4958,32 @@ void __init ftrace_init(void) | |||
| 4696 | ftrace_disabled = 1; | 4958 | ftrace_disabled = 1; |
| 4697 | } | 4959 | } |
| 4698 | 4960 | ||
| 4961 | /* Do nothing if arch does not support this */ | ||
| 4962 | void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) | ||
| 4963 | { | ||
| 4964 | } | ||
| 4965 | |||
| 4966 | static void ftrace_update_trampoline(struct ftrace_ops *ops) | ||
| 4967 | { | ||
| 4968 | |||
| 4969 | /* | ||
| 4970 | * Currently there's no safe way to free a trampoline when the kernel | ||
| 4971 | * is configured with PREEMPT. That is because a task could be preempted | ||
| 4972 | * when it jumped to the trampoline, it may be preempted for a long time | ||
| 4973 | * depending on the system load, and currently there's no way to know | ||
| 4974 | * when it will be off the trampoline. If the trampoline is freed | ||
| 4975 | * too early, when the task runs again, it will be executing on freed | ||
| 4976 | * memory and crash. | ||
| 4977 | */ | ||
| 4978 | #ifdef CONFIG_PREEMPT | ||
| 4979 | /* Currently, only non dynamic ops can have a trampoline */ | ||
| 4980 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC) | ||
| 4981 | return; | ||
| 4982 | #endif | ||
| 4983 | |||
| 4984 | arch_ftrace_update_trampoline(ops); | ||
| 4985 | } | ||
| 4986 | |||
| 4699 | #else | 4987 | #else |
| 4700 | 4988 | ||
| 4701 | static struct ftrace_ops global_ops = { | 4989 | static struct ftrace_ops global_ops = { |
| @@ -4738,6 +5026,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) | |||
| 4738 | return 1; | 5026 | return 1; |
| 4739 | } | 5027 | } |
| 4740 | 5028 | ||
| 5029 | static void ftrace_update_trampoline(struct ftrace_ops *ops) | ||
| 5030 | { | ||
| 5031 | } | ||
| 5032 | |||
| 4741 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 5033 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
| 4742 | 5034 | ||
| 4743 | __init void ftrace_init_global_array_ops(struct trace_array *tr) | 5035 | __init void ftrace_init_global_array_ops(struct trace_array *tr) |
| @@ -5075,12 +5367,12 @@ static int fpid_show(struct seq_file *m, void *v) | |||
| 5075 | const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); | 5367 | const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); |
| 5076 | 5368 | ||
| 5077 | if (v == (void *)1) { | 5369 | if (v == (void *)1) { |
| 5078 | seq_printf(m, "no pid\n"); | 5370 | seq_puts(m, "no pid\n"); |
| 5079 | return 0; | 5371 | return 0; |
| 5080 | } | 5372 | } |
| 5081 | 5373 | ||
| 5082 | if (fpid->pid == ftrace_swapper_pid) | 5374 | if (fpid->pid == ftrace_swapper_pid) |
| 5083 | seq_printf(m, "swapper tasks\n"); | 5375 | seq_puts(m, "swapper tasks\n"); |
| 5084 | else | 5376 | else |
| 5085 | seq_printf(m, "%u\n", pid_vnr(fpid->pid)); | 5377 | seq_printf(m, "%u\n", pid_vnr(fpid->pid)); |
| 5086 | 5378 | ||
| @@ -5293,6 +5585,7 @@ static struct ftrace_ops graph_ops = { | |||
| 5293 | FTRACE_OPS_FL_STUB, | 5585 | FTRACE_OPS_FL_STUB, |
| 5294 | #ifdef FTRACE_GRAPH_TRAMP_ADDR | 5586 | #ifdef FTRACE_GRAPH_TRAMP_ADDR |
| 5295 | .trampoline = FTRACE_GRAPH_TRAMP_ADDR, | 5587 | .trampoline = FTRACE_GRAPH_TRAMP_ADDR, |
| 5588 | /* trampoline_size is only needed for dynamically allocated tramps */ | ||
| 5296 | #endif | 5589 | #endif |
| 5297 | ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) | 5590 | ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) |
| 5298 | }; | 5591 | }; |
| @@ -5522,7 +5815,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
| 5522 | update_function_graph_func(); | 5815 | update_function_graph_func(); |
| 5523 | 5816 | ||
| 5524 | ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); | 5817 | ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); |
| 5525 | |||
| 5526 | out: | 5818 | out: |
| 5527 | mutex_unlock(&ftrace_lock); | 5819 | mutex_unlock(&ftrace_lock); |
| 5528 | return ret; | 5820 | return ret; |
| @@ -5543,6 +5835,17 @@ void unregister_ftrace_graph(void) | |||
| 5543 | unregister_pm_notifier(&ftrace_suspend_notifier); | 5835 | unregister_pm_notifier(&ftrace_suspend_notifier); |
| 5544 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 5836 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
| 5545 | 5837 | ||
| 5838 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
| 5839 | /* | ||
| 5840 | * Function graph does not allocate the trampoline, but | ||
| 5841 | * other global_ops do. We need to reset the ALLOC_TRAMP flag | ||
| 5842 | * if one was used. | ||
| 5843 | */ | ||
| 5844 | global_ops.trampoline = save_global_trampoline; | ||
| 5845 | if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) | ||
| 5846 | global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; | ||
| 5847 | #endif | ||
| 5848 | |||
| 5546 | out: | 5849 | out: |
| 5547 | mutex_unlock(&ftrace_lock); | 5850 | mutex_unlock(&ftrace_lock); |
| 5548 | } | 5851 | } |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a56e07c8d15b..7a4104cb95cb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work); | |||
| 34 | */ | 34 | */ |
| 35 | int ring_buffer_print_entry_header(struct trace_seq *s) | 35 | int ring_buffer_print_entry_header(struct trace_seq *s) |
| 36 | { | 36 | { |
| 37 | int ret; | 37 | trace_seq_puts(s, "# compressed entry header\n"); |
| 38 | 38 | trace_seq_puts(s, "\ttype_len : 5 bits\n"); | |
| 39 | ret = trace_seq_puts(s, "# compressed entry header\n"); | 39 | trace_seq_puts(s, "\ttime_delta : 27 bits\n"); |
| 40 | ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); | 40 | trace_seq_puts(s, "\tarray : 32 bits\n"); |
| 41 | ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); | 41 | trace_seq_putc(s, '\n'); |
| 42 | ret = trace_seq_puts(s, "\tarray : 32 bits\n"); | 42 | trace_seq_printf(s, "\tpadding : type == %d\n", |
| 43 | ret = trace_seq_putc(s, '\n'); | 43 | RINGBUF_TYPE_PADDING); |
| 44 | ret = trace_seq_printf(s, "\tpadding : type == %d\n", | 44 | trace_seq_printf(s, "\ttime_extend : type == %d\n", |
| 45 | RINGBUF_TYPE_PADDING); | 45 | RINGBUF_TYPE_TIME_EXTEND); |
| 46 | ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", | 46 | trace_seq_printf(s, "\tdata max type_len == %d\n", |
| 47 | RINGBUF_TYPE_TIME_EXTEND); | 47 | RINGBUF_TYPE_DATA_TYPE_LEN_MAX); |
| 48 | ret = trace_seq_printf(s, "\tdata max type_len == %d\n", | ||
| 49 | RINGBUF_TYPE_DATA_TYPE_LEN_MAX); | ||
| 50 | 48 | ||
| 51 | return ret; | 49 | return !trace_seq_has_overflowed(s); |
| 52 | } | 50 | } |
| 53 | 51 | ||
| 54 | /* | 52 | /* |
| @@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta) | |||
| 419 | int ring_buffer_print_page_header(struct trace_seq *s) | 417 | int ring_buffer_print_page_header(struct trace_seq *s) |
| 420 | { | 418 | { |
| 421 | struct buffer_data_page field; | 419 | struct buffer_data_page field; |
| 422 | int ret; | ||
| 423 | |||
| 424 | ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" | ||
| 425 | "offset:0;\tsize:%u;\tsigned:%u;\n", | ||
| 426 | (unsigned int)sizeof(field.time_stamp), | ||
| 427 | (unsigned int)is_signed_type(u64)); | ||
| 428 | |||
| 429 | ret = trace_seq_printf(s, "\tfield: local_t commit;\t" | ||
| 430 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 431 | (unsigned int)offsetof(typeof(field), commit), | ||
| 432 | (unsigned int)sizeof(field.commit), | ||
| 433 | (unsigned int)is_signed_type(long)); | ||
| 434 | |||
| 435 | ret = trace_seq_printf(s, "\tfield: int overwrite;\t" | ||
| 436 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 437 | (unsigned int)offsetof(typeof(field), commit), | ||
| 438 | 1, | ||
| 439 | (unsigned int)is_signed_type(long)); | ||
| 440 | |||
| 441 | ret = trace_seq_printf(s, "\tfield: char data;\t" | ||
| 442 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 443 | (unsigned int)offsetof(typeof(field), data), | ||
| 444 | (unsigned int)BUF_PAGE_SIZE, | ||
| 445 | (unsigned int)is_signed_type(char)); | ||
| 446 | 420 | ||
| 447 | return ret; | 421 | trace_seq_printf(s, "\tfield: u64 timestamp;\t" |
| 422 | "offset:0;\tsize:%u;\tsigned:%u;\n", | ||
| 423 | (unsigned int)sizeof(field.time_stamp), | ||
| 424 | (unsigned int)is_signed_type(u64)); | ||
| 425 | |||
| 426 | trace_seq_printf(s, "\tfield: local_t commit;\t" | ||
| 427 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 428 | (unsigned int)offsetof(typeof(field), commit), | ||
| 429 | (unsigned int)sizeof(field.commit), | ||
| 430 | (unsigned int)is_signed_type(long)); | ||
| 431 | |||
| 432 | trace_seq_printf(s, "\tfield: int overwrite;\t" | ||
| 433 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 434 | (unsigned int)offsetof(typeof(field), commit), | ||
| 435 | 1, | ||
| 436 | (unsigned int)is_signed_type(long)); | ||
| 437 | |||
| 438 | trace_seq_printf(s, "\tfield: char data;\t" | ||
| 439 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 440 | (unsigned int)offsetof(typeof(field), data), | ||
| 441 | (unsigned int)BUF_PAGE_SIZE, | ||
| 442 | (unsigned int)is_signed_type(char)); | ||
| 443 | |||
| 444 | return !trace_seq_has_overflowed(s); | ||
| 448 | } | 445 | } |
| 449 | 446 | ||
| 450 | struct rb_irq_work { | 447 | struct rb_irq_work { |
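The two ring_buffer.c helpers above stop checking the return value of every trace_seq_puts()/trace_seq_printf() call and instead report !trace_seq_has_overflowed(s) once at the end. The sketch below is a minimal userspace analogue of that write-now-check-later style; seqbuf and its helpers are made-up names, not the kernel's trace_seq API.

    #include <stdarg.h>
    #include <stdio.h>

    /* Hypothetical analogue of a trace_seq: fixed buffer plus an overflow flag. */
    struct seqbuf {
        char buf[64];
        size_t len;
        int overflowed;
    };

    static void seqbuf_printf(struct seqbuf *s, const char *fmt, ...)
    {
        va_list ap;
        int n;

        if (s->overflowed)
            return;                 /* further writes are ignored */

        va_start(ap, fmt);
        n = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
        va_end(ap);

        if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len)
            s->overflowed = 1;      /* output was truncated */
        else
            s->len += (size_t)n;
    }

    static int seqbuf_has_overflowed(const struct seqbuf *s)
    {
        return s->overflowed;
    }

    static int print_header(struct seqbuf *s)
    {
        /* No per-call checks; test for overflow once at the end. */
        seqbuf_printf(s, "# compressed entry header\n");
        seqbuf_printf(s, "\ttype_len   : 5 bits\n");
        seqbuf_printf(s, "\ttime_delta : 27 bits\n");
        return !seqbuf_has_overflowed(s);
    }

    int main(void)
    {
        struct seqbuf s = { .len = 0 };

        if (print_header(&s))
            printf("%s", s.buf);
        return 0;
    }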
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 92f4a6cee172..4a9079b9f082 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running; | |||
| 63 | */ | 63 | */ |
| 64 | bool __read_mostly tracing_selftest_disabled; | 64 | bool __read_mostly tracing_selftest_disabled; |
| 65 | 65 | ||
| 66 | /* Pipe tracepoints to printk */ | ||
| 67 | struct trace_iterator *tracepoint_print_iter; | ||
| 68 | int tracepoint_printk; | ||
| 69 | |||
| 66 | /* For tracers that don't implement custom flags */ | 70 | /* For tracers that don't implement custom flags */ |
| 67 | static struct tracer_opt dummy_tracer_opt[] = { | 71 | static struct tracer_opt dummy_tracer_opt[] = { |
| 68 | { } | 72 | { } |
| @@ -155,10 +159,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); | |||
| 155 | 159 | ||
| 156 | static int __init stop_trace_on_warning(char *str) | 160 | static int __init stop_trace_on_warning(char *str) |
| 157 | { | 161 | { |
| 158 | __disable_trace_on_warning = 1; | 162 | if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) |
| 163 | __disable_trace_on_warning = 1; | ||
| 159 | return 1; | 164 | return 1; |
| 160 | } | 165 | } |
| 161 | __setup("traceoff_on_warning=", stop_trace_on_warning); | 166 | __setup("traceoff_on_warning", stop_trace_on_warning); |
| 162 | 167 | ||
| 163 | static int __init boot_alloc_snapshot(char *str) | 168 | static int __init boot_alloc_snapshot(char *str) |
| 164 | { | 169 | { |
| @@ -192,6 +197,13 @@ static int __init set_trace_boot_clock(char *str) | |||
| 192 | } | 197 | } |
| 193 | __setup("trace_clock=", set_trace_boot_clock); | 198 | __setup("trace_clock=", set_trace_boot_clock); |
| 194 | 199 | ||
| 200 | static int __init set_tracepoint_printk(char *str) | ||
| 201 | { | ||
| 202 | if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) | ||
| 203 | tracepoint_printk = 1; | ||
| 204 | return 1; | ||
| 205 | } | ||
| 206 | __setup("tp_printk", set_tracepoint_printk); | ||
| 195 | 207 | ||
| 196 | unsigned long long ns2usecs(cycle_t nsec) | 208 | unsigned long long ns2usecs(cycle_t nsec) |
| 197 | { | 209 | { |
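Both boot-parameter hunks above register the option without a trailing '=' ("traceoff_on_warning", "tp_printk"), so the handler can now be called with an empty string, "=0", "=off" or "=1", and only "=0"/"=off" must be treated as off. A self-contained sketch of that check follows; parse_bool_param() is a hypothetical helper, not a kernel function.

    #include <stdio.h>
    #include <string.h>

    /*
     * Mirror of the check used by the handlers above: anything other than
     * "=0" or "=off" (including the bare parameter, i.e. an empty string)
     * turns the feature on.
     */
    static int parse_bool_param(const char *str)
    {
        return strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0;
    }

    int main(void)
    {
        const char *cases[] = { "", "=1", "=0", "=off" };
        size_t i;

        for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
            printf("tp_printk%-4s -> %s\n", cases[i],
                   parse_bool_param(cases[i]) ? "enabled" : "disabled");
        return 0;
    }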
| @@ -938,19 +950,20 @@ out: | |||
| 938 | return ret; | 950 | return ret; |
| 939 | } | 951 | } |
| 940 | 952 | ||
| 953 | /* TODO add a seq_buf_to_buffer() */ | ||
| 941 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | 954 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) |
| 942 | { | 955 | { |
| 943 | int len; | 956 | int len; |
| 944 | 957 | ||
| 945 | if (s->len <= s->readpos) | 958 | if (trace_seq_used(s) <= s->seq.readpos) |
| 946 | return -EBUSY; | 959 | return -EBUSY; |
| 947 | 960 | ||
| 948 | len = s->len - s->readpos; | 961 | len = trace_seq_used(s) - s->seq.readpos; |
| 949 | if (cnt > len) | 962 | if (cnt > len) |
| 950 | cnt = len; | 963 | cnt = len; |
| 951 | memcpy(buf, s->buffer + s->readpos, cnt); | 964 | memcpy(buf, s->buffer + s->seq.readpos, cnt); |
| 952 | 965 | ||
| 953 | s->readpos += cnt; | 966 | s->seq.readpos += cnt; |
| 954 | return cnt; | 967 | return cnt; |
| 955 | } | 968 | } |
| 956 | 969 | ||
| @@ -2029,7 +2042,7 @@ void trace_printk_init_buffers(void) | |||
| 2029 | pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); | 2042 | pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); |
| 2030 | pr_warning("** **\n"); | 2043 | pr_warning("** **\n"); |
| 2031 | pr_warning("** This means that this is a DEBUG kernel and it is **\n"); | 2044 | pr_warning("** This means that this is a DEBUG kernel and it is **\n"); |
| 2032 | pr_warning("** unsafe for produciton use. **\n"); | 2045 | pr_warning("** unsafe for production use. **\n"); |
| 2033 | pr_warning("** **\n"); | 2046 | pr_warning("** **\n"); |
| 2034 | pr_warning("** If you see this message and you are not debugging **\n"); | 2047 | pr_warning("** If you see this message and you are not debugging **\n"); |
| 2035 | pr_warning("** the kernel, report this immediately to your vendor! **\n"); | 2048 | pr_warning("** the kernel, report this immediately to your vendor! **\n"); |
| @@ -2158,9 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, | |||
| 2158 | goto out; | 2171 | goto out; |
| 2159 | } | 2172 | } |
| 2160 | 2173 | ||
| 2161 | len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); | 2174 | len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); |
| 2162 | if (len > TRACE_BUF_SIZE) | ||
| 2163 | goto out; | ||
| 2164 | 2175 | ||
| 2165 | local_save_flags(flags); | 2176 | local_save_flags(flags); |
| 2166 | size = sizeof(*entry) + len + 1; | 2177 | size = sizeof(*entry) + len + 1; |
| @@ -2171,8 +2182,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, | |||
| 2171 | entry = ring_buffer_event_data(event); | 2182 | entry = ring_buffer_event_data(event); |
| 2172 | entry->ip = ip; | 2183 | entry->ip = ip; |
| 2173 | 2184 | ||
| 2174 | memcpy(&entry->buf, tbuffer, len); | 2185 | memcpy(&entry->buf, tbuffer, len + 1); |
| 2175 | entry->buf[len] = '\0'; | ||
| 2176 | if (!call_filter_check_discard(call, entry, buffer, event)) { | 2186 | if (!call_filter_check_discard(call, entry, buffer, event)) { |
| 2177 | __buffer_unlock_commit(buffer, event); | 2187 | __buffer_unlock_commit(buffer, event); |
| 2178 | ftrace_trace_stack(buffer, flags, 6, pc); | 2188 | ftrace_trace_stack(buffer, flags, 6, pc); |
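The vsnprintf() to vscnprintf() switch above removes the old length check because vsnprintf() returns the length the output would have had (possibly larger than the buffer), while vscnprintf() returns the number of bytes actually stored, so copying "len + 1" including the NUL is always within bounds. The userspace sketch below fakes an scnprintf-style wrapper to show the difference; scnprintf() is not part of libc, so my_scnprintf() here is purely illustrative.

    #include <stdarg.h>
    #include <stdio.h>

    /* Illustrative stand-in for the kernel's scnprintf(): clamp the return
     * value to the number of characters actually stored in the buffer. */
    static int my_scnprintf(char *buf, size_t size, const char *fmt, ...)
    {
        va_list ap;
        int n;

        va_start(ap, fmt);
        n = vsnprintf(buf, size, fmt, ap);
        va_end(ap);

        if (n < 0)
            return 0;
        if ((size_t)n >= size)
            return (int)size - 1;   /* truncated: only size - 1 chars fit */
        return n;
    }

    int main(void)
    {
        char buf[8];
        int would_be = snprintf(buf, sizeof(buf), "0123456789");
        int stored   = my_scnprintf(buf, sizeof(buf), "0123456789");

        /* prints "10 vs 7" for an 8-byte buffer */
        printf("%d vs %d\n", would_be, stored);
        return 0;
    }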
| @@ -2509,14 +2519,14 @@ get_total_entries(struct trace_buffer *buf, | |||
| 2509 | 2519 | ||
| 2510 | static void print_lat_help_header(struct seq_file *m) | 2520 | static void print_lat_help_header(struct seq_file *m) |
| 2511 | { | 2521 | { |
| 2512 | seq_puts(m, "# _------=> CPU# \n"); | 2522 | seq_puts(m, "# _------=> CPU# \n" |
| 2513 | seq_puts(m, "# / _-----=> irqs-off \n"); | 2523 | "# / _-----=> irqs-off \n" |
| 2514 | seq_puts(m, "# | / _----=> need-resched \n"); | 2524 | "# | / _----=> need-resched \n" |
| 2515 | seq_puts(m, "# || / _---=> hardirq/softirq \n"); | 2525 | "# || / _---=> hardirq/softirq \n" |
| 2516 | seq_puts(m, "# ||| / _--=> preempt-depth \n"); | 2526 | "# ||| / _--=> preempt-depth \n" |
| 2517 | seq_puts(m, "# |||| / delay \n"); | 2527 | "# |||| / delay \n" |
| 2518 | seq_puts(m, "# cmd pid ||||| time | caller \n"); | 2528 | "# cmd pid ||||| time | caller \n" |
| 2519 | seq_puts(m, "# \\ / ||||| \\ | / \n"); | 2529 | "# \\ / ||||| \\ | / \n"); |
| 2520 | } | 2530 | } |
| 2521 | 2531 | ||
| 2522 | static void print_event_info(struct trace_buffer *buf, struct seq_file *m) | 2532 | static void print_event_info(struct trace_buffer *buf, struct seq_file *m) |
| @@ -2533,20 +2543,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) | |||
| 2533 | static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) | 2543 | static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) |
| 2534 | { | 2544 | { |
| 2535 | print_event_info(buf, m); | 2545 | print_event_info(buf, m); |
| 2536 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); | 2546 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" |
| 2537 | seq_puts(m, "# | | | | |\n"); | 2547 | "# | | | | |\n"); |
| 2538 | } | 2548 | } |
| 2539 | 2549 | ||
| 2540 | static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) | 2550 | static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) |
| 2541 | { | 2551 | { |
| 2542 | print_event_info(buf, m); | 2552 | print_event_info(buf, m); |
| 2543 | seq_puts(m, "# _-----=> irqs-off\n"); | 2553 | seq_puts(m, "# _-----=> irqs-off\n" |
| 2544 | seq_puts(m, "# / _----=> need-resched\n"); | 2554 | "# / _----=> need-resched\n" |
| 2545 | seq_puts(m, "# | / _---=> hardirq/softirq\n"); | 2555 | "# | / _---=> hardirq/softirq\n" |
| 2546 | seq_puts(m, "# || / _--=> preempt-depth\n"); | 2556 | "# || / _--=> preempt-depth\n" |
| 2547 | seq_puts(m, "# ||| / delay\n"); | 2557 | "# ||| / delay\n" |
| 2548 | seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); | 2558 | "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" |
| 2549 | seq_puts(m, "# | | | |||| | |\n"); | 2559 | "# | | | |||| | |\n"); |
| 2550 | } | 2560 | } |
| 2551 | 2561 | ||
| 2552 | void | 2562 | void |
| @@ -2649,24 +2659,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) | |||
| 2649 | event = ftrace_find_event(entry->type); | 2659 | event = ftrace_find_event(entry->type); |
| 2650 | 2660 | ||
| 2651 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { | 2661 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { |
| 2652 | if (iter->iter_flags & TRACE_FILE_LAT_FMT) { | 2662 | if (iter->iter_flags & TRACE_FILE_LAT_FMT) |
| 2653 | if (!trace_print_lat_context(iter)) | 2663 | trace_print_lat_context(iter); |
| 2654 | goto partial; | 2664 | else |
| 2655 | } else { | 2665 | trace_print_context(iter); |
| 2656 | if (!trace_print_context(iter)) | ||
| 2657 | goto partial; | ||
| 2658 | } | ||
| 2659 | } | 2666 | } |
| 2660 | 2667 | ||
| 2668 | if (trace_seq_has_overflowed(s)) | ||
| 2669 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2670 | |||
| 2661 | if (event) | 2671 | if (event) |
| 2662 | return event->funcs->trace(iter, sym_flags, event); | 2672 | return event->funcs->trace(iter, sym_flags, event); |
| 2663 | 2673 | ||
| 2664 | if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) | 2674 | trace_seq_printf(s, "Unknown type %d\n", entry->type); |
| 2665 | goto partial; | ||
| 2666 | 2675 | ||
| 2667 | return TRACE_TYPE_HANDLED; | 2676 | return trace_handle_return(s); |
| 2668 | partial: | ||
| 2669 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2670 | } | 2677 | } |
| 2671 | 2678 | ||
| 2672 | static enum print_line_t print_raw_fmt(struct trace_iterator *iter) | 2679 | static enum print_line_t print_raw_fmt(struct trace_iterator *iter) |
| @@ -2677,22 +2684,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) | |||
| 2677 | 2684 | ||
| 2678 | entry = iter->ent; | 2685 | entry = iter->ent; |
| 2679 | 2686 | ||
| 2680 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { | 2687 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) |
| 2681 | if (!trace_seq_printf(s, "%d %d %llu ", | 2688 | trace_seq_printf(s, "%d %d %llu ", |
| 2682 | entry->pid, iter->cpu, iter->ts)) | 2689 | entry->pid, iter->cpu, iter->ts); |
| 2683 | goto partial; | 2690 | |
| 2684 | } | 2691 | if (trace_seq_has_overflowed(s)) |
| 2692 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2685 | 2693 | ||
| 2686 | event = ftrace_find_event(entry->type); | 2694 | event = ftrace_find_event(entry->type); |
| 2687 | if (event) | 2695 | if (event) |
| 2688 | return event->funcs->raw(iter, 0, event); | 2696 | return event->funcs->raw(iter, 0, event); |
| 2689 | 2697 | ||
| 2690 | if (!trace_seq_printf(s, "%d ?\n", entry->type)) | 2698 | trace_seq_printf(s, "%d ?\n", entry->type); |
| 2691 | goto partial; | ||
| 2692 | 2699 | ||
| 2693 | return TRACE_TYPE_HANDLED; | 2700 | return trace_handle_return(s); |
| 2694 | partial: | ||
| 2695 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2696 | } | 2701 | } |
| 2697 | 2702 | ||
| 2698 | static enum print_line_t print_hex_fmt(struct trace_iterator *iter) | 2703 | static enum print_line_t print_hex_fmt(struct trace_iterator *iter) |
| @@ -2705,9 +2710,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) | |||
| 2705 | entry = iter->ent; | 2710 | entry = iter->ent; |
| 2706 | 2711 | ||
| 2707 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { | 2712 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { |
| 2708 | SEQ_PUT_HEX_FIELD_RET(s, entry->pid); | 2713 | SEQ_PUT_HEX_FIELD(s, entry->pid); |
| 2709 | SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); | 2714 | SEQ_PUT_HEX_FIELD(s, iter->cpu); |
| 2710 | SEQ_PUT_HEX_FIELD_RET(s, iter->ts); | 2715 | SEQ_PUT_HEX_FIELD(s, iter->ts); |
| 2716 | if (trace_seq_has_overflowed(s)) | ||
| 2717 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2711 | } | 2718 | } |
| 2712 | 2719 | ||
| 2713 | event = ftrace_find_event(entry->type); | 2720 | event = ftrace_find_event(entry->type); |
| @@ -2717,9 +2724,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) | |||
| 2717 | return ret; | 2724 | return ret; |
| 2718 | } | 2725 | } |
| 2719 | 2726 | ||
| 2720 | SEQ_PUT_FIELD_RET(s, newline); | 2727 | SEQ_PUT_FIELD(s, newline); |
| 2721 | 2728 | ||
| 2722 | return TRACE_TYPE_HANDLED; | 2729 | return trace_handle_return(s); |
| 2723 | } | 2730 | } |
| 2724 | 2731 | ||
| 2725 | static enum print_line_t print_bin_fmt(struct trace_iterator *iter) | 2732 | static enum print_line_t print_bin_fmt(struct trace_iterator *iter) |
| @@ -2731,9 +2738,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) | |||
| 2731 | entry = iter->ent; | 2738 | entry = iter->ent; |
| 2732 | 2739 | ||
| 2733 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { | 2740 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { |
| 2734 | SEQ_PUT_FIELD_RET(s, entry->pid); | 2741 | SEQ_PUT_FIELD(s, entry->pid); |
| 2735 | SEQ_PUT_FIELD_RET(s, iter->cpu); | 2742 | SEQ_PUT_FIELD(s, iter->cpu); |
| 2736 | SEQ_PUT_FIELD_RET(s, iter->ts); | 2743 | SEQ_PUT_FIELD(s, iter->ts); |
| 2744 | if (trace_seq_has_overflowed(s)) | ||
| 2745 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2737 | } | 2746 | } |
| 2738 | 2747 | ||
| 2739 | event = ftrace_find_event(entry->type); | 2748 | event = ftrace_find_event(entry->type); |
| @@ -2779,10 +2788,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) | |||
| 2779 | { | 2788 | { |
| 2780 | enum print_line_t ret; | 2789 | enum print_line_t ret; |
| 2781 | 2790 | ||
| 2782 | if (iter->lost_events && | 2791 | if (iter->lost_events) { |
| 2783 | !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", | 2792 | trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", |
| 2784 | iter->cpu, iter->lost_events)) | 2793 | iter->cpu, iter->lost_events); |
| 2785 | return TRACE_TYPE_PARTIAL_LINE; | 2794 | if (trace_seq_has_overflowed(&iter->seq)) |
| 2795 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2796 | } | ||
| 2786 | 2797 | ||
| 2787 | if (iter->trace && iter->trace->print_line) { | 2798 | if (iter->trace && iter->trace->print_line) { |
| 2788 | ret = iter->trace->print_line(iter); | 2799 | ret = iter->trace->print_line(iter); |
| @@ -2860,44 +2871,44 @@ static void test_ftrace_alive(struct seq_file *m) | |||
| 2860 | { | 2871 | { |
| 2861 | if (!ftrace_is_dead()) | 2872 | if (!ftrace_is_dead()) |
| 2862 | return; | 2873 | return; |
| 2863 | seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); | 2874 | seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n" |
| 2864 | seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); | 2875 | "# MAY BE MISSING FUNCTION EVENTS\n"); |
| 2865 | } | 2876 | } |
| 2866 | 2877 | ||
| 2867 | #ifdef CONFIG_TRACER_MAX_TRACE | 2878 | #ifdef CONFIG_TRACER_MAX_TRACE |
| 2868 | static void show_snapshot_main_help(struct seq_file *m) | 2879 | static void show_snapshot_main_help(struct seq_file *m) |
| 2869 | { | 2880 | { |
| 2870 | seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); | 2881 | seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" |
| 2871 | seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); | 2882 | "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" |
| 2872 | seq_printf(m, "# Takes a snapshot of the main buffer.\n"); | 2883 | "# Takes a snapshot of the main buffer.\n" |
| 2873 | seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); | 2884 | "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" |
| 2874 | seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); | 2885 | "# (Doesn't have to be '2' works with any number that\n" |
| 2875 | seq_printf(m, "# is not a '0' or '1')\n"); | 2886 | "# is not a '0' or '1')\n"); |
| 2876 | } | 2887 | } |
| 2877 | 2888 | ||
| 2878 | static void show_snapshot_percpu_help(struct seq_file *m) | 2889 | static void show_snapshot_percpu_help(struct seq_file *m) |
| 2879 | { | 2890 | { |
| 2880 | seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); | 2891 | seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); |
| 2881 | #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP | 2892 | #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP |
| 2882 | seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); | 2893 | seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" |
| 2883 | seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); | 2894 | "# Takes a snapshot of the main buffer for this cpu.\n"); |
| 2884 | #else | 2895 | #else |
| 2885 | seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); | 2896 | seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" |
| 2886 | seq_printf(m, "# Must use main snapshot file to allocate.\n"); | 2897 | "# Must use main snapshot file to allocate.\n"); |
| 2887 | #endif | 2898 | #endif |
| 2888 | seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); | 2899 | seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" |
| 2889 | seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); | 2900 | "# (Doesn't have to be '2' works with any number that\n" |
| 2890 | seq_printf(m, "# is not a '0' or '1')\n"); | 2901 | "# is not a '0' or '1')\n"); |
| 2891 | } | 2902 | } |
| 2892 | 2903 | ||
| 2893 | static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) | 2904 | static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) |
| 2894 | { | 2905 | { |
| 2895 | if (iter->tr->allocated_snapshot) | 2906 | if (iter->tr->allocated_snapshot) |
| 2896 | seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); | 2907 | seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); |
| 2897 | else | 2908 | else |
| 2898 | seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); | 2909 | seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); |
| 2899 | 2910 | ||
| 2900 | seq_printf(m, "# Snapshot commands:\n"); | 2911 | seq_puts(m, "# Snapshot commands:\n"); |
| 2901 | if (iter->cpu_file == RING_BUFFER_ALL_CPUS) | 2912 | if (iter->cpu_file == RING_BUFFER_ALL_CPUS) |
| 2902 | show_snapshot_main_help(m); | 2913 | show_snapshot_main_help(m); |
| 2903 | else | 2914 | else |
| @@ -3251,7 +3262,7 @@ static int t_show(struct seq_file *m, void *v) | |||
| 3251 | if (!t) | 3262 | if (!t) |
| 3252 | return 0; | 3263 | return 0; |
| 3253 | 3264 | ||
| 3254 | seq_printf(m, "%s", t->name); | 3265 | seq_puts(m, t->name); |
| 3255 | if (t->next) | 3266 | if (t->next) |
| 3256 | seq_putc(m, ' '); | 3267 | seq_putc(m, ' '); |
| 3257 | else | 3268 | else |
| @@ -4314,6 +4325,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
| 4314 | goto out; | 4325 | goto out; |
| 4315 | } | 4326 | } |
| 4316 | 4327 | ||
| 4328 | trace_seq_init(&iter->seq); | ||
| 4329 | |||
| 4317 | /* | 4330 | /* |
| 4318 | * We make a copy of the current tracer to avoid concurrent | 4331 | * We make a copy of the current tracer to avoid concurrent |
| 4319 | * changes on it while we are reading. | 4332 | * changes on it while we are reading. |
| @@ -4507,18 +4520,18 @@ waitagain: | |||
| 4507 | trace_access_lock(iter->cpu_file); | 4520 | trace_access_lock(iter->cpu_file); |
| 4508 | while (trace_find_next_entry_inc(iter) != NULL) { | 4521 | while (trace_find_next_entry_inc(iter) != NULL) { |
| 4509 | enum print_line_t ret; | 4522 | enum print_line_t ret; |
| 4510 | int len = iter->seq.len; | 4523 | int save_len = iter->seq.seq.len; |
| 4511 | 4524 | ||
| 4512 | ret = print_trace_line(iter); | 4525 | ret = print_trace_line(iter); |
| 4513 | if (ret == TRACE_TYPE_PARTIAL_LINE) { | 4526 | if (ret == TRACE_TYPE_PARTIAL_LINE) { |
| 4514 | /* don't print partial lines */ | 4527 | /* don't print partial lines */ |
| 4515 | iter->seq.len = len; | 4528 | iter->seq.seq.len = save_len; |
| 4516 | break; | 4529 | break; |
| 4517 | } | 4530 | } |
| 4518 | if (ret != TRACE_TYPE_NO_CONSUME) | 4531 | if (ret != TRACE_TYPE_NO_CONSUME) |
| 4519 | trace_consume(iter); | 4532 | trace_consume(iter); |
| 4520 | 4533 | ||
| 4521 | if (iter->seq.len >= cnt) | 4534 | if (trace_seq_used(&iter->seq) >= cnt) |
| 4522 | break; | 4535 | break; |
| 4523 | 4536 | ||
| 4524 | /* | 4537 | /* |
| @@ -4534,7 +4547,7 @@ waitagain: | |||
| 4534 | 4547 | ||
| 4535 | /* Now copy what we have to the user */ | 4548 | /* Now copy what we have to the user */ |
| 4536 | sret = trace_seq_to_user(&iter->seq, ubuf, cnt); | 4549 | sret = trace_seq_to_user(&iter->seq, ubuf, cnt); |
| 4537 | if (iter->seq.readpos >= iter->seq.len) | 4550 | if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) |
| 4538 | trace_seq_init(&iter->seq); | 4551 | trace_seq_init(&iter->seq); |
| 4539 | 4552 | ||
| 4540 | /* | 4553 | /* |
| @@ -4568,20 +4581,33 @@ static size_t | |||
| 4568 | tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) | 4581 | tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) |
| 4569 | { | 4582 | { |
| 4570 | size_t count; | 4583 | size_t count; |
| 4584 | int save_len; | ||
| 4571 | int ret; | 4585 | int ret; |
| 4572 | 4586 | ||
| 4573 | /* Seq buffer is page-sized, exactly what we need. */ | 4587 | /* Seq buffer is page-sized, exactly what we need. */ |
| 4574 | for (;;) { | 4588 | for (;;) { |
| 4575 | count = iter->seq.len; | 4589 | save_len = iter->seq.seq.len; |
| 4576 | ret = print_trace_line(iter); | 4590 | ret = print_trace_line(iter); |
| 4577 | count = iter->seq.len - count; | 4591 | |
| 4578 | if (rem < count) { | 4592 | if (trace_seq_has_overflowed(&iter->seq)) { |
| 4579 | rem = 0; | 4593 | iter->seq.seq.len = save_len; |
| 4580 | iter->seq.len -= count; | ||
| 4581 | break; | 4594 | break; |
| 4582 | } | 4595 | } |
| 4596 | |||
| 4597 | /* | ||
| 4598 | * This should not be hit, because it should only | ||
| 4599 | * be set if the iter->seq overflowed. But check it | ||
| 4600 | * anyway to be safe. | ||
| 4601 | */ | ||
| 4583 | if (ret == TRACE_TYPE_PARTIAL_LINE) { | 4602 | if (ret == TRACE_TYPE_PARTIAL_LINE) { |
| 4584 | iter->seq.len -= count; | 4603 | iter->seq.seq.len = save_len; |
| 4604 | break; | ||
| 4605 | } | ||
| 4606 | |||
| 4607 | count = trace_seq_used(&iter->seq) - save_len; | ||
| 4608 | if (rem < count) { | ||
| 4609 | rem = 0; | ||
| 4610 | iter->seq.seq.len = save_len; | ||
| 4585 | break; | 4611 | break; |
| 4586 | } | 4612 | } |
| 4587 | 4613 | ||
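tracing_fill_pipe_page() above now records the sequence length before rendering each line and restores it when the line overflows, so a partially printed line is never handed to the splice path. A stripped-down sketch of that save-and-roll-back pattern is below, using a hypothetical seqbuf rather than the kernel's trace_seq.

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical fixed-size output buffer with an overflow flag. */
    struct seqbuf {
        char buf[32];
        size_t len;
        int overflowed;
    };

    static void seqbuf_puts(struct seqbuf *s, const char *str)
    {
        size_t room = sizeof(s->buf) - 1 - s->len;
        size_t n = strlen(str);

        if (n > room) {
            n = room;               /* write what fits, then flag overflow */
            s->overflowed = 1;
        }
        memcpy(s->buf + s->len, str, n);
        s->len += n;
        s->buf[s->len] = '\0';
    }

    int main(void)
    {
        static const char *lines[] = { "line one\n", "line two\n",
                                       "a very long trailing line\n" };
        struct seqbuf s = { .len = 0 };
        size_t i;

        for (i = 0; i < sizeof(lines) / sizeof(lines[0]); i++) {
            size_t save_len = s.len;        /* snapshot before printing */

            seqbuf_puts(&s, lines[i]);
            if (s.overflowed) {
                s.len = save_len;           /* drop the partial line */
                s.buf[s.len] = '\0';
                break;
            }
        }
        printf("%s", s.buf);                /* only whole lines are emitted */
        return 0;
    }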
| @@ -4662,13 +4688,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
| 4662 | /* Copy the data into the page, so we can start over. */ | 4688 | /* Copy the data into the page, so we can start over. */ |
| 4663 | ret = trace_seq_to_buffer(&iter->seq, | 4689 | ret = trace_seq_to_buffer(&iter->seq, |
| 4664 | page_address(spd.pages[i]), | 4690 | page_address(spd.pages[i]), |
| 4665 | iter->seq.len); | 4691 | trace_seq_used(&iter->seq)); |
| 4666 | if (ret < 0) { | 4692 | if (ret < 0) { |
| 4667 | __free_page(spd.pages[i]); | 4693 | __free_page(spd.pages[i]); |
| 4668 | break; | 4694 | break; |
| 4669 | } | 4695 | } |
| 4670 | spd.partial[i].offset = 0; | 4696 | spd.partial[i].offset = 0; |
| 4671 | spd.partial[i].len = iter->seq.len; | 4697 | spd.partial[i].len = trace_seq_used(&iter->seq); |
| 4672 | 4698 | ||
| 4673 | trace_seq_init(&iter->seq); | 4699 | trace_seq_init(&iter->seq); |
| 4674 | } | 4700 | } |
| @@ -5668,7 +5694,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
| 5668 | cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); | 5694 | cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); |
| 5669 | trace_seq_printf(s, "read events: %ld\n", cnt); | 5695 | trace_seq_printf(s, "read events: %ld\n", cnt); |
| 5670 | 5696 | ||
| 5671 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | 5697 | count = simple_read_from_buffer(ubuf, count, ppos, |
| 5698 | s->buffer, trace_seq_used(s)); | ||
| 5672 | 5699 | ||
| 5673 | kfree(s); | 5700 | kfree(s); |
| 5674 | 5701 | ||
| @@ -5749,10 +5776,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip, | |||
| 5749 | 5776 | ||
| 5750 | seq_printf(m, "%ps:", (void *)ip); | 5777 | seq_printf(m, "%ps:", (void *)ip); |
| 5751 | 5778 | ||
| 5752 | seq_printf(m, "snapshot"); | 5779 | seq_puts(m, "snapshot"); |
| 5753 | 5780 | ||
| 5754 | if (count == -1) | 5781 | if (count == -1) |
| 5755 | seq_printf(m, ":unlimited\n"); | 5782 | seq_puts(m, ":unlimited\n"); |
| 5756 | else | 5783 | else |
| 5757 | seq_printf(m, ":count=%ld\n", count); | 5784 | seq_printf(m, ":count=%ld\n", count); |
| 5758 | 5785 | ||
| @@ -6417,7 +6444,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m | |||
| 6417 | int ret; | 6444 | int ret; |
| 6418 | 6445 | ||
| 6419 | /* Paranoid: Make sure the parent is the "instances" directory */ | 6446 | /* Paranoid: Make sure the parent is the "instances" directory */ |
| 6420 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); | 6447 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); |
| 6421 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | 6448 | if (WARN_ON_ONCE(parent != trace_instance_dir)) |
| 6422 | return -ENOENT; | 6449 | return -ENOENT; |
| 6423 | 6450 | ||
| @@ -6444,7 +6471,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry) | |||
| 6444 | int ret; | 6471 | int ret; |
| 6445 | 6472 | ||
| 6446 | /* Paranoid: Make sure the parent is the "instances" directory */ | 6473 | /* Paranoid: Make sure the parent is the "instances" directory */ |
| 6447 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); | 6474 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); |
| 6448 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | 6475 | if (WARN_ON_ONCE(parent != trace_instance_dir)) |
| 6449 | return -ENOENT; | 6476 | return -ENOENT; |
| 6450 | 6477 | ||
| @@ -6631,11 +6658,19 @@ void | |||
| 6631 | trace_printk_seq(struct trace_seq *s) | 6658 | trace_printk_seq(struct trace_seq *s) |
| 6632 | { | 6659 | { |
| 6633 | /* Probably should print a warning here. */ | 6660 | /* Probably should print a warning here. */ |
| 6634 | if (s->len >= TRACE_MAX_PRINT) | 6661 | if (s->seq.len >= TRACE_MAX_PRINT) |
| 6635 | s->len = TRACE_MAX_PRINT; | 6662 | s->seq.len = TRACE_MAX_PRINT; |
| 6663 | |||
| 6664 | /* | ||
| 6665 | * More paranoid code. Although the buffer size is set to | ||
| 6666 | * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just | ||
| 6667 | * an extra layer of protection. | ||
| 6668 | */ | ||
| 6669 | if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) | ||
| 6670 | s->seq.len = s->seq.size - 1; | ||
| 6636 | 6671 | ||
| 6637 | /* should be zero ended, but we are paranoid. */ | 6672 | /* should be zero ended, but we are paranoid. */ |
| 6638 | s->buffer[s->len] = 0; | 6673 | s->buffer[s->seq.len] = 0; |
| 6639 | 6674 | ||
| 6640 | printk(KERN_TRACE "%s", s->buffer); | 6675 | printk(KERN_TRACE "%s", s->buffer); |
| 6641 | 6676 | ||
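trace_printk_seq() above clamps the length twice: once to TRACE_MAX_PRINT and, as an extra layer of protection, to one byte less than the buffer size before writing the terminating NUL. The short sketch below shows that defensive clamp on a plain buffer; MAX_PRINT and print_clamped() are illustrative names only.

    #include <stdio.h>
    #include <string.h>

    #define MAX_PRINT 16                    /* stand-in for TRACE_MAX_PRINT */

    /* Print at most MAX_PRINT bytes of a possibly unterminated buffer,
     * never trusting the supplied length to be within bounds. */
    static void print_clamped(char *buf, size_t size, size_t len)
    {
        if (len >= MAX_PRINT)
            len = MAX_PRINT;
        if (len >= size)                    /* extra guard against corruption */
            len = size - 1;
        buf[len] = '\0';
        printf("%s\n", buf);
    }

    int main(void)
    {
        char buf[32];

        memset(buf, 'x', sizeof(buf));          /* deliberately unterminated */
        print_clamped(buf, sizeof(buf), 1000);  /* bogus length gets clamped */
        return 0;
    }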
| @@ -6874,6 +6909,18 @@ out: | |||
| 6874 | return ret; | 6909 | return ret; |
| 6875 | } | 6910 | } |
| 6876 | 6911 | ||
| 6912 | void __init trace_init(void) | ||
| 6913 | { | ||
| 6914 | if (tracepoint_printk) { | ||
| 6915 | tracepoint_print_iter = | ||
| 6916 | kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); | ||
| 6917 | if (WARN_ON(!tracepoint_print_iter)) | ||
| 6918 | tracepoint_printk = 0; | ||
| 6919 | } | ||
| 6920 | tracer_alloc_buffers(); | ||
| 6921 | trace_event_init(); | ||
| 6922 | } | ||
| 6923 | |||
| 6877 | __init static int clear_boot_tracer(void) | 6924 | __init static int clear_boot_tracer(void) |
| 6878 | { | 6925 | { |
| 6879 | /* | 6926 | /* |
| @@ -6893,6 +6940,5 @@ __init static int clear_boot_tracer(void) | |||
| 6893 | return 0; | 6940 | return 0; |
| 6894 | } | 6941 | } |
| 6895 | 6942 | ||
| 6896 | early_initcall(tracer_alloc_buffers); | ||
| 6897 | fs_initcall(tracer_init_debugfs); | 6943 | fs_initcall(tracer_init_debugfs); |
| 6898 | late_initcall(clear_boot_tracer); | 6944 | late_initcall(clear_boot_tracer); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 385391fb1d3b..8de48bac1ce2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/trace_seq.h> | 14 | #include <linux/trace_seq.h> |
| 15 | #include <linux/ftrace_event.h> | 15 | #include <linux/ftrace_event.h> |
| 16 | #include <linux/compiler.h> | 16 | #include <linux/compiler.h> |
| 17 | #include <linux/trace_seq.h> | ||
| 17 | 18 | ||
| 18 | #ifdef CONFIG_FTRACE_SYSCALLS | 19 | #ifdef CONFIG_FTRACE_SYSCALLS |
| 19 | #include <asm/unistd.h> /* For NR_SYSCALLS */ | 20 | #include <asm/unistd.h> /* For NR_SYSCALLS */ |
| @@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter); | |||
| 569 | 570 | ||
| 570 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); | 571 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); |
| 571 | 572 | ||
| 572 | void tracing_sched_switch_trace(struct trace_array *tr, | ||
| 573 | struct task_struct *prev, | ||
| 574 | struct task_struct *next, | ||
| 575 | unsigned long flags, int pc); | ||
| 576 | |||
| 577 | void tracing_sched_wakeup_trace(struct trace_array *tr, | ||
| 578 | struct task_struct *wakee, | ||
| 579 | struct task_struct *cur, | ||
| 580 | unsigned long flags, int pc); | ||
| 581 | void trace_function(struct trace_array *tr, | 573 | void trace_function(struct trace_array *tr, |
| 582 | unsigned long ip, | 574 | unsigned long ip, |
| 583 | unsigned long parent_ip, | 575 | unsigned long parent_ip, |
| @@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr); | |||
| 597 | 589 | ||
| 598 | void tracing_start_cmdline_record(void); | 590 | void tracing_start_cmdline_record(void); |
| 599 | void tracing_stop_cmdline_record(void); | 591 | void tracing_stop_cmdline_record(void); |
| 600 | void tracing_sched_switch_assign_trace(struct trace_array *tr); | ||
| 601 | void tracing_stop_sched_switch_record(void); | ||
| 602 | void tracing_start_sched_switch_record(void); | ||
| 603 | int register_tracer(struct tracer *type); | 592 | int register_tracer(struct tracer *type); |
| 604 | int is_tracing_stopped(void); | 593 | int is_tracing_stopped(void); |
| 605 | 594 | ||
| @@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); | |||
| 719 | 708 | ||
| 720 | extern unsigned long trace_flags; | 709 | extern unsigned long trace_flags; |
| 721 | 710 | ||
| 711 | extern char trace_find_mark(unsigned long long duration); | ||
| 712 | |||
| 722 | /* Standard output formatting function used for function return traces */ | 713 | /* Standard output formatting function used for function return traces */ |
| 723 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 714 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 724 | 715 | ||
| @@ -737,7 +728,7 @@ extern unsigned long trace_flags; | |||
| 737 | extern enum print_line_t | 728 | extern enum print_line_t |
| 738 | print_graph_function_flags(struct trace_iterator *iter, u32 flags); | 729 | print_graph_function_flags(struct trace_iterator *iter, u32 flags); |
| 739 | extern void print_graph_headers_flags(struct seq_file *s, u32 flags); | 730 | extern void print_graph_headers_flags(struct seq_file *s, u32 flags); |
| 740 | extern enum print_line_t | 731 | extern void |
| 741 | trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); | 732 | trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); |
| 742 | extern void graph_trace_open(struct trace_iterator *iter); | 733 | extern void graph_trace_open(struct trace_iterator *iter); |
| 743 | extern void graph_trace_close(struct trace_iterator *iter); | 734 | extern void graph_trace_close(struct trace_iterator *iter); |
| @@ -1310,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call, | |||
| 1310 | #define perf_ftrace_event_register NULL | 1301 | #define perf_ftrace_event_register NULL |
| 1311 | #endif | 1302 | #endif |
| 1312 | 1303 | ||
| 1304 | #ifdef CONFIG_FTRACE_SYSCALLS | ||
| 1305 | void init_ftrace_syscalls(void); | ||
| 1306 | #else | ||
| 1307 | static inline void init_ftrace_syscalls(void) { } | ||
| 1308 | #endif | ||
| 1309 | |||
| 1310 | #ifdef CONFIG_EVENT_TRACING | ||
| 1311 | void trace_event_init(void); | ||
| 1312 | #else | ||
| 1313 | static inline void __init trace_event_init(void) { } | ||
| 1314 | #endif | ||
| 1315 | |||
| 1316 | extern struct trace_iterator *tracepoint_print_iter; | ||
| 1317 | |||
| 1313 | #endif /* _LINUX_KERNEL_TRACE_H */ | 1318 | #endif /* _LINUX_KERNEL_TRACE_H */ |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 697fb9bac8f0..7d6e2afde669 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
| @@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, | |||
| 151 | 151 | ||
| 152 | trace_assign_type(field, iter->ent); | 152 | trace_assign_type(field, iter->ent); |
| 153 | 153 | ||
| 154 | if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", | 154 | trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", |
| 155 | field->correct ? " ok " : " MISS ", | 155 | field->correct ? " ok " : " MISS ", |
| 156 | field->func, | 156 | field->func, |
| 157 | field->file, | 157 | field->file, |
| 158 | field->line)) | 158 | field->line); |
| 159 | return TRACE_TYPE_PARTIAL_LINE; | 159 | |
| 160 | 160 | return trace_handle_return(&iter->seq); | |
| 161 | return TRACE_TYPE_HANDLED; | ||
| 162 | } | 161 | } |
| 163 | 162 | ||
| 164 | static void branch_print_header(struct seq_file *s) | 163 | static void branch_print_header(struct seq_file *s) |
| 165 | { | 164 | { |
| 166 | seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" | 165 | seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" |
| 167 | " FUNC:FILE:LINE\n"); | 166 | " FUNC:FILE:LINE\n" |
| 168 | seq_puts(s, "# | | | | | " | 167 | "# | | | | | " |
| 169 | " |\n"); | 168 | " |\n"); |
| 170 | } | 169 | } |
| 171 | 170 | ||
| 172 | static struct trace_event_functions trace_branch_funcs = { | 171 | static struct trace_event_functions trace_branch_funcs = { |
| @@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[]; | |||
| 233 | 232 | ||
| 234 | static int annotated_branch_stat_headers(struct seq_file *m) | 233 | static int annotated_branch_stat_headers(struct seq_file *m) |
| 235 | { | 234 | { |
| 236 | seq_printf(m, " correct incorrect %% "); | 235 | seq_puts(m, " correct incorrect % " |
| 237 | seq_printf(m, " Function " | 236 | " Function " |
| 238 | " File Line\n" | 237 | " File Line\n" |
| 239 | " ------- --------- - " | 238 | " ------- --------- - " |
| 240 | " -------- " | 239 | " -------- " |
| 241 | " ---- ----\n"); | 240 | " ---- ----\n"); |
| 242 | return 0; | 241 | return 0; |
| 243 | } | 242 | } |
| 244 | 243 | ||
| @@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v) | |||
| 274 | 273 | ||
| 275 | seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); | 274 | seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); |
| 276 | if (percent < 0) | 275 | if (percent < 0) |
| 277 | seq_printf(m, " X "); | 276 | seq_puts(m, " X "); |
| 278 | else | 277 | else |
| 279 | seq_printf(m, "%3ld ", percent); | 278 | seq_printf(m, "%3ld ", percent); |
| 280 | seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); | 279 | seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); |
| @@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[]; | |||
| 362 | 361 | ||
| 363 | static int all_branch_stat_headers(struct seq_file *m) | 362 | static int all_branch_stat_headers(struct seq_file *m) |
| 364 | { | 363 | { |
| 365 | seq_printf(m, " miss hit %% "); | 364 | seq_puts(m, " miss hit % " |
| 366 | seq_printf(m, " Function " | 365 | " Function " |
| 367 | " File Line\n" | 366 | " File Line\n" |
| 368 | " ------- --------- - " | 367 | " ------- --------- - " |
| 369 | " -------- " | 368 | " -------- " |
| 370 | " ---- ----\n"); | 369 | " ---- ----\n"); |
| 371 | return 0; | 370 | return 0; |
| 372 | } | 371 | } |
| 373 | 372 | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 4b9c114ee9de..6fa484de2ba1 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
| @@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags) | |||
| 261 | } | 261 | } |
| 262 | 262 | ||
| 263 | void *perf_trace_buf_prepare(int size, unsigned short type, | 263 | void *perf_trace_buf_prepare(int size, unsigned short type, |
| 264 | struct pt_regs *regs, int *rctxp) | 264 | struct pt_regs **regs, int *rctxp) |
| 265 | { | 265 | { |
| 266 | struct trace_entry *entry; | 266 | struct trace_entry *entry; |
| 267 | unsigned long flags; | 267 | unsigned long flags; |
| @@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type, | |||
| 280 | if (*rctxp < 0) | 280 | if (*rctxp < 0) |
| 281 | return NULL; | 281 | return NULL; |
| 282 | 282 | ||
| 283 | if (regs) | ||
| 284 | *regs = this_cpu_ptr(&__perf_regs[*rctxp]); | ||
| 283 | raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); | 285 | raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); |
| 284 | 286 | ||
| 285 | /* zero the dead bytes from align to not leak stack to user */ | 287 | /* zero the dead bytes from align to not leak stack to user */ |
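perf_trace_buf_prepare() above now takes struct pt_regs ** and fills the caller's pointer only when one is supplied. The sketch below shows that optional out-parameter idiom in plain C under made-up names; it is not the perf API.

    #include <stdio.h>

    struct regs { long ip; };

    static struct regs scratch_regs;        /* stands in for the per-CPU area */

    /* Optional out-parameter: dereference 'regs' only if the caller wants it. */
    static void *buf_prepare(size_t size, struct regs **regs)
    {
        static char buf[256];

        if (regs)
            *regs = &scratch_regs;
        return size <= sizeof(buf) ? buf : NULL;
    }

    int main(void)
    {
        struct regs *regs = NULL;
        void *buf;

        buf = buf_prepare(64, &regs);       /* caller wants the regs pointer */
        printf("buf=%p regs=%p\n", buf, (void *)regs);

        buf = buf_prepare(64, NULL);        /* caller does not need regs */
        printf("buf=%p\n", buf);
        return 0;
    }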
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0cc51edde3a8..b03a0ea77b99 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, | |||
| 212 | } | 212 | } |
| 213 | EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); | 213 | EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); |
| 214 | 214 | ||
| 215 | static DEFINE_SPINLOCK(tracepoint_iter_lock); | ||
| 216 | |||
| 217 | static void output_printk(struct ftrace_event_buffer *fbuffer) | ||
| 218 | { | ||
| 219 | struct ftrace_event_call *event_call; | ||
| 220 | struct trace_event *event; | ||
| 221 | unsigned long flags; | ||
| 222 | struct trace_iterator *iter = tracepoint_print_iter; | ||
| 223 | |||
| 224 | if (!iter) | ||
| 225 | return; | ||
| 226 | |||
| 227 | event_call = fbuffer->ftrace_file->event_call; | ||
| 228 | if (!event_call || !event_call->event.funcs || | ||
| 229 | !event_call->event.funcs->trace) | ||
| 230 | return; | ||
| 231 | |||
| 232 | event = &fbuffer->ftrace_file->event_call->event; | ||
| 233 | |||
| 234 | spin_lock_irqsave(&tracepoint_iter_lock, flags); | ||
| 235 | trace_seq_init(&iter->seq); | ||
| 236 | iter->ent = fbuffer->entry; | ||
| 237 | event_call->event.funcs->trace(iter, 0, event); | ||
| 238 | trace_seq_putc(&iter->seq, 0); | ||
| 239 | printk("%s", iter->seq.buffer); | ||
| 240 | |||
| 241 | spin_unlock_irqrestore(&tracepoint_iter_lock, flags); | ||
| 242 | } | ||
| 243 | |||
| 215 | void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) | 244 | void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) |
| 216 | { | 245 | { |
| 246 | if (tracepoint_printk) | ||
| 247 | output_printk(fbuffer); | ||
| 248 | |||
| 217 | event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, | 249 | event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, |
| 218 | fbuffer->event, fbuffer->entry, | 250 | fbuffer->event, fbuffer->entry, |
| 219 | fbuffer->flags, fbuffer->pc); | 251 | fbuffer->flags, fbuffer->pc); |
| @@ -461,7 +493,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) | |||
| 461 | 493 | ||
| 462 | if (dir) { | 494 | if (dir) { |
| 463 | spin_lock(&dir->d_lock); /* probably unneeded */ | 495 | spin_lock(&dir->d_lock); /* probably unneeded */ |
| 464 | list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { | 496 | list_for_each_entry(child, &dir->d_subdirs, d_child) { |
| 465 | if (child->d_inode) /* probably unneeded */ | 497 | if (child->d_inode) /* probably unneeded */ |
| 466 | child->d_inode->i_private = NULL; | 498 | child->d_inode->i_private = NULL; |
| 467 | } | 499 | } |
| @@ -918,7 +950,7 @@ static int f_show(struct seq_file *m, void *v) | |||
| 918 | case FORMAT_HEADER: | 950 | case FORMAT_HEADER: |
| 919 | seq_printf(m, "name: %s\n", ftrace_event_name(call)); | 951 | seq_printf(m, "name: %s\n", ftrace_event_name(call)); |
| 920 | seq_printf(m, "ID: %d\n", call->event.type); | 952 | seq_printf(m, "ID: %d\n", call->event.type); |
| 921 | seq_printf(m, "format:\n"); | 953 | seq_puts(m, "format:\n"); |
| 922 | return 0; | 954 | return 0; |
| 923 | 955 | ||
| 924 | case FORMAT_FIELD_SEPERATOR: | 956 | case FORMAT_FIELD_SEPERATOR: |
| @@ -1044,7 +1076,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
| 1044 | mutex_unlock(&event_mutex); | 1076 | mutex_unlock(&event_mutex); |
| 1045 | 1077 | ||
| 1046 | if (file) | 1078 | if (file) |
| 1047 | r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); | 1079 | r = simple_read_from_buffer(ubuf, cnt, ppos, |
| 1080 | s->buffer, trace_seq_used(s)); | ||
| 1048 | 1081 | ||
| 1049 | kfree(s); | 1082 | kfree(s); |
| 1050 | 1083 | ||
| @@ -1210,7 +1243,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
| 1210 | trace_seq_init(s); | 1243 | trace_seq_init(s); |
| 1211 | 1244 | ||
| 1212 | print_subsystem_event_filter(system, s); | 1245 | print_subsystem_event_filter(system, s); |
| 1213 | r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); | 1246 | r = simple_read_from_buffer(ubuf, cnt, ppos, |
| 1247 | s->buffer, trace_seq_used(s)); | ||
| 1214 | 1248 | ||
| 1215 | kfree(s); | 1249 | kfree(s); |
| 1216 | 1250 | ||
| @@ -1265,7 +1299,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) | |||
| 1265 | trace_seq_init(s); | 1299 | trace_seq_init(s); |
| 1266 | 1300 | ||
| 1267 | func(s); | 1301 | func(s); |
| 1268 | r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); | 1302 | r = simple_read_from_buffer(ubuf, cnt, ppos, |
| 1303 | s->buffer, trace_seq_used(s)); | ||
| 1269 | 1304 | ||
| 1270 | kfree(s); | 1305 | kfree(s); |
| 1271 | 1306 | ||
| @@ -1988,7 +2023,7 @@ event_enable_print(struct seq_file *m, unsigned long ip, | |||
| 1988 | ftrace_event_name(data->file->event_call)); | 2023 | ftrace_event_name(data->file->event_call)); |
| 1989 | 2024 | ||
| 1990 | if (data->count == -1) | 2025 | if (data->count == -1) |
| 1991 | seq_printf(m, ":unlimited\n"); | 2026 | seq_puts(m, ":unlimited\n"); |
| 1992 | else | 2027 | else |
| 1993 | seq_printf(m, ":count=%ld\n", data->count); | 2028 | seq_printf(m, ":count=%ld\n", data->count); |
| 1994 | 2029 | ||
| @@ -2394,12 +2429,39 @@ static __init int event_trace_memsetup(void) | |||
| 2394 | return 0; | 2429 | return 0; |
| 2395 | } | 2430 | } |
| 2396 | 2431 | ||
| 2432 | static __init void | ||
| 2433 | early_enable_events(struct trace_array *tr, bool disable_first) | ||
| 2434 | { | ||
| 2435 | char *buf = bootup_event_buf; | ||
| 2436 | char *token; | ||
| 2437 | int ret; | ||
| 2438 | |||
| 2439 | while (true) { | ||
| 2440 | token = strsep(&buf, ","); | ||
| 2441 | |||
| 2442 | if (!token) | ||
| 2443 | break; | ||
| 2444 | if (!*token) | ||
| 2445 | continue; | ||
| 2446 | |||
| 2447 | /* Restarting syscalls requires that we stop them first */ | ||
| 2448 | if (disable_first) | ||
| 2449 | ftrace_set_clr_event(tr, token, 0); | ||
| 2450 | |||
| 2451 | ret = ftrace_set_clr_event(tr, token, 1); | ||
| 2452 | if (ret) | ||
| 2453 | pr_warn("Failed to enable trace event: %s\n", token); | ||
| 2454 | |||
| 2455 | /* Put back the comma to allow this to be called again */ | ||
| 2456 | if (buf) | ||
| 2457 | *(buf - 1) = ','; | ||
| 2458 | } | ||
| 2459 | } | ||
| 2460 | |||
| 2397 | static __init int event_trace_enable(void) | 2461 | static __init int event_trace_enable(void) |
| 2398 | { | 2462 | { |
| 2399 | struct trace_array *tr = top_trace_array(); | 2463 | struct trace_array *tr = top_trace_array(); |
| 2400 | struct ftrace_event_call **iter, *call; | 2464 | struct ftrace_event_call **iter, *call; |
| 2401 | char *buf = bootup_event_buf; | ||
| 2402 | char *token; | ||
| 2403 | int ret; | 2465 | int ret; |
| 2404 | 2466 | ||
| 2405 | if (!tr) | 2467 | if (!tr) |
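early_enable_events() above walks the comma-separated boot string with strsep() and writes the ',' back afterwards so that event_trace_enable_again() can parse the very same buffer a second time. A minimal userspace sketch of that walk-and-restore loop follows; the event names in the buffer are made up.

    #define _DEFAULT_SOURCE                 /* for strsep() on glibc */
    #include <stdio.h>
    #include <string.h>

    static void enable_events(char *buf, int pass)
    {
        char *token;

        while (1) {
            token = strsep(&buf, ",");
            if (!token)
                break;
            if (!*token)
                continue;                   /* skip empty entries */

            printf("pass %d: enabling %s\n", pass, token);

            /* strsep() replaced the ',' with '\0'; put it back so the
             * string can be walked again later. */
            if (buf)
                *(buf - 1) = ',';
        }
    }

    int main(void)
    {
        char bootup_buf[] = "sched_switch,irq_handler_entry,";

        enable_events(bootup_buf, 1);
        enable_events(bootup_buf, 2);       /* works again thanks to the restore */
        return 0;
    }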
| @@ -2421,18 +2483,7 @@ static __init int event_trace_enable(void) | |||
| 2421 | */ | 2483 | */ |
| 2422 | __trace_early_add_events(tr); | 2484 | __trace_early_add_events(tr); |
| 2423 | 2485 | ||
| 2424 | while (true) { | 2486 | early_enable_events(tr, false); |
| 2425 | token = strsep(&buf, ","); | ||
| 2426 | |||
| 2427 | if (!token) | ||
| 2428 | break; | ||
| 2429 | if (!*token) | ||
| 2430 | continue; | ||
| 2431 | |||
| 2432 | ret = ftrace_set_clr_event(tr, token, 1); | ||
| 2433 | if (ret) | ||
| 2434 | pr_warn("Failed to enable trace event: %s\n", token); | ||
| 2435 | } | ||
| 2436 | 2487 | ||
| 2437 | trace_printk_start_comm(); | 2488 | trace_printk_start_comm(); |
| 2438 | 2489 | ||
| @@ -2443,6 +2494,31 @@ static __init int event_trace_enable(void) | |||
| 2443 | return 0; | 2494 | return 0; |
| 2444 | } | 2495 | } |
| 2445 | 2496 | ||
| 2497 | /* | ||
| 2498 | * event_trace_enable() is called from trace_event_init() first to | ||
| 2499 | * initialize events and perhaps start any events that are on the | ||
| 2500 | * command line. Unfortunately, there are some events that will not | ||
| 2501 | * start this early, like the system call tracepoints that need | ||
| 2502 | * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable() | ||
| 2503 | * is called before pid 1 starts, and this flag is never set, making | ||
| 2504 | * the syscall tracepoint never get reached, but the event is enabled | ||
| 2505 | * regardless (and not doing anything). | ||
| 2506 | */ | ||
| 2507 | static __init int event_trace_enable_again(void) | ||
| 2508 | { | ||
| 2509 | struct trace_array *tr; | ||
| 2510 | |||
| 2511 | tr = top_trace_array(); | ||
| 2512 | if (!tr) | ||
| 2513 | return -ENODEV; | ||
| 2514 | |||
| 2515 | early_enable_events(tr, true); | ||
| 2516 | |||
| 2517 | return 0; | ||
| 2518 | } | ||
| 2519 | |||
| 2520 | early_initcall(event_trace_enable_again); | ||
| 2521 | |||
| 2446 | static __init int event_trace_init(void) | 2522 | static __init int event_trace_init(void) |
| 2447 | { | 2523 | { |
| 2448 | struct trace_array *tr; | 2524 | struct trace_array *tr; |
| @@ -2477,8 +2553,14 @@ static __init int event_trace_init(void) | |||
| 2477 | #endif | 2553 | #endif |
| 2478 | return 0; | 2554 | return 0; |
| 2479 | } | 2555 | } |
| 2480 | early_initcall(event_trace_memsetup); | 2556 | |
| 2481 | core_initcall(event_trace_enable); | 2557 | void __init trace_event_init(void) |
| 2558 | { | ||
| 2559 | event_trace_memsetup(); | ||
| 2560 | init_ftrace_syscalls(); | ||
| 2561 | event_trace_enable(); | ||
| 2562 | } | ||
| 2563 | |||
| 2482 | fs_initcall(event_trace_init); | 2564 | fs_initcall(event_trace_init); |
| 2483 | 2565 | ||
| 2484 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 2566 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
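The last hunk drops the early_initcall()/core_initcall() registrations in favour of an explicit trace_event_init() that groups the three setup steps, presumably so core boot code can run them at a well-defined point instead of relying on initcall ordering; only the debugfs-facing event_trace_init() stays at fs_initcall(). For reference, an illustrative built-in-code sketch of the relative ordering of the initcall levels involved (early runs before core, which runs before fs):

#include <linux/init.h>
#include <linux/printk.h>

static int __init demo_early(void) { pr_info("1: early_initcall\n"); return 0; }
static int __init demo_core(void)  { pr_info("2: core_initcall\n");  return 0; }
static int __init demo_fs(void)    { pr_info("3: fs_initcall\n");    return 0; }

early_initcall(demo_early);	/* initcalls like these only apply to built-in code */
core_initcall(demo_core);
fs_initcall(demo_fs);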
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7a8c1528e141..ced69da0ff55 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
| @@ -45,6 +45,7 @@ enum filter_op_ids | |||
| 45 | OP_GT, | 45 | OP_GT, |
| 46 | OP_GE, | 46 | OP_GE, |
| 47 | OP_BAND, | 47 | OP_BAND, |
| 48 | OP_NOT, | ||
| 48 | OP_NONE, | 49 | OP_NONE, |
| 49 | OP_OPEN_PAREN, | 50 | OP_OPEN_PAREN, |
| 50 | }; | 51 | }; |
| @@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = { | |||
| 67 | { OP_GT, ">", 5 }, | 68 | { OP_GT, ">", 5 }, |
| 68 | { OP_GE, ">=", 5 }, | 69 | { OP_GE, ">=", 5 }, |
| 69 | { OP_BAND, "&", 6 }, | 70 | { OP_BAND, "&", 6 }, |
| 71 | { OP_NOT, "!", 6 }, | ||
| 70 | { OP_NONE, "OP_NONE", 0 }, | 72 | { OP_NONE, "OP_NONE", 0 }, |
| 71 | { OP_OPEN_PAREN, "(", 0 }, | 73 | { OP_OPEN_PAREN, "(", 0 }, |
| 72 | }; | 74 | }; |
| @@ -85,6 +87,7 @@ enum { | |||
| 85 | FILT_ERR_MISSING_FIELD, | 87 | FILT_ERR_MISSING_FIELD, |
| 86 | FILT_ERR_INVALID_FILTER, | 88 | FILT_ERR_INVALID_FILTER, |
| 87 | FILT_ERR_IP_FIELD_ONLY, | 89 | FILT_ERR_IP_FIELD_ONLY, |
| 90 | FILT_ERR_ILLEGAL_NOT_OP, | ||
| 88 | }; | 91 | }; |
| 89 | 92 | ||
| 90 | static char *err_text[] = { | 93 | static char *err_text[] = { |
| @@ -101,6 +104,7 @@ static char *err_text[] = { | |||
| 101 | "Missing field name and/or value", | 104 | "Missing field name and/or value", |
| 102 | "Meaningless filter expression", | 105 | "Meaningless filter expression", |
| 103 | "Only 'ip' field is supported for function trace", | 106 | "Only 'ip' field is supported for function trace", |
| 107 | "Illegal use of '!'", | ||
| 104 | }; | 108 | }; |
| 105 | 109 | ||
| 106 | struct opstack_op { | 110 | struct opstack_op { |
| @@ -139,6 +143,7 @@ struct pred_stack { | |||
| 139 | int index; | 143 | int index; |
| 140 | }; | 144 | }; |
| 141 | 145 | ||
| 146 | /* If not of not match is equal to not of not, then it is a match */ | ||
| 142 | #define DEFINE_COMPARISON_PRED(type) \ | 147 | #define DEFINE_COMPARISON_PRED(type) \ |
| 143 | static int filter_pred_##type(struct filter_pred *pred, void *event) \ | 148 | static int filter_pred_##type(struct filter_pred *pred, void *event) \ |
| 144 | { \ | 149 | { \ |
| @@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ | |||
| 166 | break; \ | 171 | break; \ |
| 167 | } \ | 172 | } \ |
| 168 | \ | 173 | \ |
| 169 | return match; \ | 174 | return !!match == !pred->not; \ |
| 170 | } | 175 | } |
| 171 | 176 | ||
| 172 | #define DEFINE_EQUALITY_PRED(size) \ | 177 | #define DEFINE_EQUALITY_PRED(size) \ |
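The predicate return value above (and the matching change in process_ops() in the next hunk) folds an optional NOT into the comparison: !!match collapses any non-zero match to 1, and comparing against !pred->not inverts the result exactly when the not flag is set, i.e. it is a boolean XOR. A tiny self-contained check of that identity:

#include <assert.h>

/* same shape as "return !!match == !pred->not;" in the hunk above */
static int pred_result(int raw_match, int not)
{
	return !!raw_match == !not;
}

int main(void)
{
	assert(pred_result(5, 0) == 1);	/* matched, no NOT  -> hit  */
	assert(pred_result(0, 0) == 0);	/* no match, no NOT -> miss */
	assert(pred_result(5, 1) == 0);	/* matched, NOT     -> miss */
	assert(pred_result(0, 1) == 1);	/* no match, NOT    -> hit  */
	assert(pred_result(5, 1) == ((!!5) ^ (!!1)));	/* equivalent to XOR */
	return 0;
}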
| @@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds, | |||
| 484 | if (!WARN_ON_ONCE(!pred->fn)) | 489 | if (!WARN_ON_ONCE(!pred->fn)) |
| 485 | match = pred->fn(pred, rec); | 490 | match = pred->fn(pred, rec); |
| 486 | if (!!match == type) | 491 | if (!!match == type) |
| 487 | return match; | 492 | break; |
| 488 | } | 493 | } |
| 489 | return match; | 494 | /* If not of not match is equal to not of not, then it is a match */ |
| 495 | return !!match == !op->not; | ||
| 490 | } | 496 | } |
| 491 | 497 | ||
| 492 | struct filter_match_preds_data { | 498 | struct filter_match_preds_data { |
| @@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter, | |||
| 735 | * then this op can be folded. | 741 | * then this op can be folded. |
| 736 | */ | 742 | */ |
| 737 | if (left->index & FILTER_PRED_FOLD && | 743 | if (left->index & FILTER_PRED_FOLD && |
| 738 | (left->op == dest->op || | 744 | ((left->op == dest->op && !left->not) || |
| 739 | left->left == FILTER_PRED_INVALID) && | 745 | left->left == FILTER_PRED_INVALID) && |
| 740 | right->index & FILTER_PRED_FOLD && | 746 | right->index & FILTER_PRED_FOLD && |
| 741 | (right->op == dest->op || | 747 | ((right->op == dest->op && !right->not) || |
| 742 | right->left == FILTER_PRED_INVALID)) | 748 | right->left == FILTER_PRED_INVALID)) |
| 743 | dest->index |= FILTER_PRED_FOLD; | 749 | dest->index |= FILTER_PRED_FOLD; |
| 744 | 750 | ||
| @@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps, | |||
| 1028 | } | 1034 | } |
| 1029 | 1035 | ||
| 1030 | if (pred->op == OP_NE) | 1036 | if (pred->op == OP_NE) |
| 1031 | pred->not = 1; | 1037 | pred->not ^= 1; |
| 1032 | 1038 | ||
| 1033 | pred->fn = fn; | 1039 | pred->fn = fn; |
| 1034 | return 0; | 1040 | return 0; |
| @@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call, | |||
| 1590 | continue; | 1596 | continue; |
| 1591 | } | 1597 | } |
| 1592 | 1598 | ||
| 1599 | if (elt->op == OP_NOT) { | ||
| 1600 | if (!n_preds || operand1 || operand2) { | ||
| 1601 | parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0); | ||
| 1602 | err = -EINVAL; | ||
| 1603 | goto fail; | ||
| 1604 | } | ||
| 1605 | if (!dry_run) | ||
| 1606 | filter->preds[n_preds - 1].not ^= 1; | ||
| 1607 | continue; | ||
| 1608 | } | ||
| 1609 | |||
| 1593 | if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { | 1610 | if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { |
| 1594 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 1611 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
| 1595 | err = -ENOSPC; | 1612 | err = -ENOSPC; |
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 4747b476a030..8712df9decb4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c | |||
| @@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m, | |||
| 373 | { | 373 | { |
| 374 | long count = (long)data; | 374 | long count = (long)data; |
| 375 | 375 | ||
| 376 | seq_printf(m, "%s", name); | 376 | seq_puts(m, name); |
| 377 | 377 | ||
| 378 | if (count == -1) | 378 | if (count == -1) |
| 379 | seq_puts(m, ":unlimited"); | 379 | seq_puts(m, ":unlimited"); |
| @@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m, | |||
| 383 | if (filter_str) | 383 | if (filter_str) |
| 384 | seq_printf(m, " if %s\n", filter_str); | 384 | seq_printf(m, " if %s\n", filter_str); |
| 385 | else | 385 | else |
| 386 | seq_puts(m, "\n"); | 386 | seq_putc(m, '\n'); |
| 387 | 387 | ||
| 388 | return 0; | 388 | return 0; |
| 389 | } | 389 | } |
| @@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, | |||
| 1105 | if (data->filter_str) | 1105 | if (data->filter_str) |
| 1106 | seq_printf(m, " if %s\n", data->filter_str); | 1106 | seq_printf(m, " if %s\n", data->filter_str); |
| 1107 | else | 1107 | else |
| 1108 | seq_puts(m, "\n"); | 1108 | seq_putc(m, '\n'); |
| 1109 | 1109 | ||
| 1110 | return 0; | 1110 | return 0; |
| 1111 | } | 1111 | } |
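The seq_printf() -> seq_puts()/seq_putc() conversions in this file (and the similar ones in trace_events.c above and trace_kprobe.c below) follow the usual seq_file rule of thumb: skip format parsing when there is nothing to format, and never feed a variable string in as the format itself. An illustrative fragment (the values are placeholders, not taken from the patch):

#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
	const char *name = "wakeup";		/* placeholder value           */

	seq_puts(m, name);			/* not seq_printf(m, name)     */
	seq_puts(m, ":unlimited");		/* constant string             */
	seq_putc(m, '\n');			/* single character            */
	seq_printf(m, ":count=%ld\n", 42L);	/* formatting actually needed  */
	return 0;
}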
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 57f0ec962d2c..fcd41a166405 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
| @@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data = | |||
| 261 | }; | 261 | }; |
| 262 | 262 | ||
| 263 | #ifdef CONFIG_DYNAMIC_FTRACE | 263 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 264 | static int update_count(void **data) | 264 | static void update_traceon_count(void **data, bool on) |
| 265 | { | 265 | { |
| 266 | unsigned long *count = (long *)data; | 266 | long *count = (long *)data; |
| 267 | long old_count = *count; | ||
| 267 | 268 | ||
| 268 | if (!*count) | 269 | /* |
| 269 | return 0; | 270 | * Tracing gets disabled (or enabled) once per count. |
| 271 | * This function can be called at the same time on multiple CPUs. | ||
| 272 | * It is fine if both disable (or enable) tracing, as disabling | ||
| 273 | * (or enabling) the second time doesn't do anything as the | ||
| 274 | * state of the tracer is already disabled (or enabled). | ||
| 275 | * What needs to be synchronized in this case is that the count | ||
| 276 | * only gets decremented once, even if the tracer is disabled | ||
| 277 | * (or enabled) twice, as the second one is really a nop. | ||
| 278 | * | ||
| 279 | * The memory barriers guarantee that we only decrement the | ||
| 280 | * counter once. First the count is read to a local variable | ||
| 281 | * and a read barrier is used to make sure that it is loaded | ||
| 282 | * before checking if the tracer is in the state we want. | ||
| 283 | * If the tracer is not in the state we want, then the count | ||
| 284 | * is guaranteed to be the old count. | ||
| 285 | * | ||
| 286 | * Next the tracer is set to the state we want (disabled or enabled) | ||
| 287 | * then a write memory barrier is used to make sure that | ||
| 288 | * the new state is visible before changing the counter by | ||
| 289 | * one minus the old counter. This guarantees that another CPU | ||
| 290 | * executing this code will see the new state before seeing | ||
| 291 | * the new counter value, and would not do anything if the new | ||
| 292 | * counter is seen. | ||
| 293 | * | ||
| 294 | * Note, there is no synchronization between this and a user | ||
| 295 | * setting the tracing_on file. But we currently don't care | ||
| 296 | * about that. | ||
| 297 | */ | ||
| 298 | if (!old_count) | ||
| 299 | return; | ||
| 270 | 300 | ||
| 271 | if (*count != -1) | 301 | /* Make sure we see count before checking tracing state */ |
| 272 | (*count)--; | 302 | smp_rmb(); |
| 273 | 303 | ||
| 274 | return 1; | 304 | if (on == !!tracing_is_on()) |
| 305 | return; | ||
| 306 | |||
| 307 | if (on) | ||
| 308 | tracing_on(); | ||
| 309 | else | ||
| 310 | tracing_off(); | ||
| 311 | |||
| 312 | /* unlimited? */ | ||
| 313 | if (old_count == -1) | ||
| 314 | return; | ||
| 315 | |||
| 316 | /* Make sure tracing state is visible before updating count */ | ||
| 317 | smp_wmb(); | ||
| 318 | |||
| 319 | *count = old_count - 1; | ||
| 275 | } | 320 | } |
| 276 | 321 | ||
| 277 | static void | 322 | static void |
| 278 | ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) | 323 | ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) |
| 279 | { | 324 | { |
| 280 | if (tracing_is_on()) | 325 | update_traceon_count(data, 1); |
| 281 | return; | ||
| 282 | |||
| 283 | if (update_count(data)) | ||
| 284 | tracing_on(); | ||
| 285 | } | 326 | } |
| 286 | 327 | ||
| 287 | static void | 328 | static void |
| 288 | ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) | 329 | ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) |
| 289 | { | 330 | { |
| 290 | if (!tracing_is_on()) | 331 | update_traceon_count(data, 0); |
| 291 | return; | ||
| 292 | |||
| 293 | if (update_count(data)) | ||
| 294 | tracing_off(); | ||
| 295 | } | 332 | } |
| 296 | 333 | ||
| 297 | static void | 334 | static void |
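The block comment in update_traceon_count() is the heart of this hunk: read the counter, smp_rmb(), check/flip the tracing state, smp_wmb(), then publish old_count - 1, so any CPU that sees the decremented counter also sees the new tracing state, and concurrent callers can at worst repeat the idempotent on/off switch, never an extra decrement's effect. A condensed kernel-style restatement of that ordering (fragment only; 'state' and 'budget' are stand-ins, not real tracer fields):

static long budget;		/* stand-in for the probe's count      */
static int  state;		/* stand-in for the tracing on/off bit */

static void flip_once_per_budget(int want)
{
	long old = budget;

	if (!old)
		return;			/* budget used up                   */

	smp_rmb();			/* read budget before reading state */

	if (state == want)
		return;			/* another CPU already flipped it   */

	state = want;			/* the idempotent side effect       */

	if (old == -1)
		return;			/* -1 means unlimited               */

	smp_wmb();			/* publish state before the budget  */
	budget = old - 1;
}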
| @@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) | |||
| 330 | static void | 367 | static void |
| 331 | ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) | 368 | ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) |
| 332 | { | 369 | { |
| 333 | if (!tracing_is_on()) | 370 | long *count = (long *)data; |
| 334 | return; | 371 | long old_count; |
| 372 | long new_count; | ||
| 335 | 373 | ||
| 336 | if (update_count(data)) | 374 | /* |
| 337 | trace_dump_stack(STACK_SKIP); | 375 | * Stack traces should only execute the number of times the |
| 376 | * user specified in the counter. | ||
| 377 | */ | ||
| 378 | do { | ||
| 379 | |||
| 380 | if (!tracing_is_on()) | ||
| 381 | return; | ||
| 382 | |||
| 383 | old_count = *count; | ||
| 384 | |||
| 385 | if (!old_count) | ||
| 386 | return; | ||
| 387 | |||
| 388 | /* unlimited? */ | ||
| 389 | if (old_count == -1) { | ||
| 390 | trace_dump_stack(STACK_SKIP); | ||
| 391 | return; | ||
| 392 | } | ||
| 393 | |||
| 394 | new_count = old_count - 1; | ||
| 395 | new_count = cmpxchg(count, old_count, new_count); | ||
| 396 | if (new_count == old_count) | ||
| 397 | trace_dump_stack(STACK_SKIP); | ||
| 398 | |||
| 399 | } while (new_count != old_count); | ||
| 400 | } | ||
| 401 | |||
| 402 | static int update_count(void **data) | ||
| 403 | { | ||
| 404 | unsigned long *count = (long *)data; | ||
| 405 | |||
| 406 | if (!*count) | ||
| 407 | return 0; | ||
| 408 | |||
| 409 | if (*count != -1) | ||
| 410 | (*count)--; | ||
| 411 | |||
| 412 | return 1; | ||
| 338 | } | 413 | } |
| 339 | 414 | ||
| 340 | static void | 415 | static void |
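ftrace_stacktrace_count() now retries with cmpxchg() so the counter is consumed exactly once per stack dump even when several CPUs hit the probe at once; whoever loses the race re-reads the counter and tries again. A userspace sketch of the same decrement-exactly-once loop using a C11 compare-exchange (names are illustrative):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long budget = 3;

static void maybe_do_action(void)
{
	long old = atomic_load(&budget);

	while (old) {
		if (old == -1) {		/* -1 means unlimited */
			puts("action (unlimited)");
			return;
		}
		/* only the caller whose compare-exchange succeeds may act,
		 * so the budget is consumed exactly once per action */
		if (atomic_compare_exchange_weak(&budget, &old, old - 1)) {
			puts("action");
			return;
		}
		/* 'old' was reloaded by the failed exchange; retry */
	}
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		maybe_do_action();	/* prints "action" three times */
	return 0;
}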
| @@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m, | |||
| 361 | seq_printf(m, "%ps:%s", (void *)ip, name); | 436 | seq_printf(m, "%ps:%s", (void *)ip, name); |
| 362 | 437 | ||
| 363 | if (count == -1) | 438 | if (count == -1) |
| 364 | seq_printf(m, ":unlimited\n"); | 439 | seq_puts(m, ":unlimited\n"); |
| 365 | else | 440 | else |
| 366 | seq_printf(m, ":count=%ld\n", count); | 441 | seq_printf(m, ":count=%ld\n", count); |
| 367 | 442 | ||
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f0a0c982cde3..ba476009e5de 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -107,7 +107,7 @@ enum { | |||
| 107 | FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, | 107 | FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, |
| 108 | }; | 108 | }; |
| 109 | 109 | ||
| 110 | static enum print_line_t | 110 | static void |
| 111 | print_graph_duration(unsigned long long duration, struct trace_seq *s, | 111 | print_graph_duration(unsigned long long duration, struct trace_seq *s, |
| 112 | u32 flags); | 112 | u32 flags); |
| 113 | 113 | ||
| @@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr) | |||
| 483 | 483 | ||
| 484 | static int max_bytes_for_cpu; | 484 | static int max_bytes_for_cpu; |
| 485 | 485 | ||
| 486 | static enum print_line_t | 486 | static void print_graph_cpu(struct trace_seq *s, int cpu) |
| 487 | print_graph_cpu(struct trace_seq *s, int cpu) | ||
| 488 | { | 487 | { |
| 489 | int ret; | ||
| 490 | |||
| 491 | /* | 488 | /* |
| 492 | * Start with a space character - to make it stand out | 489 | * Start with a space character - to make it stand out |
| 493 | * to the right a bit when trace output is pasted into | 490 | * to the right a bit when trace output is pasted into |
| 494 | * email: | 491 | * email: |
| 495 | */ | 492 | */ |
| 496 | ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); | 493 | trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); |
| 497 | if (!ret) | ||
| 498 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 499 | |||
| 500 | return TRACE_TYPE_HANDLED; | ||
| 501 | } | 494 | } |
| 502 | 495 | ||
| 503 | #define TRACE_GRAPH_PROCINFO_LENGTH 14 | 496 | #define TRACE_GRAPH_PROCINFO_LENGTH 14 |
| 504 | 497 | ||
| 505 | static enum print_line_t | 498 | static void print_graph_proc(struct trace_seq *s, pid_t pid) |
| 506 | print_graph_proc(struct trace_seq *s, pid_t pid) | ||
| 507 | { | 499 | { |
| 508 | char comm[TASK_COMM_LEN]; | 500 | char comm[TASK_COMM_LEN]; |
| 509 | /* sign + log10(MAX_INT) + '\0' */ | 501 | /* sign + log10(MAX_INT) + '\0' */ |
| 510 | char pid_str[11]; | 502 | char pid_str[11]; |
| 511 | int spaces = 0; | 503 | int spaces = 0; |
| 512 | int ret; | ||
| 513 | int len; | 504 | int len; |
| 514 | int i; | 505 | int i; |
| 515 | 506 | ||
| @@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid) | |||
| 524 | spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; | 515 | spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; |
| 525 | 516 | ||
| 526 | /* First spaces to align center */ | 517 | /* First spaces to align center */ |
| 527 | for (i = 0; i < spaces / 2; i++) { | 518 | for (i = 0; i < spaces / 2; i++) |
| 528 | ret = trace_seq_putc(s, ' '); | 519 | trace_seq_putc(s, ' '); |
| 529 | if (!ret) | ||
| 530 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 531 | } | ||
| 532 | 520 | ||
| 533 | ret = trace_seq_printf(s, "%s-%s", comm, pid_str); | 521 | trace_seq_printf(s, "%s-%s", comm, pid_str); |
| 534 | if (!ret) | ||
| 535 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 536 | 522 | ||
| 537 | /* Last spaces to align center */ | 523 | /* Last spaces to align center */ |
| 538 | for (i = 0; i < spaces - (spaces / 2); i++) { | 524 | for (i = 0; i < spaces - (spaces / 2); i++) |
| 539 | ret = trace_seq_putc(s, ' '); | 525 | trace_seq_putc(s, ' '); |
| 540 | if (!ret) | ||
| 541 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 542 | } | ||
| 543 | return TRACE_TYPE_HANDLED; | ||
| 544 | } | 526 | } |
| 545 | 527 | ||
| 546 | 528 | ||
| 547 | static enum print_line_t | 529 | static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) |
| 548 | print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | ||
| 549 | { | 530 | { |
| 550 | if (!trace_seq_putc(s, ' ')) | 531 | trace_seq_putc(s, ' '); |
| 551 | return 0; | 532 | trace_print_lat_fmt(s, entry); |
| 552 | |||
| 553 | return trace_print_lat_fmt(s, entry); | ||
| 554 | } | 533 | } |
| 555 | 534 | ||
| 556 | /* If the pid changed since the last trace, output this event */ | 535 | /* If the pid changed since the last trace, output this event */ |
| 557 | static enum print_line_t | 536 | static void |
| 558 | verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) | 537 | verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) |
| 559 | { | 538 | { |
| 560 | pid_t prev_pid; | 539 | pid_t prev_pid; |
| 561 | pid_t *last_pid; | 540 | pid_t *last_pid; |
| 562 | int ret; | ||
| 563 | 541 | ||
| 564 | if (!data) | 542 | if (!data) |
| 565 | return TRACE_TYPE_HANDLED; | 543 | return; |
| 566 | 544 | ||
| 567 | last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); | 545 | last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); |
| 568 | 546 | ||
| 569 | if (*last_pid == pid) | 547 | if (*last_pid == pid) |
| 570 | return TRACE_TYPE_HANDLED; | 548 | return; |
| 571 | 549 | ||
| 572 | prev_pid = *last_pid; | 550 | prev_pid = *last_pid; |
| 573 | *last_pid = pid; | 551 | *last_pid = pid; |
| 574 | 552 | ||
| 575 | if (prev_pid == -1) | 553 | if (prev_pid == -1) |
| 576 | return TRACE_TYPE_HANDLED; | 554 | return; |
| 577 | /* | 555 | /* |
| 578 | * Context-switch trace line: | 556 | * Context-switch trace line: |
| 579 | 557 | ||
| @@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) | |||
| 582 | ------------------------------------------ | 560 | ------------------------------------------ |
| 583 | 561 | ||
| 584 | */ | 562 | */ |
| 585 | ret = trace_seq_puts(s, | 563 | trace_seq_puts(s, " ------------------------------------------\n"); |
| 586 | " ------------------------------------------\n"); | 564 | print_graph_cpu(s, cpu); |
| 587 | if (!ret) | 565 | print_graph_proc(s, prev_pid); |
| 588 | return TRACE_TYPE_PARTIAL_LINE; | 566 | trace_seq_puts(s, " => "); |
| 589 | 567 | print_graph_proc(s, pid); | |
| 590 | ret = print_graph_cpu(s, cpu); | 568 | trace_seq_puts(s, "\n ------------------------------------------\n\n"); |
| 591 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 592 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 593 | |||
| 594 | ret = print_graph_proc(s, prev_pid); | ||
| 595 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 596 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 597 | |||
| 598 | ret = trace_seq_puts(s, " => "); | ||
| 599 | if (!ret) | ||
| 600 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 601 | |||
| 602 | ret = print_graph_proc(s, pid); | ||
| 603 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 604 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 605 | |||
| 606 | ret = trace_seq_puts(s, | ||
| 607 | "\n ------------------------------------------\n\n"); | ||
| 608 | if (!ret) | ||
| 609 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 610 | |||
| 611 | return TRACE_TYPE_HANDLED; | ||
| 612 | } | 569 | } |
| 613 | 570 | ||
| 614 | static struct ftrace_graph_ret_entry * | 571 | static struct ftrace_graph_ret_entry * |
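Most of the changes in this file are mechanical: print_graph_cpu(), print_graph_proc(), verif_pid() and friends stop returning TRACE_TYPE_PARTIAL_LINE per call because the trace_seq writers now latch an overflow internally and later writes become no-ops; the caller checks once at the end (see trace_seq_has_overflowed()/trace_handle_return() further down). A simplified stand-in showing the latching idea (struct out_buf is not the real trace_seq layout):

#include <stdio.h>
#include <string.h>

struct out_buf {
	char	buffer[64];
	size_t	len;
	int	full;		/* latched overflow flag */
};

static void out_puts(struct out_buf *b, const char *s)
{
	size_t n = strlen(s);

	if (b->full || b->len + n >= sizeof(b->buffer)) {
		b->full = 1;	/* latch; all later writes are no-ops */
		return;
	}
	memcpy(b->buffer + b->len, s, n);
	b->len += n;
}

static int out_has_overflowed(const struct out_buf *b)
{
	return b->full;
}

int main(void)
{
	struct out_buf b = { .len = 0, .full = 0 };

	out_puts(&b, " 0)  cpu_idle() {\n");	/* no return value to check */
	out_puts(&b, " 0)  }\n");
	printf("overflowed=%d\n%s", out_has_overflowed(&b), b.buffer);
	return 0;
}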
| @@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter, | |||
| 682 | return next; | 639 | return next; |
| 683 | } | 640 | } |
| 684 | 641 | ||
| 685 | static int print_graph_abs_time(u64 t, struct trace_seq *s) | 642 | static void print_graph_abs_time(u64 t, struct trace_seq *s) |
| 686 | { | 643 | { |
| 687 | unsigned long usecs_rem; | 644 | unsigned long usecs_rem; |
| 688 | 645 | ||
| 689 | usecs_rem = do_div(t, NSEC_PER_SEC); | 646 | usecs_rem = do_div(t, NSEC_PER_SEC); |
| 690 | usecs_rem /= 1000; | 647 | usecs_rem /= 1000; |
| 691 | 648 | ||
| 692 | return trace_seq_printf(s, "%5lu.%06lu | ", | 649 | trace_seq_printf(s, "%5lu.%06lu | ", |
| 693 | (unsigned long)t, usecs_rem); | 650 | (unsigned long)t, usecs_rem); |
| 694 | } | 651 | } |
| 695 | 652 | ||
| 696 | static enum print_line_t | 653 | static void |
| 697 | print_graph_irq(struct trace_iterator *iter, unsigned long addr, | 654 | print_graph_irq(struct trace_iterator *iter, unsigned long addr, |
| 698 | enum trace_type type, int cpu, pid_t pid, u32 flags) | 655 | enum trace_type type, int cpu, pid_t pid, u32 flags) |
| 699 | { | 656 | { |
| 700 | int ret; | ||
| 701 | struct trace_seq *s = &iter->seq; | 657 | struct trace_seq *s = &iter->seq; |
| 658 | struct trace_entry *ent = iter->ent; | ||
| 702 | 659 | ||
| 703 | if (addr < (unsigned long)__irqentry_text_start || | 660 | if (addr < (unsigned long)__irqentry_text_start || |
| 704 | addr >= (unsigned long)__irqentry_text_end) | 661 | addr >= (unsigned long)__irqentry_text_end) |
| 705 | return TRACE_TYPE_UNHANDLED; | 662 | return; |
| 706 | 663 | ||
| 707 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { | 664 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { |
| 708 | /* Absolute time */ | 665 | /* Absolute time */ |
| 709 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { | 666 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) |
| 710 | ret = print_graph_abs_time(iter->ts, s); | 667 | print_graph_abs_time(iter->ts, s); |
| 711 | if (!ret) | ||
| 712 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 713 | } | ||
| 714 | 668 | ||
| 715 | /* Cpu */ | 669 | /* Cpu */ |
| 716 | if (flags & TRACE_GRAPH_PRINT_CPU) { | 670 | if (flags & TRACE_GRAPH_PRINT_CPU) |
| 717 | ret = print_graph_cpu(s, cpu); | 671 | print_graph_cpu(s, cpu); |
| 718 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 719 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 720 | } | ||
| 721 | 672 | ||
| 722 | /* Proc */ | 673 | /* Proc */ |
| 723 | if (flags & TRACE_GRAPH_PRINT_PROC) { | 674 | if (flags & TRACE_GRAPH_PRINT_PROC) { |
| 724 | ret = print_graph_proc(s, pid); | 675 | print_graph_proc(s, pid); |
| 725 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 676 | trace_seq_puts(s, " | "); |
| 726 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 727 | ret = trace_seq_puts(s, " | "); | ||
| 728 | if (!ret) | ||
| 729 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 730 | } | 677 | } |
| 678 | |||
| 679 | /* Latency format */ | ||
| 680 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
| 681 | print_graph_lat_fmt(s, ent); | ||
| 731 | } | 682 | } |
| 732 | 683 | ||
| 733 | /* No overhead */ | 684 | /* No overhead */ |
| 734 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); | 685 | print_graph_duration(0, s, flags | FLAGS_FILL_START); |
| 735 | if (ret != TRACE_TYPE_HANDLED) | ||
| 736 | return ret; | ||
| 737 | 686 | ||
| 738 | if (type == TRACE_GRAPH_ENT) | 687 | if (type == TRACE_GRAPH_ENT) |
| 739 | ret = trace_seq_puts(s, "==========>"); | 688 | trace_seq_puts(s, "==========>"); |
| 740 | else | 689 | else |
| 741 | ret = trace_seq_puts(s, "<=========="); | 690 | trace_seq_puts(s, "<=========="); |
| 742 | |||
| 743 | if (!ret) | ||
| 744 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 745 | |||
| 746 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); | ||
| 747 | if (ret != TRACE_TYPE_HANDLED) | ||
| 748 | return ret; | ||
| 749 | |||
| 750 | ret = trace_seq_putc(s, '\n'); | ||
| 751 | 691 | ||
| 752 | if (!ret) | 692 | print_graph_duration(0, s, flags | FLAGS_FILL_END); |
| 753 | return TRACE_TYPE_PARTIAL_LINE; | 693 | trace_seq_putc(s, '\n'); |
| 754 | return TRACE_TYPE_HANDLED; | ||
| 755 | } | 694 | } |
| 756 | 695 | ||
| 757 | enum print_line_t | 696 | void |
| 758 | trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | 697 | trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) |
| 759 | { | 698 | { |
| 760 | unsigned long nsecs_rem = do_div(duration, 1000); | 699 | unsigned long nsecs_rem = do_div(duration, 1000); |
| 761 | /* log10(ULONG_MAX) + '\0' */ | 700 | /* log10(ULONG_MAX) + '\0' */ |
| 762 | char msecs_str[21]; | 701 | char usecs_str[21]; |
| 763 | char nsecs_str[5]; | 702 | char nsecs_str[5]; |
| 764 | int ret, len; | 703 | int len; |
| 765 | int i; | 704 | int i; |
| 766 | 705 | ||
| 767 | sprintf(msecs_str, "%lu", (unsigned long) duration); | 706 | sprintf(usecs_str, "%lu", (unsigned long) duration); |
| 768 | 707 | ||
| 769 | /* Print msecs */ | 708 | /* Print msecs */ |
| 770 | ret = trace_seq_printf(s, "%s", msecs_str); | 709 | trace_seq_printf(s, "%s", usecs_str); |
| 771 | if (!ret) | ||
| 772 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 773 | 710 | ||
| 774 | len = strlen(msecs_str); | 711 | len = strlen(usecs_str); |
| 775 | 712 | ||
| 776 | /* Print nsecs (we don't want to exceed 7 numbers) */ | 713 | /* Print nsecs (we don't want to exceed 7 numbers) */ |
| 777 | if (len < 7) { | 714 | if (len < 7) { |
| 778 | size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); | 715 | size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); |
| 779 | 716 | ||
| 780 | snprintf(nsecs_str, slen, "%03lu", nsecs_rem); | 717 | snprintf(nsecs_str, slen, "%03lu", nsecs_rem); |
| 781 | ret = trace_seq_printf(s, ".%s", nsecs_str); | 718 | trace_seq_printf(s, ".%s", nsecs_str); |
| 782 | if (!ret) | ||
| 783 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 784 | len += strlen(nsecs_str); | 719 | len += strlen(nsecs_str); |
| 785 | } | 720 | } |
| 786 | 721 | ||
| 787 | ret = trace_seq_puts(s, " us "); | 722 | trace_seq_puts(s, " us "); |
| 788 | if (!ret) | ||
| 789 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 790 | 723 | ||
| 791 | /* Print remaining spaces to fit the row's width */ | 724 | /* Print remaining spaces to fit the row's width */ |
| 792 | for (i = len; i < 7; i++) { | 725 | for (i = len; i < 7; i++) |
| 793 | ret = trace_seq_putc(s, ' '); | 726 | trace_seq_putc(s, ' '); |
| 794 | if (!ret) | ||
| 795 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 796 | } | ||
| 797 | return TRACE_TYPE_HANDLED; | ||
| 798 | } | 727 | } |
| 799 | 728 | ||
| 800 | static enum print_line_t | 729 | static void |
| 801 | print_graph_duration(unsigned long long duration, struct trace_seq *s, | 730 | print_graph_duration(unsigned long long duration, struct trace_seq *s, |
| 802 | u32 flags) | 731 | u32 flags) |
| 803 | { | 732 | { |
| 804 | int ret = -1; | ||
| 805 | |||
| 806 | if (!(flags & TRACE_GRAPH_PRINT_DURATION) || | 733 | if (!(flags & TRACE_GRAPH_PRINT_DURATION) || |
| 807 | !(trace_flags & TRACE_ITER_CONTEXT_INFO)) | 734 | !(trace_flags & TRACE_ITER_CONTEXT_INFO)) |
| 808 | return TRACE_TYPE_HANDLED; | 735 | return; |
| 809 | 736 | ||
| 810 | /* No real data, just filling the column with spaces */ | 737 | /* No real data, just filling the column with spaces */ |

| 811 | switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { | 738 | switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { |
| 812 | case FLAGS_FILL_FULL: | 739 | case FLAGS_FILL_FULL: |
| 813 | ret = trace_seq_puts(s, " | "); | 740 | trace_seq_puts(s, " | "); |
| 814 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 741 | return; |
| 815 | case FLAGS_FILL_START: | 742 | case FLAGS_FILL_START: |
| 816 | ret = trace_seq_puts(s, " "); | 743 | trace_seq_puts(s, " "); |
| 817 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 744 | return; |
| 818 | case FLAGS_FILL_END: | 745 | case FLAGS_FILL_END: |
| 819 | ret = trace_seq_puts(s, " |"); | 746 | trace_seq_puts(s, " |"); |
| 820 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 747 | return; |
| 821 | } | 748 | } |
| 822 | 749 | ||
| 823 | /* Signal an overhead of time execution to the output */ | 750 | /* Signal an overhead of time execution to the output */ |
| 824 | if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { | 751 | if (flags & TRACE_GRAPH_PRINT_OVERHEAD) |
| 825 | /* Duration exceeded 100 msecs */ | 752 | trace_seq_printf(s, "%c ", trace_find_mark(duration)); |
| 826 | if (duration > 100000ULL) | 753 | else |
| 827 | ret = trace_seq_puts(s, "! "); | 754 | trace_seq_puts(s, " "); |
| 828 | /* Duration exceeded 10 msecs */ | ||
| 829 | else if (duration > 10000ULL) | ||
| 830 | ret = trace_seq_puts(s, "+ "); | ||
| 831 | } | ||
| 832 | |||
| 833 | /* | ||
| 834 | * The -1 means we either did not exceed the duration tresholds | ||
| 835 | * or we dont want to print out the overhead. Either way we need | ||
| 836 | * to fill out the space. | ||
| 837 | */ | ||
| 838 | if (ret == -1) | ||
| 839 | ret = trace_seq_puts(s, " "); | ||
| 840 | |||
| 841 | /* Catching here any failure happenned above */ | ||
| 842 | if (!ret) | ||
| 843 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 844 | |||
| 845 | ret = trace_print_graph_duration(duration, s); | ||
| 846 | if (ret != TRACE_TYPE_HANDLED) | ||
| 847 | return ret; | ||
| 848 | |||
| 849 | ret = trace_seq_puts(s, "| "); | ||
| 850 | if (!ret) | ||
| 851 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 852 | 755 | ||
| 853 | return TRACE_TYPE_HANDLED; | 756 | trace_print_graph_duration(duration, s); |
| 757 | trace_seq_puts(s, "| "); | ||
| 854 | } | 758 | } |
| 855 | 759 | ||
| 856 | /* Case of a leaf function on its call entry */ | 760 | /* Case of a leaf function on its call entry */ |
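In print_graph_duration() the removed open-coded overhead checks ('!' past one threshold, '+' past a smaller one, per the deleted branch) are replaced by a single trace_find_mark(duration) lookup. A hedged sketch of what such a helper amounts to; the thresholds and symbols below are illustrative and may not match the kernel's actual table:

static char duration_mark(unsigned long long duration_ns)
{
	static const struct { unsigned long long ns; char sym; } marks[] = {
		{ 100000000ULL, '@' },	/* 100 ms */
		{  10000000ULL, '*' },	/*  10 ms */
		{    100000ULL, '!' },	/* 100 us */
		{     10000ULL, '+' },	/*  10 us */
	};
	int i;

	for (i = 0; i < 4; i++)
		if (duration_ns > marks[i].ns)
			return marks[i].sym;
	return ' ';	/* below every threshold: plain padding */
}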
| @@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
| 864 | struct ftrace_graph_ret *graph_ret; | 768 | struct ftrace_graph_ret *graph_ret; |
| 865 | struct ftrace_graph_ent *call; | 769 | struct ftrace_graph_ent *call; |
| 866 | unsigned long long duration; | 770 | unsigned long long duration; |
| 867 | int ret; | ||
| 868 | int i; | 771 | int i; |
| 869 | 772 | ||
| 870 | graph_ret = &ret_entry->ret; | 773 | graph_ret = &ret_entry->ret; |
| @@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
| 890 | } | 793 | } |
| 891 | 794 | ||
| 892 | /* Overhead and duration */ | 795 | /* Overhead and duration */ |
| 893 | ret = print_graph_duration(duration, s, flags); | 796 | print_graph_duration(duration, s, flags); |
| 894 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 895 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 896 | 797 | ||
| 897 | /* Function */ | 798 | /* Function */ |
| 898 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { | 799 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) |
| 899 | ret = trace_seq_putc(s, ' '); | 800 | trace_seq_putc(s, ' '); |
| 900 | if (!ret) | ||
| 901 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 902 | } | ||
| 903 | 801 | ||
| 904 | ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); | 802 | trace_seq_printf(s, "%ps();\n", (void *)call->func); |
| 905 | if (!ret) | ||
| 906 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 907 | 803 | ||
| 908 | return TRACE_TYPE_HANDLED; | 804 | return trace_handle_return(s); |
| 909 | } | 805 | } |
| 910 | 806 | ||
| 911 | static enum print_line_t | 807 | static enum print_line_t |
| @@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
| 915 | { | 811 | { |
| 916 | struct ftrace_graph_ent *call = &entry->graph_ent; | 812 | struct ftrace_graph_ent *call = &entry->graph_ent; |
| 917 | struct fgraph_data *data = iter->private; | 813 | struct fgraph_data *data = iter->private; |
| 918 | int ret; | ||
| 919 | int i; | 814 | int i; |
| 920 | 815 | ||
| 921 | if (data) { | 816 | if (data) { |
| @@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
| 931 | } | 826 | } |
| 932 | 827 | ||
| 933 | /* No time */ | 828 | /* No time */ |
| 934 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); | 829 | print_graph_duration(0, s, flags | FLAGS_FILL_FULL); |
| 935 | if (ret != TRACE_TYPE_HANDLED) | ||
| 936 | return ret; | ||
| 937 | 830 | ||
| 938 | /* Function */ | 831 | /* Function */ |
| 939 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { | 832 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) |
| 940 | ret = trace_seq_putc(s, ' '); | 833 | trace_seq_putc(s, ' '); |
| 941 | if (!ret) | 834 | |
| 942 | return TRACE_TYPE_PARTIAL_LINE; | 835 | trace_seq_printf(s, "%ps() {\n", (void *)call->func); |
| 943 | } | ||
| 944 | 836 | ||
| 945 | ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); | 837 | if (trace_seq_has_overflowed(s)) |
| 946 | if (!ret) | ||
| 947 | return TRACE_TYPE_PARTIAL_LINE; | 838 | return TRACE_TYPE_PARTIAL_LINE; |
| 948 | 839 | ||
| 949 | /* | 840 | /* |
| @@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
| 953 | return TRACE_TYPE_NO_CONSUME; | 844 | return TRACE_TYPE_NO_CONSUME; |
| 954 | } | 845 | } |
| 955 | 846 | ||
| 956 | static enum print_line_t | 847 | static void |
| 957 | print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, | 848 | print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, |
| 958 | int type, unsigned long addr, u32 flags) | 849 | int type, unsigned long addr, u32 flags) |
| 959 | { | 850 | { |
| 960 | struct fgraph_data *data = iter->private; | 851 | struct fgraph_data *data = iter->private; |
| 961 | struct trace_entry *ent = iter->ent; | 852 | struct trace_entry *ent = iter->ent; |
| 962 | int cpu = iter->cpu; | 853 | int cpu = iter->cpu; |
| 963 | int ret; | ||
| 964 | 854 | ||
| 965 | /* Pid */ | 855 | /* Pid */ |
| 966 | if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) | 856 | verif_pid(s, ent->pid, cpu, data); |
| 967 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 968 | 857 | ||
| 969 | if (type) { | 858 | if (type) |
| 970 | /* Interrupt */ | 859 | /* Interrupt */ |
| 971 | ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); | 860 | print_graph_irq(iter, addr, type, cpu, ent->pid, flags); |
| 972 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 973 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 974 | } | ||
| 975 | 861 | ||
| 976 | if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) | 862 | if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) |
| 977 | return 0; | 863 | return; |
| 978 | 864 | ||
| 979 | /* Absolute time */ | 865 | /* Absolute time */ |
| 980 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { | 866 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) |
| 981 | ret = print_graph_abs_time(iter->ts, s); | 867 | print_graph_abs_time(iter->ts, s); |
| 982 | if (!ret) | ||
| 983 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 984 | } | ||
| 985 | 868 | ||
| 986 | /* Cpu */ | 869 | /* Cpu */ |
| 987 | if (flags & TRACE_GRAPH_PRINT_CPU) { | 870 | if (flags & TRACE_GRAPH_PRINT_CPU) |
| 988 | ret = print_graph_cpu(s, cpu); | 871 | print_graph_cpu(s, cpu); |
| 989 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 990 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 991 | } | ||
| 992 | 872 | ||
| 993 | /* Proc */ | 873 | /* Proc */ |
| 994 | if (flags & TRACE_GRAPH_PRINT_PROC) { | 874 | if (flags & TRACE_GRAPH_PRINT_PROC) { |
| 995 | ret = print_graph_proc(s, ent->pid); | 875 | print_graph_proc(s, ent->pid); |
| 996 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 876 | trace_seq_puts(s, " | "); |
| 997 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 998 | |||
| 999 | ret = trace_seq_puts(s, " | "); | ||
| 1000 | if (!ret) | ||
| 1001 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1002 | } | 877 | } |
| 1003 | 878 | ||
| 1004 | /* Latency format */ | 879 | /* Latency format */ |
| 1005 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | 880 | if (trace_flags & TRACE_ITER_LATENCY_FMT) |
| 1006 | ret = print_graph_lat_fmt(s, ent); | 881 | print_graph_lat_fmt(s, ent); |
| 1007 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 1008 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1009 | } | ||
| 1010 | 882 | ||
| 1011 | return 0; | 883 | return; |
| 1012 | } | 884 | } |
| 1013 | 885 | ||
| 1014 | /* | 886 | /* |
| @@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, | |||
| 1126 | if (check_irq_entry(iter, flags, call->func, call->depth)) | 998 | if (check_irq_entry(iter, flags, call->func, call->depth)) |
| 1127 | return TRACE_TYPE_HANDLED; | 999 | return TRACE_TYPE_HANDLED; |
| 1128 | 1000 | ||
| 1129 | if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) | 1001 | print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); |
| 1130 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1131 | 1002 | ||
| 1132 | leaf_ret = get_return_for_leaf(iter, field); | 1003 | leaf_ret = get_return_for_leaf(iter, field); |
| 1133 | if (leaf_ret) | 1004 | if (leaf_ret) |
| @@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
| 1160 | pid_t pid = ent->pid; | 1031 | pid_t pid = ent->pid; |
| 1161 | int cpu = iter->cpu; | 1032 | int cpu = iter->cpu; |
| 1162 | int func_match = 1; | 1033 | int func_match = 1; |
| 1163 | int ret; | ||
| 1164 | int i; | 1034 | int i; |
| 1165 | 1035 | ||
| 1166 | if (check_irq_return(iter, flags, trace->depth)) | 1036 | if (check_irq_return(iter, flags, trace->depth)) |
| @@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
| 1186 | } | 1056 | } |
| 1187 | } | 1057 | } |
| 1188 | 1058 | ||
| 1189 | if (print_graph_prologue(iter, s, 0, 0, flags)) | 1059 | print_graph_prologue(iter, s, 0, 0, flags); |
| 1190 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1191 | 1060 | ||
| 1192 | /* Overhead and duration */ | 1061 | /* Overhead and duration */ |
| 1193 | ret = print_graph_duration(duration, s, flags); | 1062 | print_graph_duration(duration, s, flags); |
| 1194 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 1195 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1196 | 1063 | ||
| 1197 | /* Closing brace */ | 1064 | /* Closing brace */ |
| 1198 | for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { | 1065 | for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) |
| 1199 | ret = trace_seq_putc(s, ' '); | 1066 | trace_seq_putc(s, ' '); |
| 1200 | if (!ret) | ||
| 1201 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1202 | } | ||
| 1203 | 1067 | ||
| 1204 | /* | 1068 | /* |
| 1205 | * If the return function does not have a matching entry, | 1069 | * If the return function does not have a matching entry, |
| @@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
| 1208 | * belongs to, write out the function name. Always do | 1072 | * belongs to, write out the function name. Always do |
| 1209 | * that if the funcgraph-tail option is enabled. | 1073 | * that if the funcgraph-tail option is enabled. |
| 1210 | */ | 1074 | */ |
| 1211 | if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { | 1075 | if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) |
| 1212 | ret = trace_seq_puts(s, "}\n"); | 1076 | trace_seq_puts(s, "}\n"); |
| 1213 | if (!ret) | 1077 | else |
| 1214 | return TRACE_TYPE_PARTIAL_LINE; | 1078 | trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); |
| 1215 | } else { | ||
| 1216 | ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); | ||
| 1217 | if (!ret) | ||
| 1218 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1219 | } | ||
| 1220 | 1079 | ||
| 1221 | /* Overrun */ | 1080 | /* Overrun */ |
| 1222 | if (flags & TRACE_GRAPH_PRINT_OVERRUN) { | 1081 | if (flags & TRACE_GRAPH_PRINT_OVERRUN) |
| 1223 | ret = trace_seq_printf(s, " (Overruns: %lu)\n", | 1082 | trace_seq_printf(s, " (Overruns: %lu)\n", |
| 1224 | trace->overrun); | 1083 | trace->overrun); |
| 1225 | if (!ret) | ||
| 1226 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1227 | } | ||
| 1228 | 1084 | ||
| 1229 | ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, | 1085 | print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, |
| 1230 | cpu, pid, flags); | 1086 | cpu, pid, flags); |
| 1231 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 1232 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1233 | 1087 | ||
| 1234 | return TRACE_TYPE_HANDLED; | 1088 | return trace_handle_return(s); |
| 1235 | } | 1089 | } |
| 1236 | 1090 | ||
| 1237 | static enum print_line_t | 1091 | static enum print_line_t |
| @@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
| 1248 | if (data) | 1102 | if (data) |
| 1249 | depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; | 1103 | depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; |
| 1250 | 1104 | ||
| 1251 | if (print_graph_prologue(iter, s, 0, 0, flags)) | 1105 | print_graph_prologue(iter, s, 0, 0, flags); |
| 1252 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1253 | 1106 | ||
| 1254 | /* No time */ | 1107 | /* No time */ |
| 1255 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); | 1108 | print_graph_duration(0, s, flags | FLAGS_FILL_FULL); |
| 1256 | if (ret != TRACE_TYPE_HANDLED) | ||
| 1257 | return ret; | ||
| 1258 | 1109 | ||
| 1259 | /* Indentation */ | 1110 | /* Indentation */ |
| 1260 | if (depth > 0) | 1111 | if (depth > 0) |
| 1261 | for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { | 1112 | for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) |
| 1262 | ret = trace_seq_putc(s, ' '); | 1113 | trace_seq_putc(s, ' '); |
| 1263 | if (!ret) | ||
| 1264 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1265 | } | ||
| 1266 | 1114 | ||
| 1267 | /* The comment */ | 1115 | /* The comment */ |
| 1268 | ret = trace_seq_puts(s, "/* "); | 1116 | trace_seq_puts(s, "/* "); |
| 1269 | if (!ret) | ||
| 1270 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1271 | 1117 | ||
| 1272 | switch (iter->ent->type) { | 1118 | switch (iter->ent->type) { |
| 1273 | case TRACE_BPRINT: | 1119 | case TRACE_BPRINT: |
| @@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
| 1290 | return ret; | 1136 | return ret; |
| 1291 | } | 1137 | } |
| 1292 | 1138 | ||
| 1139 | if (trace_seq_has_overflowed(s)) | ||
| 1140 | goto out; | ||
| 1141 | |||
| 1293 | /* Strip ending newline */ | 1142 | /* Strip ending newline */ |
| 1294 | if (s->buffer[s->len - 1] == '\n') { | 1143 | if (s->buffer[s->seq.len - 1] == '\n') { |
| 1295 | s->buffer[s->len - 1] = '\0'; | 1144 | s->buffer[s->seq.len - 1] = '\0'; |
| 1296 | s->len--; | 1145 | s->seq.len--; |
| 1297 | } | 1146 | } |
| 1298 | 1147 | ||
| 1299 | ret = trace_seq_puts(s, " */\n"); | 1148 | trace_seq_puts(s, " */\n"); |
| 1300 | if (!ret) | 1149 | out: |
| 1301 | return TRACE_TYPE_PARTIAL_LINE; | 1150 | return trace_handle_return(s); |
| 1302 | |||
| 1303 | return TRACE_TYPE_HANDLED; | ||
| 1304 | } | 1151 | } |
| 1305 | 1152 | ||
| 1306 | 1153 | ||
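print_graph_comment() (like print_graph_entry_leaf() and print_graph_return() earlier) now funnels every exit through trace_handle_return(s). Assuming that helper is just a wrapper over the latched overflow state, it amounts to something like:

static inline enum print_line_t handle_return_sketch(struct trace_seq *s)
{
	return trace_seq_has_overflowed(s) ?
		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
}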
| @@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) | |||
| 1407 | print_lat_header(s, flags); | 1254 | print_lat_header(s, flags); |
| 1408 | 1255 | ||
| 1409 | /* 1st line */ | 1256 | /* 1st line */ |
| 1410 | seq_printf(s, "#"); | 1257 | seq_putc(s, '#'); |
| 1411 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) | 1258 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) |
| 1412 | seq_printf(s, " TIME "); | 1259 | seq_puts(s, " TIME "); |
| 1413 | if (flags & TRACE_GRAPH_PRINT_CPU) | 1260 | if (flags & TRACE_GRAPH_PRINT_CPU) |
| 1414 | seq_printf(s, " CPU"); | 1261 | seq_puts(s, " CPU"); |
| 1415 | if (flags & TRACE_GRAPH_PRINT_PROC) | 1262 | if (flags & TRACE_GRAPH_PRINT_PROC) |
| 1416 | seq_printf(s, " TASK/PID "); | 1263 | seq_puts(s, " TASK/PID "); |
| 1417 | if (lat) | 1264 | if (lat) |
| 1418 | seq_printf(s, "||||"); | 1265 | seq_puts(s, "||||"); |
| 1419 | if (flags & TRACE_GRAPH_PRINT_DURATION) | 1266 | if (flags & TRACE_GRAPH_PRINT_DURATION) |
| 1420 | seq_printf(s, " DURATION "); | 1267 | seq_puts(s, " DURATION "); |
| 1421 | seq_printf(s, " FUNCTION CALLS\n"); | 1268 | seq_puts(s, " FUNCTION CALLS\n"); |
| 1422 | 1269 | ||
| 1423 | /* 2nd line */ | 1270 | /* 2nd line */ |
| 1424 | seq_printf(s, "#"); | 1271 | seq_putc(s, '#'); |
| 1425 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) | 1272 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) |
| 1426 | seq_printf(s, " | "); | 1273 | seq_puts(s, " | "); |
| 1427 | if (flags & TRACE_GRAPH_PRINT_CPU) | 1274 | if (flags & TRACE_GRAPH_PRINT_CPU) |
| 1428 | seq_printf(s, " | "); | 1275 | seq_puts(s, " | "); |
| 1429 | if (flags & TRACE_GRAPH_PRINT_PROC) | 1276 | if (flags & TRACE_GRAPH_PRINT_PROC) |
| 1430 | seq_printf(s, " | | "); | 1277 | seq_puts(s, " | | "); |
| 1431 | if (lat) | 1278 | if (lat) |
| 1432 | seq_printf(s, "||||"); | 1279 | seq_puts(s, "||||"); |
| 1433 | if (flags & TRACE_GRAPH_PRINT_DURATION) | 1280 | if (flags & TRACE_GRAPH_PRINT_DURATION) |
| 1434 | seq_printf(s, " | | "); | 1281 | seq_puts(s, " | | "); |
| 1435 | seq_printf(s, " | | | |\n"); | 1282 | seq_puts(s, " | | | |\n"); |
| 1436 | } | 1283 | } |
| 1437 | 1284 | ||
| 1438 | static void print_graph_headers(struct seq_file *s) | 1285 | static void print_graph_headers(struct seq_file *s) |
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index bd90e1b06088..3ccf5c2c1320 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c | |||
| @@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) | |||
| 20 | { | 20 | { |
| 21 | /* use static because iter can be a bit big for the stack */ | 21 | /* use static because iter can be a bit big for the stack */ |
| 22 | static struct trace_iterator iter; | 22 | static struct trace_iterator iter; |
| 23 | static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; | ||
| 23 | unsigned int old_userobj; | 24 | unsigned int old_userobj; |
| 24 | int cnt = 0, cpu; | 25 | int cnt = 0, cpu; |
| 25 | 26 | ||
| 26 | trace_init_global_iter(&iter); | 27 | trace_init_global_iter(&iter); |
| 28 | iter.buffer_iter = buffer_iter; | ||
| 27 | 29 | ||
| 28 | for_each_tracing_cpu(cpu) { | 30 | for_each_tracing_cpu(cpu) { |
| 29 | atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); | 31 | atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); |
| @@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) | |||
| 57 | ring_buffer_read_start(iter.buffer_iter[cpu_file]); | 59 | ring_buffer_read_start(iter.buffer_iter[cpu_file]); |
| 58 | tracing_iter_reset(&iter, cpu_file); | 60 | tracing_iter_reset(&iter, cpu_file); |
| 59 | } | 61 | } |
| 60 | if (!trace_empty(&iter)) | 62 | |
| 61 | trace_find_next_entry_inc(&iter); | 63 | while (trace_find_next_entry_inc(&iter)) { |
| 62 | while (!trace_empty(&iter)) { | ||
| 63 | if (!cnt) | 64 | if (!cnt) |
| 64 | kdb_printf("---------------------------------\n"); | 65 | kdb_printf("---------------------------------\n"); |
| 65 | cnt++; | 66 | cnt++; |
| 66 | 67 | ||
| 67 | if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) | 68 | if (!skip_lines) { |
| 68 | print_trace_line(&iter); | 69 | print_trace_line(&iter); |
| 69 | if (!skip_lines) | ||
| 70 | trace_printk_seq(&iter.seq); | 70 | trace_printk_seq(&iter.seq); |
| 71 | else | 71 | } else { |
| 72 | skip_lines--; | 72 | skip_lines--; |
| 73 | } | ||
| 74 | |||
| 73 | if (KDB_FLAG(CMD_INTERRUPT)) | 75 | if (KDB_FLAG(CMD_INTERRUPT)) |
| 74 | goto out; | 76 | goto out; |
| 75 | } | 77 | } |
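The kdb dump loop above also changes shape: instead of a trace_empty() test plus a separate advance (which could miscount around the last record), the iterator is advanced in the loop condition and each returned entry is either printed or counted against skip_lines. A minimal userspace illustration of that advance-in-the-condition shape (the array "iterator" is a stand-in for the trace iterator):

#include <stdio.h>

struct it { const char **pos; };

static const char *it_next(struct it *it)
{
	return *it->pos ? *it->pos++ : NULL;	/* NULL marks the end */
}

int main(void)
{
	static const char *lines[] = { "a", "b", "c", "d", NULL };
	struct it it = { lines };
	int skip_lines = 2;
	const char *line;

	while ((line = it_next(&it)) != NULL) {
		if (skip_lines) {	/* skipped entries are still consumed */
			skip_lines--;
			continue;
		}
		puts(line);		/* prints "c" then "d" */
	}
	return 0;
}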
| @@ -86,9 +88,12 @@ out: | |||
| 86 | atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); | 88 | atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); |
| 87 | } | 89 | } |
| 88 | 90 | ||
| 89 | for_each_tracing_cpu(cpu) | 91 | for_each_tracing_cpu(cpu) { |
| 90 | if (iter.buffer_iter[cpu]) | 92 | if (iter.buffer_iter[cpu]) { |
| 91 | ring_buffer_read_finish(iter.buffer_iter[cpu]); | 93 | ring_buffer_read_finish(iter.buffer_iter[cpu]); |
| 94 | iter.buffer_iter[cpu] = NULL; | ||
| 95 | } | ||
| 96 | } | ||
| 92 | } | 97 | } |
| 93 | 98 | ||
| 94 | /* | 99 | /* |
| @@ -127,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv) | |||
| 127 | 132 | ||
| 128 | static __init int kdb_ftrace_register(void) | 133 | static __init int kdb_ftrace_register(void) |
| 129 | { | 134 | { |
| 130 | kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", | 135 | kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", |
| 131 | "Dump ftrace log", 0, KDB_REPEAT_NONE); | 136 | "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); |
| 132 | return 0; | 137 | return 0; |
| 133 | } | 138 | } |
| 134 | 139 | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 282f6e4e5539..296079ae6583 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v) | |||
| 826 | struct trace_kprobe *tk = v; | 826 | struct trace_kprobe *tk = v; |
| 827 | int i; | 827 | int i; |
| 828 | 828 | ||
| 829 | seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); | 829 | seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p'); |
| 830 | seq_printf(m, ":%s/%s", tk->tp.call.class->system, | 830 | seq_printf(m, ":%s/%s", tk->tp.call.class->system, |
| 831 | ftrace_event_name(&tk->tp.call)); | 831 | ftrace_event_name(&tk->tp.call)); |
| 832 | 832 | ||
| @@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v) | |||
| 840 | 840 | ||
| 841 | for (i = 0; i < tk->tp.nr_args; i++) | 841 | for (i = 0; i < tk->tp.nr_args; i++) |
| 842 | seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); | 842 | seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); |
| 843 | seq_printf(m, "\n"); | 843 | seq_putc(m, '\n'); |
| 844 | 844 | ||
| 845 | return 0; | 845 | return 0; |
| 846 | } | 846 | } |
| @@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags, | |||
| 1024 | field = (struct kprobe_trace_entry_head *)iter->ent; | 1024 | field = (struct kprobe_trace_entry_head *)iter->ent; |
| 1025 | tp = container_of(event, struct trace_probe, call.event); | 1025 | tp = container_of(event, struct trace_probe, call.event); |
| 1026 | 1026 | ||
| 1027 | if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) | 1027 | trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); |
| 1028 | goto partial; | ||
| 1029 | 1028 | ||
| 1030 | if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) | 1029 | if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) |
| 1031 | goto partial; | 1030 | goto out; |
| 1032 | 1031 | ||
| 1033 | if (!trace_seq_puts(s, ")")) | 1032 | trace_seq_putc(s, ')'); |
| 1034 | goto partial; | ||
| 1035 | 1033 | ||
| 1036 | data = (u8 *)&field[1]; | 1034 | data = (u8 *)&field[1]; |
| 1037 | for (i = 0; i < tp->nr_args; i++) | 1035 | for (i = 0; i < tp->nr_args; i++) |
| 1038 | if (!tp->args[i].type->print(s, tp->args[i].name, | 1036 | if (!tp->args[i].type->print(s, tp->args[i].name, |
| 1039 | data + tp->args[i].offset, field)) | 1037 | data + tp->args[i].offset, field)) |
| 1040 | goto partial; | 1038 | goto out; |
| 1041 | |||
| 1042 | if (!trace_seq_puts(s, "\n")) | ||
| 1043 | goto partial; | ||
| 1044 | 1039 | ||
| 1045 | return TRACE_TYPE_HANDLED; | 1040 | trace_seq_putc(s, '\n'); |
| 1046 | partial: | 1041 | out: |
| 1047 | return TRACE_TYPE_PARTIAL_LINE; | 1042 | return trace_handle_return(s); |
| 1048 | } | 1043 | } |
| 1049 | 1044 | ||
| 1050 | static enum print_line_t | 1045 | static enum print_line_t |
| @@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, | |||
| 1060 | field = (struct kretprobe_trace_entry_head *)iter->ent; | 1055 | field = (struct kretprobe_trace_entry_head *)iter->ent; |
| 1061 | tp = container_of(event, struct trace_probe, call.event); | 1056 | tp = container_of(event, struct trace_probe, call.event); |
| 1062 | 1057 | ||
| 1063 | if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) | 1058 | trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); |
| 1064 | goto partial; | ||
| 1065 | 1059 | ||
| 1066 | if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) | 1060 | if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) |
| 1067 | goto partial; | 1061 | goto out; |
| 1068 | 1062 | ||
| 1069 | if (!trace_seq_puts(s, " <- ")) | 1063 | trace_seq_puts(s, " <- "); |
| 1070 | goto partial; | ||
| 1071 | 1064 | ||
| 1072 | if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) | 1065 | if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) |
| 1073 | goto partial; | 1066 | goto out; |
| 1074 | 1067 | ||
| 1075 | if (!trace_seq_puts(s, ")")) | 1068 | trace_seq_putc(s, ')'); |
| 1076 | goto partial; | ||
| 1077 | 1069 | ||
| 1078 | data = (u8 *)&field[1]; | 1070 | data = (u8 *)&field[1]; |
| 1079 | for (i = 0; i < tp->nr_args; i++) | 1071 | for (i = 0; i < tp->nr_args; i++) |
| 1080 | if (!tp->args[i].type->print(s, tp->args[i].name, | 1072 | if (!tp->args[i].type->print(s, tp->args[i].name, |
| 1081 | data + tp->args[i].offset, field)) | 1073 | data + tp->args[i].offset, field)) |
| 1082 | goto partial; | 1074 | goto out; |
| 1083 | 1075 | ||
| 1084 | if (!trace_seq_puts(s, "\n")) | 1076 | trace_seq_putc(s, '\n'); |
| 1085 | goto partial; | ||
| 1086 | 1077 | ||
| 1087 | return TRACE_TYPE_HANDLED; | 1078 | out: |
| 1088 | partial: | 1079 | return trace_handle_return(s); |
| 1089 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1090 | } | 1080 | } |
| 1091 | 1081 | ||
| 1092 | 1082 | ||
| @@ -1158,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | |||
| 1158 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1148 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
| 1159 | size -= sizeof(u32); | 1149 | size -= sizeof(u32); |
| 1160 | 1150 | ||
| 1161 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1151 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
| 1162 | if (!entry) | 1152 | if (!entry) |
| 1163 | return; | 1153 | return; |
| 1164 | 1154 | ||
| @@ -1189,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | |||
| 1189 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1179 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
| 1190 | size -= sizeof(u32); | 1180 | size -= sizeof(u32); |
| 1191 | 1181 | ||
| 1192 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1182 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
| 1193 | if (!entry) | 1183 | if (!entry) |
| 1194 | return; | 1184 | return; |
| 1195 | 1185 | ||
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0abd9b863474..7a9ba62e9fef 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c | |||
| @@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr) | |||
| 59 | mmio_reset_data(tr); | 59 | mmio_reset_data(tr); |
| 60 | } | 60 | } |
| 61 | 61 | ||
| 62 | static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) | 62 | static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) |
| 63 | { | 63 | { |
| 64 | int ret = 0; | ||
| 65 | int i; | 64 | int i; |
| 66 | resource_size_t start, end; | 65 | resource_size_t start, end; |
| 67 | const struct pci_driver *drv = pci_dev_driver(dev); | 66 | const struct pci_driver *drv = pci_dev_driver(dev); |
| 68 | 67 | ||
| 69 | /* XXX: incomplete checks for trace_seq_printf() return value */ | 68 | trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", |
| 70 | ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", | 69 | dev->bus->number, dev->devfn, |
| 71 | dev->bus->number, dev->devfn, | 70 | dev->vendor, dev->device, dev->irq); |
| 72 | dev->vendor, dev->device, dev->irq); | ||
| 73 | /* | 71 | /* |
| 74 | * XXX: is pci_resource_to_user() appropriate, since we are | 72 | * XXX: is pci_resource_to_user() appropriate, since we are |
| 75 | * supposed to interpret the __ioremap() phys_addr argument based on | 73 | * supposed to interpret the __ioremap() phys_addr argument based on |
| @@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) | |||
| 77 | */ | 75 | */ |
| 78 | for (i = 0; i < 7; i++) { | 76 | for (i = 0; i < 7; i++) { |
| 79 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); | 77 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); |
| 80 | ret += trace_seq_printf(s, " %llx", | 78 | trace_seq_printf(s, " %llx", |
| 81 | (unsigned long long)(start | | 79 | (unsigned long long)(start | |
| 82 | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); | 80 | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); |
| 83 | } | 81 | } |
| 84 | for (i = 0; i < 7; i++) { | 82 | for (i = 0; i < 7; i++) { |
| 85 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); | 83 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); |
| 86 | ret += trace_seq_printf(s, " %llx", | 84 | trace_seq_printf(s, " %llx", |
| 87 | dev->resource[i].start < dev->resource[i].end ? | 85 | dev->resource[i].start < dev->resource[i].end ? |
| 88 | (unsigned long long)(end - start) + 1 : 0); | 86 | (unsigned long long)(end - start) + 1 : 0); |
| 89 | } | 87 | } |
| 90 | if (drv) | 88 | if (drv) |
| 91 | ret += trace_seq_printf(s, " %s\n", drv->name); | 89 | trace_seq_printf(s, " %s\n", drv->name); |
| 92 | else | 90 | else |
| 93 | ret += trace_seq_puts(s, " \n"); | 91 | trace_seq_puts(s, " \n"); |
| 94 | return ret; | ||
| 95 | } | 92 | } |
| 96 | 93 | ||
| 97 | static void destroy_header_iter(struct header_iter *hiter) | 94 | static void destroy_header_iter(struct header_iter *hiter) |
| @@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) | |||
| 179 | unsigned long long t = ns2usecs(iter->ts); | 176 | unsigned long long t = ns2usecs(iter->ts); |
| 180 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 177 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); |
| 181 | unsigned secs = (unsigned long)t; | 178 | unsigned secs = (unsigned long)t; |
| 182 | int ret = 1; | ||
| 183 | 179 | ||
| 184 | trace_assign_type(field, entry); | 180 | trace_assign_type(field, entry); |
| 185 | rw = &field->rw; | 181 | rw = &field->rw; |
| 186 | 182 | ||
| 187 | switch (rw->opcode) { | 183 | switch (rw->opcode) { |
| 188 | case MMIO_READ: | 184 | case MMIO_READ: |
| 189 | ret = trace_seq_printf(s, | 185 | trace_seq_printf(s, |
| 190 | "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", | 186 | "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", |
| 191 | rw->width, secs, usec_rem, rw->map_id, | 187 | rw->width, secs, usec_rem, rw->map_id, |
| 192 | (unsigned long long)rw->phys, | 188 | (unsigned long long)rw->phys, |
| 193 | rw->value, rw->pc, 0); | 189 | rw->value, rw->pc, 0); |
| 194 | break; | 190 | break; |
| 195 | case MMIO_WRITE: | 191 | case MMIO_WRITE: |
| 196 | ret = trace_seq_printf(s, | 192 | trace_seq_printf(s, |
| 197 | "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", | 193 | "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", |
| 198 | rw->width, secs, usec_rem, rw->map_id, | 194 | rw->width, secs, usec_rem, rw->map_id, |
| 199 | (unsigned long long)rw->phys, | 195 | (unsigned long long)rw->phys, |
| 200 | rw->value, rw->pc, 0); | 196 | rw->value, rw->pc, 0); |
| 201 | break; | 197 | break; |
| 202 | case MMIO_UNKNOWN_OP: | 198 | case MMIO_UNKNOWN_OP: |
| 203 | ret = trace_seq_printf(s, | 199 | trace_seq_printf(s, |
| 204 | "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," | 200 | "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," |
| 205 | "%02lx 0x%lx %d\n", | 201 | "%02lx 0x%lx %d\n", |
| 206 | secs, usec_rem, rw->map_id, | 202 | secs, usec_rem, rw->map_id, |
| @@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) | |||
| 209 | (rw->value >> 0) & 0xff, rw->pc, 0); | 205 | (rw->value >> 0) & 0xff, rw->pc, 0); |
| 210 | break; | 206 | break; |
| 211 | default: | 207 | default: |
| 212 | ret = trace_seq_puts(s, "rw what?\n"); | 208 | trace_seq_puts(s, "rw what?\n"); |
| 213 | break; | 209 | break; |
| 214 | } | 210 | } |
| 215 | if (ret) | 211 | |
| 216 | return TRACE_TYPE_HANDLED; | 212 | return trace_handle_return(s); |
| 217 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 218 | } | 213 | } |
| 219 | 214 | ||
| 220 | static enum print_line_t mmio_print_map(struct trace_iterator *iter) | 215 | static enum print_line_t mmio_print_map(struct trace_iterator *iter) |
| @@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) | |||
| 226 | unsigned long long t = ns2usecs(iter->ts); | 221 | unsigned long long t = ns2usecs(iter->ts); |
| 227 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 222 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); |
| 228 | unsigned secs = (unsigned long)t; | 223 | unsigned secs = (unsigned long)t; |
| 229 | int ret; | ||
| 230 | 224 | ||
| 231 | trace_assign_type(field, entry); | 225 | trace_assign_type(field, entry); |
| 232 | m = &field->map; | 226 | m = &field->map; |
| 233 | 227 | ||
| 234 | switch (m->opcode) { | 228 | switch (m->opcode) { |
| 235 | case MMIO_PROBE: | 229 | case MMIO_PROBE: |
| 236 | ret = trace_seq_printf(s, | 230 | trace_seq_printf(s, |
| 237 | "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", | 231 | "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", |
| 238 | secs, usec_rem, m->map_id, | 232 | secs, usec_rem, m->map_id, |
| 239 | (unsigned long long)m->phys, m->virt, m->len, | 233 | (unsigned long long)m->phys, m->virt, m->len, |
| 240 | 0UL, 0); | 234 | 0UL, 0); |
| 241 | break; | 235 | break; |
| 242 | case MMIO_UNPROBE: | 236 | case MMIO_UNPROBE: |
| 243 | ret = trace_seq_printf(s, | 237 | trace_seq_printf(s, |
| 244 | "UNMAP %u.%06lu %d 0x%lx %d\n", | 238 | "UNMAP %u.%06lu %d 0x%lx %d\n", |
| 245 | secs, usec_rem, m->map_id, 0UL, 0); | 239 | secs, usec_rem, m->map_id, 0UL, 0); |
| 246 | break; | 240 | break; |
| 247 | default: | 241 | default: |
| 248 | ret = trace_seq_puts(s, "map what?\n"); | 242 | trace_seq_puts(s, "map what?\n"); |
| 249 | break; | 243 | break; |
| 250 | } | 244 | } |
| 251 | if (ret) | 245 | |
| 252 | return TRACE_TYPE_HANDLED; | 246 | return trace_handle_return(s); |
| 253 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 254 | } | 247 | } |
| 255 | 248 | ||
| 256 | static enum print_line_t mmio_print_mark(struct trace_iterator *iter) | 249 | static enum print_line_t mmio_print_mark(struct trace_iterator *iter) |
| @@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) | |||
| 262 | unsigned long long t = ns2usecs(iter->ts); | 255 | unsigned long long t = ns2usecs(iter->ts); |
| 263 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 256 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); |
| 264 | unsigned secs = (unsigned long)t; | 257 | unsigned secs = (unsigned long)t; |
| 265 | int ret; | ||
| 266 | 258 | ||
| 267 | /* The trailing newline must be in the message. */ | 259 | /* The trailing newline must be in the message. */ |
| 268 | ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); | 260 | trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); |
| 269 | if (!ret) | ||
| 270 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 271 | 261 | ||
| 272 | return TRACE_TYPE_HANDLED; | 262 | return trace_handle_return(s); |
| 273 | } | 263 | } |
| 274 | 264 | ||
| 275 | static enum print_line_t mmio_print_line(struct trace_iterator *iter) | 265 | static enum print_line_t mmio_print_line(struct trace_iterator *iter) |
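The mmiotrace printers above drop their accumulated ret values entirely. The working assumption behind the conversion is that once a trace_seq write does not fit, the sequence is marked overflowed and later writes are quietly dropped, so loops such as the PCI resource dump need no per-iteration checks; a single check at the end (or an optional early break, as a couple of printers in trace_output.c below do) decides the outcome. A toy fixed-size writer sketching that behavior, not the kernel implementation:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct toy_seq {
	char buf[32];
	size_t len;
	bool full;	/* set on the first write that does not fit */
};

static void toy_puts(struct toy_seq *s, const char *str)
{
	size_t n = strlen(str);

	if (s->full)
		return;				/* later writes are no-ops */
	if (n > sizeof(s->buf) - 1 - s->len) {
		s->full = true;			/* nothing partial is kept */
		return;
	}
	memcpy(s->buf + s->len, str, n);
	s->len += n;
	s->buf[s->len] = '\0';
}

int main(void)
{
	struct toy_seq s = { .len = 0, .full = false };
	char item[16];
	int i;

	for (i = 0; i < 10; i++) {		/* no per-iteration checks */
		snprintf(item, sizeof(item), " res%d", i);
		toy_puts(&s, item);
	}

	/* One check at the end decides handled vs. partial. */
	printf("%s -> %s\n", s.buf, s.full ? "partial" : "handled");
	return 0;
}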
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c6977d5a9b12..b77b9a697619 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) | |||
| 25 | struct trace_seq *s = &iter->seq; | 25 | struct trace_seq *s = &iter->seq; |
| 26 | struct trace_entry *entry = iter->ent; | 26 | struct trace_entry *entry = iter->ent; |
| 27 | struct bputs_entry *field; | 27 | struct bputs_entry *field; |
| 28 | int ret; | ||
| 29 | 28 | ||
| 30 | trace_assign_type(field, entry); | 29 | trace_assign_type(field, entry); |
| 31 | 30 | ||
| 32 | ret = trace_seq_puts(s, field->str); | 31 | trace_seq_puts(s, field->str); |
| 33 | if (!ret) | ||
| 34 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 35 | 32 | ||
| 36 | return TRACE_TYPE_HANDLED; | 33 | return trace_handle_return(s); |
| 37 | } | 34 | } |
| 38 | 35 | ||
| 39 | enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) | 36 | enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) |
| @@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) | |||
| 41 | struct trace_seq *s = &iter->seq; | 38 | struct trace_seq *s = &iter->seq; |
| 42 | struct trace_entry *entry = iter->ent; | 39 | struct trace_entry *entry = iter->ent; |
| 43 | struct bprint_entry *field; | 40 | struct bprint_entry *field; |
| 44 | int ret; | ||
| 45 | 41 | ||
| 46 | trace_assign_type(field, entry); | 42 | trace_assign_type(field, entry); |
| 47 | 43 | ||
| 48 | ret = trace_seq_bprintf(s, field->fmt, field->buf); | 44 | trace_seq_bprintf(s, field->fmt, field->buf); |
| 49 | if (!ret) | ||
| 50 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 51 | 45 | ||
| 52 | return TRACE_TYPE_HANDLED; | 46 | return trace_handle_return(s); |
| 53 | } | 47 | } |
| 54 | 48 | ||
| 55 | enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) | 49 | enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) |
| @@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) | |||
| 57 | struct trace_seq *s = &iter->seq; | 51 | struct trace_seq *s = &iter->seq; |
| 58 | struct trace_entry *entry = iter->ent; | 52 | struct trace_entry *entry = iter->ent; |
| 59 | struct print_entry *field; | 53 | struct print_entry *field; |
| 60 | int ret; | ||
| 61 | 54 | ||
| 62 | trace_assign_type(field, entry); | 55 | trace_assign_type(field, entry); |
| 63 | 56 | ||
| 64 | ret = trace_seq_puts(s, field->buf); | 57 | trace_seq_puts(s, field->buf); |
| 65 | if (!ret) | ||
| 66 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 67 | 58 | ||
| 68 | return TRACE_TYPE_HANDLED; | 59 | return trace_handle_return(s); |
| 69 | } | 60 | } |
| 70 | 61 | ||
| 71 | const char * | 62 | const char * |
| @@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, | |||
| 124 | 115 | ||
| 125 | if (ret == (const char *)(trace_seq_buffer_ptr(p))) | 116 | if (ret == (const char *)(trace_seq_buffer_ptr(p))) |
| 126 | trace_seq_printf(p, "0x%lx", val); | 117 | trace_seq_printf(p, "0x%lx", val); |
| 127 | 118 | ||
| 128 | trace_seq_putc(p, 0); | 119 | trace_seq_putc(p, 0); |
| 129 | 120 | ||
| 130 | return ret; | 121 | return ret; |
| @@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, | |||
| 193 | struct trace_seq *s = &iter->seq; | 184 | struct trace_seq *s = &iter->seq; |
| 194 | struct trace_seq *p = &iter->tmp_seq; | 185 | struct trace_seq *p = &iter->tmp_seq; |
| 195 | struct trace_entry *entry; | 186 | struct trace_entry *entry; |
| 196 | int ret; | ||
| 197 | 187 | ||
| 198 | event = container_of(trace_event, struct ftrace_event_call, event); | 188 | event = container_of(trace_event, struct ftrace_event_call, event); |
| 199 | entry = iter->ent; | 189 | entry = iter->ent; |
| @@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, | |||
| 204 | } | 194 | } |
| 205 | 195 | ||
| 206 | trace_seq_init(p); | 196 | trace_seq_init(p); |
| 207 | ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); | 197 | trace_seq_printf(s, "%s: ", ftrace_event_name(event)); |
| 208 | if (!ret) | ||
| 209 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 210 | 198 | ||
| 211 | return 0; | 199 | return trace_handle_return(s); |
| 212 | } | 200 | } |
| 213 | EXPORT_SYMBOL(ftrace_raw_output_prep); | 201 | EXPORT_SYMBOL(ftrace_raw_output_prep); |
| 214 | 202 | ||
| @@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name, | |||
| 216 | char *fmt, va_list ap) | 204 | char *fmt, va_list ap) |
| 217 | { | 205 | { |
| 218 | struct trace_seq *s = &iter->seq; | 206 | struct trace_seq *s = &iter->seq; |
| 219 | int ret; | ||
| 220 | |||
| 221 | ret = trace_seq_printf(s, "%s: ", name); | ||
| 222 | if (!ret) | ||
| 223 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 224 | |||
| 225 | ret = trace_seq_vprintf(s, fmt, ap); | ||
| 226 | 207 | ||
| 227 | if (!ret) | 208 | trace_seq_printf(s, "%s: ", name); |
| 228 | return TRACE_TYPE_PARTIAL_LINE; | 209 | trace_seq_vprintf(s, fmt, ap); |
| 229 | 210 | ||
| 230 | return TRACE_TYPE_HANDLED; | 211 | return trace_handle_return(s); |
| 231 | } | 212 | } |
| 232 | 213 | ||
| 233 | int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) | 214 | int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) |
| @@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name) | |||
| 260 | } | 241 | } |
| 261 | #endif /* CONFIG_KRETPROBES */ | 242 | #endif /* CONFIG_KRETPROBES */ |
| 262 | 243 | ||
| 263 | static int | 244 | static void |
| 264 | seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) | 245 | seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) |
| 265 | { | 246 | { |
| 266 | #ifdef CONFIG_KALLSYMS | 247 | #ifdef CONFIG_KALLSYMS |
| @@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) | |||
| 271 | 252 | ||
| 272 | name = kretprobed(str); | 253 | name = kretprobed(str); |
| 273 | 254 | ||
| 274 | return trace_seq_printf(s, fmt, name); | 255 | trace_seq_printf(s, fmt, name); |
| 275 | #endif | 256 | #endif |
| 276 | return 1; | ||
| 277 | } | 257 | } |
| 278 | 258 | ||
| 279 | static int | 259 | static void |
| 280 | seq_print_sym_offset(struct trace_seq *s, const char *fmt, | 260 | seq_print_sym_offset(struct trace_seq *s, const char *fmt, |
| 281 | unsigned long address) | 261 | unsigned long address) |
| 282 | { | 262 | { |
| @@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, | |||
| 287 | sprint_symbol(str, address); | 267 | sprint_symbol(str, address); |
| 288 | name = kretprobed(str); | 268 | name = kretprobed(str); |
| 289 | 269 | ||
| 290 | return trace_seq_printf(s, fmt, name); | 270 | trace_seq_printf(s, fmt, name); |
| 291 | #endif | 271 | #endif |
| 292 | return 1; | ||
| 293 | } | 272 | } |
| 294 | 273 | ||
| 295 | #ifndef CONFIG_64BIT | 274 | #ifndef CONFIG_64BIT |
| @@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, | |||
| 320 | if (file) { | 299 | if (file) { |
| 321 | ret = trace_seq_path(s, &file->f_path); | 300 | ret = trace_seq_path(s, &file->f_path); |
| 322 | if (ret) | 301 | if (ret) |
| 323 | ret = trace_seq_printf(s, "[+0x%lx]", | 302 | trace_seq_printf(s, "[+0x%lx]", |
| 324 | ip - vmstart); | 303 | ip - vmstart); |
| 325 | } | 304 | } |
| 326 | up_read(&mm->mmap_sem); | 305 | up_read(&mm->mmap_sem); |
| 327 | } | 306 | } |
| 328 | if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) | 307 | if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) |
| 329 | ret = trace_seq_printf(s, " <" IP_FMT ">", ip); | 308 | trace_seq_printf(s, " <" IP_FMT ">", ip); |
| 330 | return ret; | 309 | return !trace_seq_has_overflowed(s); |
| 331 | } | 310 | } |
| 332 | 311 | ||
| 333 | int | 312 | int |
| @@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, | |||
| 335 | unsigned long sym_flags) | 314 | unsigned long sym_flags) |
| 336 | { | 315 | { |
| 337 | struct mm_struct *mm = NULL; | 316 | struct mm_struct *mm = NULL; |
| 338 | int ret = 1; | ||
| 339 | unsigned int i; | 317 | unsigned int i; |
| 340 | 318 | ||
| 341 | if (trace_flags & TRACE_ITER_SYM_USEROBJ) { | 319 | if (trace_flags & TRACE_ITER_SYM_USEROBJ) { |
| @@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, | |||
| 354 | for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { | 332 | for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { |
| 355 | unsigned long ip = entry->caller[i]; | 333 | unsigned long ip = entry->caller[i]; |
| 356 | 334 | ||
| 357 | if (ip == ULONG_MAX || !ret) | 335 | if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) |
| 358 | break; | 336 | break; |
| 359 | if (ret) | 337 | |
| 360 | ret = trace_seq_puts(s, " => "); | 338 | trace_seq_puts(s, " => "); |
| 339 | |||
| 361 | if (!ip) { | 340 | if (!ip) { |
| 362 | if (ret) | 341 | trace_seq_puts(s, "??"); |
| 363 | ret = trace_seq_puts(s, "??"); | 342 | trace_seq_putc(s, '\n'); |
| 364 | if (ret) | ||
| 365 | ret = trace_seq_putc(s, '\n'); | ||
| 366 | continue; | 343 | continue; |
| 367 | } | 344 | } |
| 368 | if (!ret) | 345 | |
| 369 | break; | 346 | seq_print_user_ip(s, mm, ip, sym_flags); |
| 370 | if (ret) | 347 | trace_seq_putc(s, '\n'); |
| 371 | ret = seq_print_user_ip(s, mm, ip, sym_flags); | ||
| 372 | ret = trace_seq_putc(s, '\n'); | ||
| 373 | } | 348 | } |
| 374 | 349 | ||
| 375 | if (mm) | 350 | if (mm) |
| 376 | mmput(mm); | 351 | mmput(mm); |
| 377 | return ret; | 352 | |
| 353 | return !trace_seq_has_overflowed(s); | ||
| 378 | } | 354 | } |
| 379 | 355 | ||
| 380 | int | 356 | int |
| 381 | seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) | 357 | seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) |
| 382 | { | 358 | { |
| 383 | int ret; | 359 | if (!ip) { |
| 384 | 360 | trace_seq_putc(s, '0'); | |
| 385 | if (!ip) | 361 | goto out; |
| 386 | return trace_seq_putc(s, '0'); | 362 | } |
| 387 | 363 | ||
| 388 | if (sym_flags & TRACE_ITER_SYM_OFFSET) | 364 | if (sym_flags & TRACE_ITER_SYM_OFFSET) |
| 389 | ret = seq_print_sym_offset(s, "%s", ip); | 365 | seq_print_sym_offset(s, "%s", ip); |
| 390 | else | 366 | else |
| 391 | ret = seq_print_sym_short(s, "%s", ip); | 367 | seq_print_sym_short(s, "%s", ip); |
| 392 | |||
| 393 | if (!ret) | ||
| 394 | return 0; | ||
| 395 | 368 | ||
| 396 | if (sym_flags & TRACE_ITER_SYM_ADDR) | 369 | if (sym_flags & TRACE_ITER_SYM_ADDR) |
| 397 | ret = trace_seq_printf(s, " <" IP_FMT ">", ip); | 370 | trace_seq_printf(s, " <" IP_FMT ">", ip); |
| 398 | return ret; | 371 | |
| 372 | out: | ||
| 373 | return !trace_seq_has_overflowed(s); | ||
| 399 | } | 374 | } |
| 400 | 375 | ||
| 401 | /** | 376 | /** |
| @@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |||
| 413 | char irqs_off; | 388 | char irqs_off; |
| 414 | int hardirq; | 389 | int hardirq; |
| 415 | int softirq; | 390 | int softirq; |
| 416 | int ret; | ||
| 417 | 391 | ||
| 418 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; | 392 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; |
| 419 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; | 393 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; |
| @@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |||
| 445 | softirq ? 's' : | 419 | softirq ? 's' : |
| 446 | '.'; | 420 | '.'; |
| 447 | 421 | ||
| 448 | if (!trace_seq_printf(s, "%c%c%c", | 422 | trace_seq_printf(s, "%c%c%c", |
| 449 | irqs_off, need_resched, hardsoft_irq)) | 423 | irqs_off, need_resched, hardsoft_irq); |
| 450 | return 0; | ||
| 451 | 424 | ||
| 452 | if (entry->preempt_count) | 425 | if (entry->preempt_count) |
| 453 | ret = trace_seq_printf(s, "%x", entry->preempt_count); | 426 | trace_seq_printf(s, "%x", entry->preempt_count); |
| 454 | else | 427 | else |
| 455 | ret = trace_seq_putc(s, '.'); | 428 | trace_seq_putc(s, '.'); |
| 456 | 429 | ||
| 457 | return ret; | 430 | return !trace_seq_has_overflowed(s); |
| 458 | } | 431 | } |
| 459 | 432 | ||
| 460 | static int | 433 | static int |
| @@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) | |||
| 464 | 437 | ||
| 465 | trace_find_cmdline(entry->pid, comm); | 438 | trace_find_cmdline(entry->pid, comm); |
| 466 | 439 | ||
| 467 | if (!trace_seq_printf(s, "%8.8s-%-5d %3d", | 440 | trace_seq_printf(s, "%8.8s-%-5d %3d", |
| 468 | comm, entry->pid, cpu)) | 441 | comm, entry->pid, cpu); |
| 469 | return 0; | ||
| 470 | 442 | ||
| 471 | return trace_print_lat_fmt(s, entry); | 443 | return trace_print_lat_fmt(s, entry); |
| 472 | } | 444 | } |
| 473 | 445 | ||
| 474 | static unsigned long preempt_mark_thresh_us = 100; | 446 | #undef MARK |
| 447 | #define MARK(v, s) {.val = v, .sym = s} | ||
| 448 | /* trace overhead mark */ | ||
| 449 | static const struct trace_mark { | ||
| 450 | unsigned long long val; /* unit: nsec */ | ||
| 451 | char sym; | ||
| 452 | } mark[] = { | ||
| 453 | MARK(1000000000ULL , '$'), /* 1 sec */ | ||
| 454 | MARK(1000000ULL , '#'), /* 1000 usecs */ | ||
| 455 | MARK(100000ULL , '!'), /* 100 usecs */ | ||
| 456 | MARK(10000ULL , '+'), /* 10 usecs */ | ||
| 457 | }; | ||
| 458 | #undef MARK | ||
| 459 | |||
| 460 | char trace_find_mark(unsigned long long d) | ||
| 461 | { | ||
| 462 | int i; | ||
| 463 | int size = ARRAY_SIZE(mark); | ||
| 464 | |||
| 465 | for (i = 0; i < size; i++) { | ||
| 466 | if (d >= mark[i].val) | ||
| 467 | break; | ||
| 468 | } | ||
| 469 | |||
| 470 | return (i == size) ? ' ' : mark[i].sym; | ||
| 471 | } | ||
| 475 | 472 | ||
| 476 | static int | 473 | static int |
| 477 | lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) | 474 | lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) |
| @@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) | |||
| 493 | unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); | 490 | unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); |
| 494 | unsigned long rel_msec = (unsigned long)rel_ts; | 491 | unsigned long rel_msec = (unsigned long)rel_ts; |
| 495 | 492 | ||
| 496 | return trace_seq_printf( | 493 | trace_seq_printf( |
| 497 | s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", | 494 | s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", |
| 498 | ns2usecs(iter->ts), | 495 | ns2usecs(iter->ts), |
| 499 | abs_msec, abs_usec, | 496 | abs_msec, abs_usec, |
| 500 | rel_msec, rel_usec); | 497 | rel_msec, rel_usec); |
| 498 | |||
| 501 | } else if (verbose && !in_ns) { | 499 | } else if (verbose && !in_ns) { |
| 502 | return trace_seq_printf( | 500 | trace_seq_printf( |
| 503 | s, "[%016llx] %lld (+%lld): ", | 501 | s, "[%016llx] %lld (+%lld): ", |
| 504 | iter->ts, abs_ts, rel_ts); | 502 | iter->ts, abs_ts, rel_ts); |
| 503 | |||
| 505 | } else if (!verbose && in_ns) { | 504 | } else if (!verbose && in_ns) { |
| 506 | return trace_seq_printf( | 505 | trace_seq_printf( |
| 507 | s, " %4lldus%c: ", | 506 | s, " %4lldus%c: ", |
| 508 | abs_ts, | 507 | abs_ts, |
| 509 | rel_ts > preempt_mark_thresh_us ? '!' : | 508 | trace_find_mark(rel_ts * NSEC_PER_USEC)); |
| 510 | rel_ts > 1 ? '+' : ' '); | 509 | |
| 511 | } else { /* !verbose && !in_ns */ | 510 | } else { /* !verbose && !in_ns */ |
| 512 | return trace_seq_printf(s, " %4lld: ", abs_ts); | 511 | trace_seq_printf(s, " %4lld: ", abs_ts); |
| 513 | } | 512 | } |
| 513 | |||
| 514 | return !trace_seq_has_overflowed(s); | ||
| 514 | } | 515 | } |
| 515 | 516 | ||
| 516 | int trace_print_context(struct trace_iterator *iter) | 517 | int trace_print_context(struct trace_iterator *iter) |
| @@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter) | |||
| 520 | unsigned long long t; | 521 | unsigned long long t; |
| 521 | unsigned long secs, usec_rem; | 522 | unsigned long secs, usec_rem; |
| 522 | char comm[TASK_COMM_LEN]; | 523 | char comm[TASK_COMM_LEN]; |
| 523 | int ret; | ||
| 524 | 524 | ||
| 525 | trace_find_cmdline(entry->pid, comm); | 525 | trace_find_cmdline(entry->pid, comm); |
| 526 | 526 | ||
| 527 | ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", | 527 | trace_seq_printf(s, "%16s-%-5d [%03d] ", |
| 528 | comm, entry->pid, iter->cpu); | 528 | comm, entry->pid, iter->cpu); |
| 529 | if (!ret) | ||
| 530 | return 0; | ||
| 531 | 529 | ||
| 532 | if (trace_flags & TRACE_ITER_IRQ_INFO) { | 530 | if (trace_flags & TRACE_ITER_IRQ_INFO) |
| 533 | ret = trace_print_lat_fmt(s, entry); | 531 | trace_print_lat_fmt(s, entry); |
| 534 | if (!ret) | ||
| 535 | return 0; | ||
| 536 | } | ||
| 537 | 532 | ||
| 538 | if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { | 533 | if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { |
| 539 | t = ns2usecs(iter->ts); | 534 | t = ns2usecs(iter->ts); |
| 540 | usec_rem = do_div(t, USEC_PER_SEC); | 535 | usec_rem = do_div(t, USEC_PER_SEC); |
| 541 | secs = (unsigned long)t; | 536 | secs = (unsigned long)t; |
| 542 | return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); | 537 | trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); |
| 543 | } else | 538 | } else |
| 544 | return trace_seq_printf(s, " %12llu: ", iter->ts); | 539 | trace_seq_printf(s, " %12llu: ", iter->ts); |
| 540 | |||
| 541 | return !trace_seq_has_overflowed(s); | ||
| 545 | } | 542 | } |
| 546 | 543 | ||
| 547 | int trace_print_lat_context(struct trace_iterator *iter) | 544 | int trace_print_lat_context(struct trace_iterator *iter) |
| 548 | { | 545 | { |
| 549 | u64 next_ts; | 546 | u64 next_ts; |
| 550 | int ret; | ||
| 551 | /* trace_find_next_entry will reset ent_size */ | 547 | /* trace_find_next_entry will reset ent_size */ |
| 552 | int ent_size = iter->ent_size; | 548 | int ent_size = iter->ent_size; |
| 553 | struct trace_seq *s = &iter->seq; | 549 | struct trace_seq *s = &iter->seq; |
| @@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter) | |||
| 567 | 563 | ||
| 568 | trace_find_cmdline(entry->pid, comm); | 564 | trace_find_cmdline(entry->pid, comm); |
| 569 | 565 | ||
| 570 | ret = trace_seq_printf( | 566 | trace_seq_printf( |
| 571 | s, "%16s %5d %3d %d %08x %08lx ", | 567 | s, "%16s %5d %3d %d %08x %08lx ", |
| 572 | comm, entry->pid, iter->cpu, entry->flags, | 568 | comm, entry->pid, iter->cpu, entry->flags, |
| 573 | entry->preempt_count, iter->idx); | 569 | entry->preempt_count, iter->idx); |
| 574 | } else { | 570 | } else { |
| 575 | ret = lat_print_generic(s, entry, iter->cpu); | 571 | lat_print_generic(s, entry, iter->cpu); |
| 576 | } | 572 | } |
| 577 | 573 | ||
| 578 | if (ret) | 574 | lat_print_timestamp(iter, next_ts); |
| 579 | ret = lat_print_timestamp(iter, next_ts); | ||
| 580 | 575 | ||
| 581 | return ret; | 576 | return !trace_seq_has_overflowed(s); |
| 582 | } | 577 | } |
| 583 | 578 | ||
| 584 | static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; | 579 | static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; |
| @@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event) | |||
| 692 | goto out; | 687 | goto out; |
| 693 | 688 | ||
| 694 | } else { | 689 | } else { |
| 695 | 690 | ||
| 696 | event->type = next_event_type++; | 691 | event->type = next_event_type++; |
| 697 | list = &ftrace_event_list; | 692 | list = &ftrace_event_list; |
| 698 | } | 693 | } |
| @@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); | |||
| 764 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, | 759 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, |
| 765 | struct trace_event *event) | 760 | struct trace_event *event) |
| 766 | { | 761 | { |
| 767 | if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) | 762 | trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type); |
| 768 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 769 | 763 | ||
| 770 | return TRACE_TYPE_HANDLED; | 764 | return trace_handle_return(&iter->seq); |
| 771 | } | 765 | } |
| 772 | 766 | ||
| 773 | /* TRACE_FN */ | 767 | /* TRACE_FN */ |
| @@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, | |||
| 779 | 773 | ||
| 780 | trace_assign_type(field, iter->ent); | 774 | trace_assign_type(field, iter->ent); |
| 781 | 775 | ||
| 782 | if (!seq_print_ip_sym(s, field->ip, flags)) | 776 | seq_print_ip_sym(s, field->ip, flags); |
| 783 | goto partial; | ||
| 784 | 777 | ||
| 785 | if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { | 778 | if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { |
| 786 | if (!trace_seq_puts(s, " <-")) | 779 | trace_seq_puts(s, " <-"); |
| 787 | goto partial; | 780 | seq_print_ip_sym(s, field->parent_ip, flags); |
| 788 | if (!seq_print_ip_sym(s, | ||
| 789 | field->parent_ip, | ||
| 790 | flags)) | ||
| 791 | goto partial; | ||
| 792 | } | 781 | } |
| 793 | if (!trace_seq_putc(s, '\n')) | ||
| 794 | goto partial; | ||
| 795 | 782 | ||
| 796 | return TRACE_TYPE_HANDLED; | 783 | trace_seq_putc(s, '\n'); |
| 797 | 784 | ||
| 798 | partial: | 785 | return trace_handle_return(s); |
| 799 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 800 | } | 786 | } |
| 801 | 787 | ||
| 802 | static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, | 788 | static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, |
| @@ -806,12 +792,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, | |||
| 806 | 792 | ||
| 807 | trace_assign_type(field, iter->ent); | 793 | trace_assign_type(field, iter->ent); |
| 808 | 794 | ||
| 809 | if (!trace_seq_printf(&iter->seq, "%lx %lx\n", | 795 | trace_seq_printf(&iter->seq, "%lx %lx\n", |
| 810 | field->ip, | 796 | field->ip, |
| 811 | field->parent_ip)) | 797 | field->parent_ip); |
| 812 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 813 | 798 | ||
| 814 | return TRACE_TYPE_HANDLED; | 799 | return trace_handle_return(&iter->seq); |
| 815 | } | 800 | } |
| 816 | 801 | ||
| 817 | static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, | 802 | static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, |
| @@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, | |||
| 822 | 807 | ||
| 823 | trace_assign_type(field, iter->ent); | 808 | trace_assign_type(field, iter->ent); |
| 824 | 809 | ||
| 825 | SEQ_PUT_HEX_FIELD_RET(s, field->ip); | 810 | SEQ_PUT_HEX_FIELD(s, field->ip); |
| 826 | SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); | 811 | SEQ_PUT_HEX_FIELD(s, field->parent_ip); |
| 827 | 812 | ||
| 828 | return TRACE_TYPE_HANDLED; | 813 | return trace_handle_return(s); |
| 829 | } | 814 | } |
| 830 | 815 | ||
| 831 | static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, | 816 | static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, |
| @@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, | |||
| 836 | 821 | ||
| 837 | trace_assign_type(field, iter->ent); | 822 | trace_assign_type(field, iter->ent); |
| 838 | 823 | ||
| 839 | SEQ_PUT_FIELD_RET(s, field->ip); | 824 | SEQ_PUT_FIELD(s, field->ip); |
| 840 | SEQ_PUT_FIELD_RET(s, field->parent_ip); | 825 | SEQ_PUT_FIELD(s, field->parent_ip); |
| 841 | 826 | ||
| 842 | return TRACE_TYPE_HANDLED; | 827 | return trace_handle_return(s); |
| 843 | } | 828 | } |
| 844 | 829 | ||
| 845 | static struct trace_event_functions trace_fn_funcs = { | 830 | static struct trace_event_functions trace_fn_funcs = { |
| @@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, | |||
| 868 | T = task_state_char(field->next_state); | 853 | T = task_state_char(field->next_state); |
| 869 | S = task_state_char(field->prev_state); | 854 | S = task_state_char(field->prev_state); |
| 870 | trace_find_cmdline(field->next_pid, comm); | 855 | trace_find_cmdline(field->next_pid, comm); |
| 871 | if (!trace_seq_printf(&iter->seq, | 856 | trace_seq_printf(&iter->seq, |
| 872 | " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", | 857 | " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", |
| 873 | field->prev_pid, | 858 | field->prev_pid, |
| 874 | field->prev_prio, | 859 | field->prev_prio, |
| 875 | S, delim, | 860 | S, delim, |
| 876 | field->next_cpu, | 861 | field->next_cpu, |
| 877 | field->next_pid, | 862 | field->next_pid, |
| 878 | field->next_prio, | 863 | field->next_prio, |
| 879 | T, comm)) | 864 | T, comm); |
| 880 | return TRACE_TYPE_PARTIAL_LINE; | 865 | |
| 881 | 866 | return trace_handle_return(&iter->seq); | |
| 882 | return TRACE_TYPE_HANDLED; | ||
| 883 | } | 867 | } |
| 884 | 868 | ||
| 885 | static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, | 869 | static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, |
| @@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) | |||
| 904 | if (!S) | 888 | if (!S) |
| 905 | S = task_state_char(field->prev_state); | 889 | S = task_state_char(field->prev_state); |
| 906 | T = task_state_char(field->next_state); | 890 | T = task_state_char(field->next_state); |
| 907 | if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", | 891 | trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", |
| 908 | field->prev_pid, | 892 | field->prev_pid, |
| 909 | field->prev_prio, | 893 | field->prev_prio, |
| 910 | S, | 894 | S, |
| 911 | field->next_cpu, | 895 | field->next_cpu, |
| 912 | field->next_pid, | 896 | field->next_pid, |
| 913 | field->next_prio, | 897 | field->next_prio, |
| 914 | T)) | 898 | T); |
| 915 | return TRACE_TYPE_PARTIAL_LINE; | 899 | |
| 916 | 900 | return trace_handle_return(&iter->seq); | |
| 917 | return TRACE_TYPE_HANDLED; | ||
| 918 | } | 901 | } |
| 919 | 902 | ||
| 920 | static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, | 903 | static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, |
| @@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) | |||
| 942 | S = task_state_char(field->prev_state); | 925 | S = task_state_char(field->prev_state); |
| 943 | T = task_state_char(field->next_state); | 926 | T = task_state_char(field->next_state); |
| 944 | 927 | ||
| 945 | SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); | 928 | SEQ_PUT_HEX_FIELD(s, field->prev_pid); |
| 946 | SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); | 929 | SEQ_PUT_HEX_FIELD(s, field->prev_prio); |
| 947 | SEQ_PUT_HEX_FIELD_RET(s, S); | 930 | SEQ_PUT_HEX_FIELD(s, S); |
| 948 | SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); | 931 | SEQ_PUT_HEX_FIELD(s, field->next_cpu); |
| 949 | SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); | 932 | SEQ_PUT_HEX_FIELD(s, field->next_pid); |
| 950 | SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); | 933 | SEQ_PUT_HEX_FIELD(s, field->next_prio); |
| 951 | SEQ_PUT_HEX_FIELD_RET(s, T); | 934 | SEQ_PUT_HEX_FIELD(s, T); |
| 952 | 935 | ||
| 953 | return TRACE_TYPE_HANDLED; | 936 | return trace_handle_return(s); |
| 954 | } | 937 | } |
| 955 | 938 | ||
| 956 | static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, | 939 | static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, |
| @@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, | |||
| 973 | 956 | ||
| 974 | trace_assign_type(field, iter->ent); | 957 | trace_assign_type(field, iter->ent); |
| 975 | 958 | ||
| 976 | SEQ_PUT_FIELD_RET(s, field->prev_pid); | 959 | SEQ_PUT_FIELD(s, field->prev_pid); |
| 977 | SEQ_PUT_FIELD_RET(s, field->prev_prio); | 960 | SEQ_PUT_FIELD(s, field->prev_prio); |
| 978 | SEQ_PUT_FIELD_RET(s, field->prev_state); | 961 | SEQ_PUT_FIELD(s, field->prev_state); |
| 979 | SEQ_PUT_FIELD_RET(s, field->next_pid); | 962 | SEQ_PUT_FIELD(s, field->next_cpu); |
| 980 | SEQ_PUT_FIELD_RET(s, field->next_prio); | 963 | SEQ_PUT_FIELD(s, field->next_pid); |
| 981 | SEQ_PUT_FIELD_RET(s, field->next_state); | 964 | SEQ_PUT_FIELD(s, field->next_prio); |
| 965 | SEQ_PUT_FIELD(s, field->next_state); | ||
| 982 | 966 | ||
| 983 | return TRACE_TYPE_HANDLED; | 967 | return trace_handle_return(s); |
| 984 | } | 968 | } |
| 985 | 969 | ||
| 986 | static struct trace_event_functions trace_ctx_funcs = { | 970 | static struct trace_event_functions trace_ctx_funcs = { |
| @@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, | |||
| 1020 | trace_assign_type(field, iter->ent); | 1004 | trace_assign_type(field, iter->ent); |
| 1021 | end = (unsigned long *)((long)iter->ent + iter->ent_size); | 1005 | end = (unsigned long *)((long)iter->ent + iter->ent_size); |
| 1022 | 1006 | ||
| 1023 | if (!trace_seq_puts(s, "<stack trace>\n")) | 1007 | trace_seq_puts(s, "<stack trace>\n"); |
| 1024 | goto partial; | ||
| 1025 | 1008 | ||
| 1026 | for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { | 1009 | for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { |
| 1027 | if (!trace_seq_puts(s, " => ")) | ||
| 1028 | goto partial; | ||
| 1029 | 1010 | ||
| 1030 | if (!seq_print_ip_sym(s, *p, flags)) | 1011 | if (trace_seq_has_overflowed(s)) |
| 1031 | goto partial; | 1012 | break; |
| 1032 | if (!trace_seq_putc(s, '\n')) | ||
| 1033 | goto partial; | ||
| 1034 | } | ||
| 1035 | 1013 | ||
| 1036 | return TRACE_TYPE_HANDLED; | 1014 | trace_seq_puts(s, " => "); |
| 1015 | seq_print_ip_sym(s, *p, flags); | ||
| 1016 | trace_seq_putc(s, '\n'); | ||
| 1017 | } | ||
| 1037 | 1018 | ||
| 1038 | partial: | 1019 | return trace_handle_return(s); |
| 1039 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1040 | } | 1020 | } |
| 1041 | 1021 | ||
| 1042 | static struct trace_event_functions trace_stack_funcs = { | 1022 | static struct trace_event_functions trace_stack_funcs = { |
| @@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, | |||
| 1057 | 1037 | ||
| 1058 | trace_assign_type(field, iter->ent); | 1038 | trace_assign_type(field, iter->ent); |
| 1059 | 1039 | ||
| 1060 | if (!trace_seq_puts(s, "<user stack trace>\n")) | 1040 | trace_seq_puts(s, "<user stack trace>\n"); |
| 1061 | goto partial; | 1041 | seq_print_userip_objs(field, s, flags); |
| 1062 | |||
| 1063 | if (!seq_print_userip_objs(field, s, flags)) | ||
| 1064 | goto partial; | ||
| 1065 | |||
| 1066 | return TRACE_TYPE_HANDLED; | ||
| 1067 | 1042 | ||
| 1068 | partial: | 1043 | return trace_handle_return(s); |
| 1069 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1070 | } | 1044 | } |
| 1071 | 1045 | ||
| 1072 | static struct trace_event_functions trace_user_stack_funcs = { | 1046 | static struct trace_event_functions trace_user_stack_funcs = { |
| @@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags, | |||
| 1089 | 1063 | ||
| 1090 | trace_assign_type(field, entry); | 1064 | trace_assign_type(field, entry); |
| 1091 | 1065 | ||
| 1092 | if (!seq_print_ip_sym(s, field->ip, flags)) | 1066 | seq_print_ip_sym(s, field->ip, flags); |
| 1093 | goto partial; | 1067 | trace_seq_puts(s, ": "); |
| 1068 | trace_seq_puts(s, field->str); | ||
| 1094 | 1069 | ||
| 1095 | if (!trace_seq_puts(s, ": ")) | 1070 | return trace_handle_return(s); |
| 1096 | goto partial; | ||
| 1097 | |||
| 1098 | if (!trace_seq_puts(s, field->str)) | ||
| 1099 | goto partial; | ||
| 1100 | |||
| 1101 | return TRACE_TYPE_HANDLED; | ||
| 1102 | |||
| 1103 | partial: | ||
| 1104 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1105 | } | 1071 | } |
| 1106 | 1072 | ||
| 1107 | 1073 | ||
| @@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags, | |||
| 1114 | 1080 | ||
| 1115 | trace_assign_type(field, iter->ent); | 1081 | trace_assign_type(field, iter->ent); |
| 1116 | 1082 | ||
| 1117 | if (!trace_seq_printf(s, ": %lx : ", field->ip)) | 1083 | trace_seq_printf(s, ": %lx : ", field->ip); |
| 1118 | goto partial; | 1084 | trace_seq_puts(s, field->str); |
| 1119 | |||
| 1120 | if (!trace_seq_puts(s, field->str)) | ||
| 1121 | goto partial; | ||
| 1122 | 1085 | ||
| 1123 | return TRACE_TYPE_HANDLED; | 1086 | return trace_handle_return(s); |
| 1124 | |||
| 1125 | partial: | ||
| 1126 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1127 | } | 1087 | } |
| 1128 | 1088 | ||
| 1129 | static struct trace_event_functions trace_bputs_funcs = { | 1089 | static struct trace_event_functions trace_bputs_funcs = { |
| @@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags, | |||
| 1147 | 1107 | ||
| 1148 | trace_assign_type(field, entry); | 1108 | trace_assign_type(field, entry); |
| 1149 | 1109 | ||
| 1150 | if (!seq_print_ip_sym(s, field->ip, flags)) | 1110 | seq_print_ip_sym(s, field->ip, flags); |
| 1151 | goto partial; | 1111 | trace_seq_puts(s, ": "); |
| 1152 | 1112 | trace_seq_bprintf(s, field->fmt, field->buf); | |
| 1153 | if (!trace_seq_puts(s, ": ")) | ||
| 1154 | goto partial; | ||
| 1155 | |||
| 1156 | if (!trace_seq_bprintf(s, field->fmt, field->buf)) | ||
| 1157 | goto partial; | ||
| 1158 | 1113 | ||
| 1159 | return TRACE_TYPE_HANDLED; | 1114 | return trace_handle_return(s); |
| 1160 | |||
| 1161 | partial: | ||
| 1162 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1163 | } | 1115 | } |
| 1164 | 1116 | ||
| 1165 | 1117 | ||
| @@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags, | |||
| 1172 | 1124 | ||
| 1173 | trace_assign_type(field, iter->ent); | 1125 | trace_assign_type(field, iter->ent); |
| 1174 | 1126 | ||
| 1175 | if (!trace_seq_printf(s, ": %lx : ", field->ip)) | 1127 | trace_seq_printf(s, ": %lx : ", field->ip); |
| 1176 | goto partial; | 1128 | trace_seq_bprintf(s, field->fmt, field->buf); |
| 1177 | |||
| 1178 | if (!trace_seq_bprintf(s, field->fmt, field->buf)) | ||
| 1179 | goto partial; | ||
| 1180 | 1129 | ||
| 1181 | return TRACE_TYPE_HANDLED; | 1130 | return trace_handle_return(s); |
| 1182 | |||
| 1183 | partial: | ||
| 1184 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1185 | } | 1131 | } |
| 1186 | 1132 | ||
| 1187 | static struct trace_event_functions trace_bprint_funcs = { | 1133 | static struct trace_event_functions trace_bprint_funcs = { |
| @@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, | |||
| 1203 | 1149 | ||
| 1204 | trace_assign_type(field, iter->ent); | 1150 | trace_assign_type(field, iter->ent); |
| 1205 | 1151 | ||
| 1206 | if (!seq_print_ip_sym(s, field->ip, flags)) | 1152 | seq_print_ip_sym(s, field->ip, flags); |
| 1207 | goto partial; | 1153 | trace_seq_printf(s, ": %s", field->buf); |
| 1208 | |||
| 1209 | if (!trace_seq_printf(s, ": %s", field->buf)) | ||
| 1210 | goto partial; | ||
| 1211 | 1154 | ||
| 1212 | return TRACE_TYPE_HANDLED; | 1155 | return trace_handle_return(s); |
| 1213 | |||
| 1214 | partial: | ||
| 1215 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1216 | } | 1156 | } |
| 1217 | 1157 | ||
| 1218 | static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, | 1158 | static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, |
| @@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, | |||
| 1222 | 1162 | ||
| 1223 | trace_assign_type(field, iter->ent); | 1163 | trace_assign_type(field, iter->ent); |
| 1224 | 1164 | ||
| 1225 | if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) | 1165 | trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf); |
| 1226 | goto partial; | ||
| 1227 | |||
| 1228 | return TRACE_TYPE_HANDLED; | ||
| 1229 | 1166 | ||
| 1230 | partial: | 1167 | return trace_handle_return(&iter->seq); |
| 1231 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1232 | } | 1168 | } |
| 1233 | 1169 | ||
| 1234 | static struct trace_event_functions trace_print_funcs = { | 1170 | static struct trace_event_functions trace_print_funcs = { |
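trace_find_mark(), added above, replaces the old single preempt_mark_thresh_us comparison with a threshold table, so the latency column can show '+' (>= 10 us), '!' (>= 100 us), '#' (>= 1 ms) or '$' (>= 1 s); note that lat_print_timestamp() feeds it rel_ts * NSEC_PER_USEC, i.e. the lookup works in nanoseconds. A standalone restatement of the lookup with the same thresholds and a few example inputs (userspace sketch, not the kernel file):

#include <stdio.h>

/* Same thresholds as the mark[] table added above; input is in nsec. */
static char find_mark(unsigned long long d)
{
	static const struct { unsigned long long val; char sym; } mark[] = {
		{ 1000000000ULL, '$' },	/* 1 sec */
		{ 1000000ULL,    '#' },	/* 1000 usecs */
		{ 100000ULL,     '!' },	/* 100 usecs */
		{ 10000ULL,      '+' },	/* 10 usecs */
	};
	int i;
	int size = sizeof(mark) / sizeof(mark[0]);

	for (i = 0; i < size; i++)
		if (d >= mark[i].val)
			break;

	return (i == size) ? ' ' : mark[i].sym;
}

int main(void)
{
	/* 5 us -> ' ', 150 us -> '!', 2 ms -> '#', 3 s -> '$' */
	printf("[%c][%c][%c][%c]\n", find_mark(5000ULL), find_mark(150000ULL),
	       find_mark(2000000ULL), find_mark(3000000000ULL));
	return 0;
}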
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 80b25b585a70..8ef2c40efb3c 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h | |||
| @@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); | |||
| 35 | extern int __unregister_ftrace_event(struct trace_event *event); | 35 | extern int __unregister_ftrace_event(struct trace_event *event); |
| 36 | extern struct rw_semaphore trace_event_sem; | 36 | extern struct rw_semaphore trace_event_sem; |
| 37 | 37 | ||
| 38 | #define SEQ_PUT_FIELD_RET(s, x) \ | 38 | #define SEQ_PUT_FIELD(s, x) \ |
| 39 | do { \ | 39 | trace_seq_putmem(s, &(x), sizeof(x)) |
| 40 | if (!trace_seq_putmem(s, &(x), sizeof(x))) \ | 40 | |
| 41 | return TRACE_TYPE_PARTIAL_LINE; \ | 41 | #define SEQ_PUT_HEX_FIELD(s, x) \ |
| 42 | } while (0) | 42 | trace_seq_putmem_hex(s, &(x), sizeof(x)) |
| 43 | |||
| 44 | #define SEQ_PUT_HEX_FIELD_RET(s, x) \ | ||
| 45 | do { \ | ||
| 46 | if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ | ||
| 47 | return TRACE_TYPE_PARTIAL_LINE; \ | ||
| 48 | } while (0) | ||
| 49 | 43 | ||
| 50 | #endif | 44 | #endif |
| 51 | 45 | ||
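With the per-call error returns gone, SEQ_PUT_FIELD() and SEQ_PUT_HEX_FIELD() shrink from do { ... return TRACE_TYPE_PARTIAL_LINE; } while (0) statement macros into plain expression macros with no hidden control flow. A small, hedged illustration of the two macro shapes, using an invented put_bytes() writer in place of trace_seq_putmem():

#include <stdio.h>
#include <string.h>

struct sink {
	unsigned char data[64];
	size_t len;
};

static void put_bytes(struct sink *s, const void *p, size_t n)
{
	if (n <= sizeof(s->data) - s->len) {
		memcpy(s->data + s->len, p, n);
		s->len += n;
	}
}

/* Old shape: statement macro that could return from the *caller*. */
#define PUT_FIELD_RET(s, x)					\
	do {							\
		put_bytes(s, &(x), sizeof(x));			\
		if ((s)->len == 0)	/* stand-in failure test */ \
			return -1;				\
	} while (0)

/* New shape: a plain expression, no hidden early return. */
#define PUT_FIELD(s, x)	put_bytes(s, &(x), sizeof(x))

int main(void)
{
	struct sink s = { .len = 0 };
	int ip = 42, parent_ip = 7;

	PUT_FIELD(&s, ip);
	PUT_FIELD(&s, parent_ip);
	printf("%zu bytes emitted\n", s.len);
	return 0;
}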
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2900817ba65c..c4e70b6bd7fa 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
| @@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v) | |||
| 305 | seq_puts(m, "\\t"); | 305 | seq_puts(m, "\\t"); |
| 306 | break; | 306 | break; |
| 307 | case '\\': | 307 | case '\\': |
| 308 | seq_puts(m, "\\"); | 308 | seq_putc(m, '\\'); |
| 309 | break; | 309 | break; |
| 310 | case '"': | 310 | case '"': |
| 311 | seq_puts(m, "\\\""); | 311 | seq_puts(m, "\\\""); |
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d4b9fc22cd27..b983b2fd2ca1 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
| @@ -40,7 +40,8 @@ const char *reserved_field_names[] = { | |||
| 40 | int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ | 40 | int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ |
| 41 | void *data, void *ent) \ | 41 | void *data, void *ent) \ |
| 42 | { \ | 42 | { \ |
| 43 | return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ | 43 | trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ |
| 44 | return !trace_seq_has_overflowed(s); \ | ||
| 44 | } \ | 45 | } \ |
| 45 | const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ | 46 | const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ |
| 46 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); | 47 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); |
| @@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, | |||
| 61 | int len = *(u32 *)data >> 16; | 62 | int len = *(u32 *)data >> 16; |
| 62 | 63 | ||
| 63 | if (!len) | 64 | if (!len) |
| 64 | return trace_seq_printf(s, " %s=(fault)", name); | 65 | trace_seq_printf(s, " %s=(fault)", name); |
| 65 | else | 66 | else |
| 66 | return trace_seq_printf(s, " %s=\"%s\"", name, | 67 | trace_seq_printf(s, " %s=\"%s\"", name, |
| 67 | (const char *)get_loc_data(data, ent)); | 68 | (const char *)get_loc_data(data, ent)); |
| 69 | return !trace_seq_has_overflowed(s); | ||
| 68 | } | 70 | } |
| 69 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); | 71 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); |
| 70 | 72 | ||
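trace_probe.c keeps its macro-stamped per-type printers but changes what they return: each generated function now prints unconditionally and reports success as !trace_seq_has_overflowed(s). A hedged userspace sketch of the same stamping idea; the obuf type and the print_u32/print_s64 names are invented for the example.

#include <inttypes.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

struct obuf {				/* invented stand-in for trace_seq */
	char buf[64];
	size_t len;
	bool overflowed;
};

static void obuf_printf(struct obuf *s, const char *fmt, ...)
{
	va_list ap;
	int n;

	if (s->overflowed)
		return;
	va_start(ap, fmt);
	n = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
	va_end(ap);
	if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len)
		s->overflowed = true;
	else
		s->len += (size_t)n;
}

/* Stamp out one " name=value" printer per basic type, macro-style. */
#define DEFINE_PRINT_FUNC(name, type, fmt)				\
static int print_##name(struct obuf *s, const char *field, void *data)	\
{									\
	obuf_printf(s, " %s=" fmt, field, *(type *)data);		\
	return !s->overflowed;	/* like !trace_seq_has_overflowed() */	\
}

DEFINE_PRINT_FUNC(u32, uint32_t, "%" PRIu32)
DEFINE_PRINT_FUNC(s64, int64_t, "%" PRId64)

int main(void)
{
	struct obuf s = { .len = 0, .overflowed = false };
	uint32_t a = 7;
	int64_t b = -3;

	if (print_u32(&s, "count", &a) && print_s64(&s, "delta", &b))
		printf("%s\n", s.buf);
	return 0;
}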
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3f34dc9b40f3..2e293beb186e 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
| @@ -14,122 +14,26 @@ | |||
| 14 | 14 | ||
| 15 | #include "trace.h" | 15 | #include "trace.h" |
| 16 | 16 | ||
| 17 | static struct trace_array *ctx_trace; | ||
| 18 | static int __read_mostly tracer_enabled; | ||
| 19 | static int sched_ref; | 17 | static int sched_ref; |
| 20 | static DEFINE_MUTEX(sched_register_mutex); | 18 | static DEFINE_MUTEX(sched_register_mutex); |
| 21 | static int sched_stopped; | ||
| 22 | |||
| 23 | |||
| 24 | void | ||
| 25 | tracing_sched_switch_trace(struct trace_array *tr, | ||
| 26 | struct task_struct *prev, | ||
| 27 | struct task_struct *next, | ||
| 28 | unsigned long flags, int pc) | ||
| 29 | { | ||
| 30 | struct ftrace_event_call *call = &event_context_switch; | ||
| 31 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
| 32 | struct ring_buffer_event *event; | ||
| 33 | struct ctx_switch_entry *entry; | ||
| 34 | |||
| 35 | event = trace_buffer_lock_reserve(buffer, TRACE_CTX, | ||
| 36 | sizeof(*entry), flags, pc); | ||
| 37 | if (!event) | ||
| 38 | return; | ||
| 39 | entry = ring_buffer_event_data(event); | ||
| 40 | entry->prev_pid = prev->pid; | ||
| 41 | entry->prev_prio = prev->prio; | ||
| 42 | entry->prev_state = prev->state; | ||
| 43 | entry->next_pid = next->pid; | ||
| 44 | entry->next_prio = next->prio; | ||
| 45 | entry->next_state = next->state; | ||
| 46 | entry->next_cpu = task_cpu(next); | ||
| 47 | |||
| 48 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
| 49 | trace_buffer_unlock_commit(buffer, event, flags, pc); | ||
| 50 | } | ||
| 51 | 19 | ||
| 52 | static void | 20 | static void |
| 53 | probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) | 21 | probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) |
| 54 | { | 22 | { |
| 55 | struct trace_array_cpu *data; | ||
| 56 | unsigned long flags; | ||
| 57 | int cpu; | ||
| 58 | int pc; | ||
| 59 | |||
| 60 | if (unlikely(!sched_ref)) | 23 | if (unlikely(!sched_ref)) |
| 61 | return; | 24 | return; |
| 62 | 25 | ||
| 63 | tracing_record_cmdline(prev); | 26 | tracing_record_cmdline(prev); |
| 64 | tracing_record_cmdline(next); | 27 | tracing_record_cmdline(next); |
| 65 | |||
| 66 | if (!tracer_enabled || sched_stopped) | ||
| 67 | return; | ||
| 68 | |||
| 69 | pc = preempt_count(); | ||
| 70 | local_irq_save(flags); | ||
| 71 | cpu = raw_smp_processor_id(); | ||
| 72 | data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); | ||
| 73 | |||
| 74 | if (likely(!atomic_read(&data->disabled))) | ||
| 75 | tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); | ||
| 76 | |||
| 77 | local_irq_restore(flags); | ||
| 78 | } | ||
| 79 | |||
| 80 | void | ||
| 81 | tracing_sched_wakeup_trace(struct trace_array *tr, | ||
| 82 | struct task_struct *wakee, | ||
| 83 | struct task_struct *curr, | ||
| 84 | unsigned long flags, int pc) | ||
| 85 | { | ||
| 86 | struct ftrace_event_call *call = &event_wakeup; | ||
| 87 | struct ring_buffer_event *event; | ||
| 88 | struct ctx_switch_entry *entry; | ||
| 89 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
| 90 | |||
| 91 | event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, | ||
| 92 | sizeof(*entry), flags, pc); | ||
| 93 | if (!event) | ||
| 94 | return; | ||
| 95 | entry = ring_buffer_event_data(event); | ||
| 96 | entry->prev_pid = curr->pid; | ||
| 97 | entry->prev_prio = curr->prio; | ||
| 98 | entry->prev_state = curr->state; | ||
| 99 | entry->next_pid = wakee->pid; | ||
| 100 | entry->next_prio = wakee->prio; | ||
| 101 | entry->next_state = wakee->state; | ||
| 102 | entry->next_cpu = task_cpu(wakee); | ||
| 103 | |||
| 104 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
| 105 | trace_buffer_unlock_commit(buffer, event, flags, pc); | ||
| 106 | } | 28 | } |
| 107 | 29 | ||
| 108 | static void | 30 | static void |
| 109 | probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) | 31 | probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) |
| 110 | { | 32 | { |
| 111 | struct trace_array_cpu *data; | ||
| 112 | unsigned long flags; | ||
| 113 | int cpu, pc; | ||
| 114 | |||
| 115 | if (unlikely(!sched_ref)) | 33 | if (unlikely(!sched_ref)) |
| 116 | return; | 34 | return; |
| 117 | 35 | ||
| 118 | tracing_record_cmdline(current); | 36 | tracing_record_cmdline(current); |
| 119 | |||
| 120 | if (!tracer_enabled || sched_stopped) | ||
| 121 | return; | ||
| 122 | |||
| 123 | pc = preempt_count(); | ||
| 124 | local_irq_save(flags); | ||
| 125 | cpu = raw_smp_processor_id(); | ||
| 126 | data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); | ||
| 127 | |||
| 128 | if (likely(!atomic_read(&data->disabled))) | ||
| 129 | tracing_sched_wakeup_trace(ctx_trace, wakee, current, | ||
| 130 | flags, pc); | ||
| 131 | |||
| 132 | local_irq_restore(flags); | ||
| 133 | } | 37 | } |
| 134 | 38 | ||
| 135 | static int tracing_sched_register(void) | 39 | static int tracing_sched_register(void) |
| @@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void) | |||
| 197 | { | 101 | { |
| 198 | tracing_stop_sched_switch(); | 102 | tracing_stop_sched_switch(); |
| 199 | } | 103 | } |
| 200 | |||
| 201 | /** | ||
| 202 | * tracing_start_sched_switch_record - start tracing context switches | ||
| 203 | * | ||
| 204 | * Turns on context switch tracing for a tracer. | ||
| 205 | */ | ||
| 206 | void tracing_start_sched_switch_record(void) | ||
| 207 | { | ||
| 208 | if (unlikely(!ctx_trace)) { | ||
| 209 | WARN_ON(1); | ||
| 210 | return; | ||
| 211 | } | ||
| 212 | |||
| 213 | tracing_start_sched_switch(); | ||
| 214 | |||
| 215 | mutex_lock(&sched_register_mutex); | ||
| 216 | tracer_enabled++; | ||
| 217 | mutex_unlock(&sched_register_mutex); | ||
| 218 | } | ||
| 219 | |||
| 220 | /** | ||
| 221 | * tracing_stop_sched_switch_record - start tracing context switches | ||
| 222 | * | ||
| 223 | * Turns off context switch tracing for a tracer. | ||
| 224 | */ | ||
| 225 | void tracing_stop_sched_switch_record(void) | ||
| 226 | { | ||
| 227 | mutex_lock(&sched_register_mutex); | ||
| 228 | tracer_enabled--; | ||
| 229 | WARN_ON(tracer_enabled < 0); | ||
| 230 | mutex_unlock(&sched_register_mutex); | ||
| 231 | |||
| 232 | tracing_stop_sched_switch(); | ||
| 233 | } | ||
| 234 | |||
| 235 | /** | ||
| 236 | * tracing_sched_switch_assign_trace - assign a trace array for ctx switch | ||
| 237 | * @tr: trace array pointer to assign | ||
| 238 | * | ||
| 239 | * Some tracers might want to record the context switches in their | ||
| 240 | * trace. This function lets those tracers assign the trace array | ||
| 241 | * to use. | ||
| 242 | */ | ||
| 243 | void tracing_sched_switch_assign_trace(struct trace_array *tr) | ||
| 244 | { | ||
| 245 | ctx_trace = tr; | ||
| 246 | } | ||
| 247 | |||
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 19bd8928ce94..8fb84b362816 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
| @@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) | |||
| 365 | wakeup_current_cpu = cpu; | 365 | wakeup_current_cpu = cpu; |
| 366 | } | 366 | } |
| 367 | 367 | ||
| 368 | static void | ||
| 369 | tracing_sched_switch_trace(struct trace_array *tr, | ||
| 370 | struct task_struct *prev, | ||
| 371 | struct task_struct *next, | ||
| 372 | unsigned long flags, int pc) | ||
| 373 | { | ||
| 374 | struct ftrace_event_call *call = &event_context_switch; | ||
| 375 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
| 376 | struct ring_buffer_event *event; | ||
| 377 | struct ctx_switch_entry *entry; | ||
| 378 | |||
| 379 | event = trace_buffer_lock_reserve(buffer, TRACE_CTX, | ||
| 380 | sizeof(*entry), flags, pc); | ||
| 381 | if (!event) | ||
| 382 | return; | ||
| 383 | entry = ring_buffer_event_data(event); | ||
| 384 | entry->prev_pid = prev->pid; | ||
| 385 | entry->prev_prio = prev->prio; | ||
| 386 | entry->prev_state = prev->state; | ||
| 387 | entry->next_pid = next->pid; | ||
| 388 | entry->next_prio = next->prio; | ||
| 389 | entry->next_state = next->state; | ||
| 390 | entry->next_cpu = task_cpu(next); | ||
| 391 | |||
| 392 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
| 393 | trace_buffer_unlock_commit(buffer, event, flags, pc); | ||
| 394 | } | ||
| 395 | |||
| 396 | static void | ||
| 397 | tracing_sched_wakeup_trace(struct trace_array *tr, | ||
| 398 | struct task_struct *wakee, | ||
| 399 | struct task_struct *curr, | ||
| 400 | unsigned long flags, int pc) | ||
| 401 | { | ||
| 402 | struct ftrace_event_call *call = &event_wakeup; | ||
| 403 | struct ring_buffer_event *event; | ||
| 404 | struct ctx_switch_entry *entry; | ||
| 405 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
| 406 | |||
| 407 | event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, | ||
| 408 | sizeof(*entry), flags, pc); | ||
| 409 | if (!event) | ||
| 410 | return; | ||
| 411 | entry = ring_buffer_event_data(event); | ||
| 412 | entry->prev_pid = curr->pid; | ||
| 413 | entry->prev_prio = curr->prio; | ||
| 414 | entry->prev_state = curr->state; | ||
| 415 | entry->next_pid = wakee->pid; | ||
| 416 | entry->next_prio = wakee->prio; | ||
| 417 | entry->next_state = wakee->state; | ||
| 418 | entry->next_cpu = task_cpu(wakee); | ||
| 419 | |||
| 420 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
| 421 | trace_buffer_unlock_commit(buffer, event, flags, pc); | ||
| 422 | } | ||
| 423 | |||
| 368 | static void notrace | 424 | static void notrace |
| 369 | probe_wakeup_sched_switch(void *ignore, | 425 | probe_wakeup_sched_switch(void *ignore, |
| 370 | struct task_struct *prev, struct task_struct *next) | 426 | struct task_struct *prev, struct task_struct *next) |
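A hedged sketch of how the wakeup tracer's sched_switch probe is expected to call the helper just moved into this file; the real probe's wakeup_task checks are omitted, and wakeup_trace is assumed to be this file's trace_array pointer:

static void notrace example_probe_sched_switch(void *ignore,
		struct task_struct *prev, struct task_struct *next)
{
	unsigned long flags;
	int pc = preempt_count();

	local_irq_save(flags);
	/* record the switch into the wakeup tracer's buffer */
	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
	local_irq_restore(flags);
}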
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 1f24ed99dca2..f8b45d8792f9 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c | |||
| @@ -27,10 +27,19 @@ | |||
| 27 | #include <linux/trace_seq.h> | 27 | #include <linux/trace_seq.h> |
| 28 | 28 | ||
| 29 | /* How much buffer is left on the trace_seq? */ | 29 | /* How much buffer is left on the trace_seq? */ |
| 30 | #define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) | 30 | #define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq) |
| 31 | 31 | ||
| 32 | /* How much buffer is written? */ | 32 | /* How much buffer is written? */ |
| 33 | #define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) | 33 | #define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq) |
| 34 | |||
| 35 | /* | ||
| 36 | * trace_seq should work with being initialized with 0s. | ||
| 37 | */ | ||
| 38 | static inline void __trace_seq_init(struct trace_seq *s) | ||
| 39 | { | ||
| 40 | if (unlikely(!s->seq.size)) | ||
| 41 | trace_seq_init(s); | ||
| 42 | } | ||
| 34 | 43 | ||
| 35 | /** | 44 | /** |
| 36 | * trace_print_seq - move the contents of trace_seq into a seq_file | 45 | * trace_print_seq - move the contents of trace_seq into a seq_file |
| @@ -43,10 +52,11 @@ | |||
| 43 | */ | 52 | */ |
| 44 | int trace_print_seq(struct seq_file *m, struct trace_seq *s) | 53 | int trace_print_seq(struct seq_file *m, struct trace_seq *s) |
| 45 | { | 54 | { |
| 46 | unsigned int len = TRACE_SEQ_BUF_USED(s); | ||
| 47 | int ret; | 55 | int ret; |
| 48 | 56 | ||
| 49 | ret = seq_write(m, s->buffer, len); | 57 | __trace_seq_init(s); |
| 58 | |||
| 59 | ret = seq_buf_print_seq(m, &s->seq); | ||
| 50 | 60 | ||
| 51 | /* | 61 | /* |
| 52 | * Only reset this buffer if we successfully wrote to the | 62 | * Only reset this buffer if we successfully wrote to the |
| @@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) | |||
| 69 | * trace_seq_printf() is used to store strings into a special | 79 | * trace_seq_printf() is used to store strings into a special |
| 70 | * buffer (@s). Then the output may be either used by | 80 | * buffer (@s). Then the output may be either used by |
| 71 | * the sequencer or pulled into another buffer. | 81 | * the sequencer or pulled into another buffer. |
| 72 | * | ||
| 73 | * Returns 1 if we successfully written all the contents to | ||
| 74 | * the buffer. | ||
| 75 | * Returns 0 if we the length to write is bigger than the | ||
| 76 | * reserved buffer space. In this case, nothing gets written. | ||
| 77 | */ | 82 | */ |
| 78 | int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) | 83 | void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) |
| 79 | { | 84 | { |
| 80 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | 85 | unsigned int save_len = s->seq.len; |
| 81 | va_list ap; | 86 | va_list ap; |
| 82 | int ret; | ||
| 83 | 87 | ||
| 84 | if (s->full || !len) | 88 | if (s->full) |
| 85 | return 0; | 89 | return; |
| 90 | |||
| 91 | __trace_seq_init(s); | ||
| 86 | 92 | ||
| 87 | va_start(ap, fmt); | 93 | va_start(ap, fmt); |
| 88 | ret = vsnprintf(s->buffer + s->len, len, fmt, ap); | 94 | seq_buf_vprintf(&s->seq, fmt, ap); |
| 89 | va_end(ap); | 95 | va_end(ap); |
| 90 | 96 | ||
| 91 | /* If we can't write it all, don't bother writing anything */ | 97 | /* If we can't write it all, don't bother writing anything */ |
| 92 | if (ret >= len) { | 98 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { |
| 99 | s->seq.len = save_len; | ||
| 93 | s->full = 1; | 100 | s->full = 1; |
| 94 | return 0; | ||
| 95 | } | 101 | } |
| 96 | |||
| 97 | s->len += ret; | ||
| 98 | |||
| 99 | return 1; | ||
| 100 | } | 102 | } |
| 101 | EXPORT_SYMBOL_GPL(trace_seq_printf); | 103 | EXPORT_SYMBOL_GPL(trace_seq_printf); |
| 102 | 104 | ||
| @@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf); | |||
| 107 | * @nmaskbits: The number of bits that are valid in @maskp | 109 | * @nmaskbits: The number of bits that are valid in @maskp |
| 108 | * | 110 | * |
| 109 | * Writes an ASCII representation of a bitmask string into @s. | 111 | * Writes an ASCII representation of a bitmask string into @s. |
| 110 | * | ||
| 111 | * Returns 1 if we successfully written all the contents to | ||
| 112 | * the buffer. | ||
| 113 | * Returns 0 if we the length to write is bigger than the | ||
| 114 | * reserved buffer space. In this case, nothing gets written. | ||
| 115 | */ | 112 | */ |
| 116 | int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, | 113 | void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, |
| 117 | int nmaskbits) | 114 | int nmaskbits) |
| 118 | { | 115 | { |
| 119 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | 116 | unsigned int save_len = s->seq.len; |
| 120 | int ret; | ||
| 121 | 117 | ||
| 122 | if (s->full || !len) | 118 | if (s->full) |
| 123 | return 0; | 119 | return; |
| 124 | 120 | ||
| 125 | ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); | 121 | __trace_seq_init(s); |
| 126 | s->len += ret; | ||
| 127 | 122 | ||
| 128 | return 1; | 123 | seq_buf_bitmask(&s->seq, maskp, nmaskbits); |
| 124 | |||
| 125 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { | ||
| 126 | s->seq.len = save_len; | ||
| 127 | s->full = 1; | ||
| 128 | } | ||
| 129 | } | 129 | } |
| 130 | EXPORT_SYMBOL_GPL(trace_seq_bitmask); | 130 | EXPORT_SYMBOL_GPL(trace_seq_bitmask); |
| 131 | 131 | ||
| @@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); | |||
| 139 | * trace_seq_printf is used to store strings into a special | 139 | * trace_seq_printf is used to store strings into a special |
| 140 | * buffer (@s). Then the output may be either used by | 140 | * buffer (@s). Then the output may be either used by |
| 141 | * the sequencer or pulled into another buffer. | 141 | * the sequencer or pulled into another buffer. |
| 142 | * | ||
| 143 | * Returns how much it wrote to the buffer. | ||
| 144 | */ | 142 | */ |
| 145 | int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) | 143 | void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) |
| 146 | { | 144 | { |
| 147 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | 145 | unsigned int save_len = s->seq.len; |
| 148 | int ret; | ||
| 149 | 146 | ||
| 150 | if (s->full || !len) | 147 | if (s->full) |
| 151 | return 0; | 148 | return; |
| 152 | 149 | ||
| 153 | ret = vsnprintf(s->buffer + s->len, len, fmt, args); | 150 | __trace_seq_init(s); |
| 151 | |||
| 152 | seq_buf_vprintf(&s->seq, fmt, args); | ||
| 154 | 153 | ||
| 155 | /* If we can't write it all, don't bother writing anything */ | 154 | /* If we can't write it all, don't bother writing anything */ |
| 156 | if (ret >= len) { | 155 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { |
| 156 | s->seq.len = save_len; | ||
| 157 | s->full = 1; | 157 | s->full = 1; |
| 158 | return 0; | ||
| 159 | } | 158 | } |
| 160 | |||
| 161 | s->len += ret; | ||
| 162 | |||
| 163 | return len; | ||
| 164 | } | 159 | } |
| 165 | EXPORT_SYMBOL_GPL(trace_seq_vprintf); | 160 | EXPORT_SYMBOL_GPL(trace_seq_vprintf); |
| 166 | 161 | ||
| @@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf); | |||
| 178 | * | 173 | * |
| 179 | * This function will take the format and the binary array and finish | 174 | * This function will take the format and the binary array and finish |
| 180 | * the conversion into the ASCII string within the buffer. | 175 | * the conversion into the ASCII string within the buffer. |
| 181 | * | ||
| 182 | * Returns how much it wrote to the buffer. | ||
| 183 | */ | 176 | */ |
| 184 | int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) | 177 | void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) |
| 185 | { | 178 | { |
| 186 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | 179 | unsigned int save_len = s->seq.len; |
| 187 | int ret; | ||
| 188 | 180 | ||
| 189 | if (s->full || !len) | 181 | if (s->full) |
| 190 | return 0; | 182 | return; |
| 183 | |||
| 184 | __trace_seq_init(s); | ||
| 191 | 185 | ||
| 192 | ret = bstr_printf(s->buffer + s->len, len, fmt, binary); | 186 | seq_buf_bprintf(&s->seq, fmt, binary); |
| 193 | 187 | ||
| 194 | /* If we can't write it all, don't bother writing anything */ | 188 | /* If we can't write it all, don't bother writing anything */ |
| 195 | if (ret >= len) { | 189 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { |
| 190 | s->seq.len = save_len; | ||
| 196 | s->full = 1; | 191 | s->full = 1; |
| 197 | return 0; | 192 | return; |
| 198 | } | 193 | } |
| 199 | |||
| 200 | s->len += ret; | ||
| 201 | |||
| 202 | return len; | ||
| 203 | } | 194 | } |
| 204 | EXPORT_SYMBOL_GPL(trace_seq_bprintf); | 195 | EXPORT_SYMBOL_GPL(trace_seq_bprintf); |
| 205 | 196 | ||
| @@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf); | |||
| 212 | * copy to user routines. This function records a simple string | 203 | * copy to user routines. This function records a simple string |
| 213 | * into a special buffer (@s) for later retrieval by a sequencer | 204 | * into a special buffer (@s) for later retrieval by a sequencer |
| 214 | * or other mechanism. | 205 | * or other mechanism. |
| 215 | * | ||
| 216 | * Returns how much it wrote to the buffer. | ||
| 217 | */ | 206 | */ |
| 218 | int trace_seq_puts(struct trace_seq *s, const char *str) | 207 | void trace_seq_puts(struct trace_seq *s, const char *str) |
| 219 | { | 208 | { |
| 220 | unsigned int len = strlen(str); | 209 | unsigned int len = strlen(str); |
| 221 | 210 | ||
| 222 | if (s->full) | 211 | if (s->full) |
| 223 | return 0; | 212 | return; |
| 213 | |||
| 214 | __trace_seq_init(s); | ||
| 224 | 215 | ||
| 225 | if (len > TRACE_SEQ_BUF_LEFT(s)) { | 216 | if (len > TRACE_SEQ_BUF_LEFT(s)) { |
| 226 | s->full = 1; | 217 | s->full = 1; |
| 227 | return 0; | 218 | return; |
| 228 | } | 219 | } |
| 229 | 220 | ||
| 230 | memcpy(s->buffer + s->len, str, len); | 221 | seq_buf_putmem(&s->seq, str, len); |
| 231 | s->len += len; | ||
| 232 | |||
| 233 | return len; | ||
| 234 | } | 222 | } |
| 235 | EXPORT_SYMBOL_GPL(trace_seq_puts); | 223 | EXPORT_SYMBOL_GPL(trace_seq_puts); |
| 236 | 224 | ||
| @@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts); | |||
| 243 | * copy to user routines. This function records a simple character | 231 | * copy to user routines. This function records a simple character |
| 244 | * into a special buffer (@s) for later retrieval by a sequencer | 232 | * into a special buffer (@s) for later retrieval by a sequencer |
| 245 | * or other mechanism. | 233 | * or other mechanism. |
| 246 | * | ||
| 247 | * Returns how much it wrote to the buffer. | ||
| 248 | */ | 234 | */ |
| 249 | int trace_seq_putc(struct trace_seq *s, unsigned char c) | 235 | void trace_seq_putc(struct trace_seq *s, unsigned char c) |
| 250 | { | 236 | { |
| 251 | if (s->full) | 237 | if (s->full) |
| 252 | return 0; | 238 | return; |
| 239 | |||
| 240 | __trace_seq_init(s); | ||
| 253 | 241 | ||
| 254 | if (TRACE_SEQ_BUF_LEFT(s) < 1) { | 242 | if (TRACE_SEQ_BUF_LEFT(s) < 1) { |
| 255 | s->full = 1; | 243 | s->full = 1; |
| 256 | return 0; | 244 | return; |
| 257 | } | 245 | } |
| 258 | 246 | ||
| 259 | s->buffer[s->len++] = c; | 247 | seq_buf_putc(&s->seq, c); |
| 260 | |||
| 261 | return 1; | ||
| 262 | } | 248 | } |
| 263 | EXPORT_SYMBOL_GPL(trace_seq_putc); | 249 | EXPORT_SYMBOL_GPL(trace_seq_putc); |
| 264 | 250 | ||
| @@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc); | |||
| 271 | * There may be cases where raw memory needs to be written into the | 257 | * There may be cases where raw memory needs to be written into the |
| 272 | * buffer and a strcpy() would not work. Using this function allows | 258 | * buffer and a strcpy() would not work. Using this function allows |
| 273 | * for such cases. | 259 | * for such cases. |
| 274 | * | ||
| 275 | * Returns how much it wrote to the buffer. | ||
| 276 | */ | 260 | */ |
| 277 | int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) | 261 | void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) |
| 278 | { | 262 | { |
| 279 | if (s->full) | 263 | if (s->full) |
| 280 | return 0; | 264 | return; |
| 265 | |||
| 266 | __trace_seq_init(s); | ||
| 281 | 267 | ||
| 282 | if (len > TRACE_SEQ_BUF_LEFT(s)) { | 268 | if (len > TRACE_SEQ_BUF_LEFT(s)) { |
| 283 | s->full = 1; | 269 | s->full = 1; |
| 284 | return 0; | 270 | return; |
| 285 | } | 271 | } |
| 286 | 272 | ||
| 287 | memcpy(s->buffer + s->len, mem, len); | 273 | seq_buf_putmem(&s->seq, mem, len); |
| 288 | s->len += len; | ||
| 289 | |||
| 290 | return len; | ||
| 291 | } | 274 | } |
| 292 | EXPORT_SYMBOL_GPL(trace_seq_putmem); | 275 | EXPORT_SYMBOL_GPL(trace_seq_putmem); |
| 293 | 276 | ||
| 294 | #define MAX_MEMHEX_BYTES 8U | ||
| 295 | #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) | ||
| 296 | |||
| 297 | /** | 277 | /** |
| 298 | * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex | 278 | * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex |
| 299 | * @s: trace sequence descriptor | 279 | * @s: trace sequence descriptor |
| @@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem); | |||
| 303 | * This is similar to trace_seq_putmem() except instead of just copying the | 283 | * This is similar to trace_seq_putmem() except instead of just copying the |
| 304 | * raw memory into the buffer it writes its ASCII representation of it | 284 | * raw memory into the buffer it writes its ASCII representation of it |
| 305 | * in hex characters. | 285 | * in hex characters. |
| 306 | * | ||
| 307 | * Returns how much it wrote to the buffer. | ||
| 308 | */ | 286 | */ |
| 309 | int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, | 287 | void trace_seq_putmem_hex(struct trace_seq *s, const void *mem, |
| 310 | unsigned int len) | 288 | unsigned int len) |
| 311 | { | 289 | { |
| 312 | unsigned char hex[HEX_CHARS]; | 290 | unsigned int save_len = s->seq.len; |
| 313 | const unsigned char *data = mem; | ||
| 314 | unsigned int start_len; | ||
| 315 | int i, j; | ||
| 316 | int cnt = 0; | ||
| 317 | 291 | ||
| 318 | if (s->full) | 292 | if (s->full) |
| 319 | return 0; | 293 | return; |
| 320 | 294 | ||
| 321 | while (len) { | 295 | __trace_seq_init(s); |
| 322 | start_len = min(len, HEX_CHARS - 1); | 296 | |
| 323 | #ifdef __BIG_ENDIAN | 297 | /* Each byte is represented by two chars */ |
| 324 | for (i = 0, j = 0; i < start_len; i++) { | 298 | if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) { |
| 325 | #else | 299 | s->full = 1; |
| 326 | for (i = start_len-1, j = 0; i >= 0; i--) { | 300 | return; |
| 327 | #endif | 301 | } |
| 328 | hex[j++] = hex_asc_hi(data[i]); | 302 | |
| 329 | hex[j++] = hex_asc_lo(data[i]); | 303 | /* The added spaces can still cause an overflow */ |
| 330 | } | 304 | seq_buf_putmem_hex(&s->seq, mem, len); |
| 331 | if (WARN_ON_ONCE(j == 0 || j/2 > len)) | 305 | |
| 332 | break; | 306 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { |
| 333 | 307 | s->seq.len = save_len; | |
| 334 | /* j increments twice per loop */ | 308 | s->full = 1; |
| 335 | len -= j / 2; | 309 | return; |
| 336 | hex[j++] = ' '; | ||
| 337 | |||
| 338 | cnt += trace_seq_putmem(s, hex, j); | ||
| 339 | } | 310 | } |
| 340 | return cnt; | ||
| 341 | } | 311 | } |
| 342 | EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); | 312 | EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); |
| 343 | 313 | ||
| @@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); | |||
| 355 | */ | 325 | */ |
| 356 | int trace_seq_path(struct trace_seq *s, const struct path *path) | 326 | int trace_seq_path(struct trace_seq *s, const struct path *path) |
| 357 | { | 327 | { |
| 358 | unsigned char *p; | 328 | unsigned int save_len = s->seq.len; |
| 359 | 329 | ||
| 360 | if (s->full) | 330 | if (s->full) |
| 361 | return 0; | 331 | return 0; |
| 362 | 332 | ||
| 333 | __trace_seq_init(s); | ||
| 334 | |||
| 363 | if (TRACE_SEQ_BUF_LEFT(s) < 1) { | 335 | if (TRACE_SEQ_BUF_LEFT(s) < 1) { |
| 364 | s->full = 1; | 336 | s->full = 1; |
| 365 | return 0; | 337 | return 0; |
| 366 | } | 338 | } |
| 367 | 339 | ||
| 368 | p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); | 340 | seq_buf_path(&s->seq, path, "\n"); |
| 369 | if (!IS_ERR(p)) { | 341 | |
| 370 | p = mangle_path(s->buffer + s->len, p, "\n"); | 342 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { |
| 371 | if (p) { | 343 | s->seq.len = save_len; |
| 372 | s->len = p - s->buffer; | 344 | s->full = 1; |
| 373 | return 1; | 345 | return 0; |
| 374 | } | ||
| 375 | } else { | ||
| 376 | s->buffer[s->len++] = '?'; | ||
| 377 | return 1; | ||
| 378 | } | 346 | } |
| 379 | 347 | ||
| 380 | s->full = 1; | 348 | return 1; |
| 381 | return 0; | ||
| 382 | } | 349 | } |
| 383 | EXPORT_SYMBOL_GPL(trace_seq_path); | 350 | EXPORT_SYMBOL_GPL(trace_seq_path); |
| 384 | 351 | ||
| @@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path); | |||
| 404 | */ | 371 | */ |
| 405 | int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) | 372 | int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) |
| 406 | { | 373 | { |
| 407 | int len; | 374 | __trace_seq_init(s); |
| 408 | int ret; | 375 | return seq_buf_to_user(&s->seq, ubuf, cnt); |
| 409 | |||
| 410 | if (!cnt) | ||
| 411 | return 0; | ||
| 412 | |||
| 413 | if (s->len <= s->readpos) | ||
| 414 | return -EBUSY; | ||
| 415 | |||
| 416 | len = s->len - s->readpos; | ||
| 417 | if (cnt > len) | ||
| 418 | cnt = len; | ||
| 419 | ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); | ||
| 420 | if (ret == cnt) | ||
| 421 | return -EFAULT; | ||
| 422 | |||
| 423 | cnt -= ret; | ||
| 424 | |||
| 425 | s->readpos += cnt; | ||
| 426 | return cnt; | ||
| 427 | } | 376 | } |
| 428 | EXPORT_SYMBOL_GPL(trace_seq_to_user); | 377 | EXPORT_SYMBOL_GPL(trace_seq_to_user); |
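The net effect of the trace_seq conversion above: writers no longer check a per-call return value, they emit everything and test the sequence once at the end (trace_handle_return() wraps the same final test). A minimal sketch of a print callback under the new API; the event fields printed are illustrative:

static enum print_line_t example_print_line(struct trace_iterator *iter)
{
	struct trace_seq *s = &iter->seq;

	trace_seq_printf(s, "pid=%d ", iter->ent->pid);
	trace_seq_puts(s, "example event");
	trace_seq_putc(s, '\n');

	/* one overflow check at the end instead of one per call */
	return trace_seq_has_overflowed(s) ?
		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
}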
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 29228c4d5696..f97f6e3a676c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, | |||
| 114 | struct trace_entry *ent = iter->ent; | 114 | struct trace_entry *ent = iter->ent; |
| 115 | struct syscall_trace_enter *trace; | 115 | struct syscall_trace_enter *trace; |
| 116 | struct syscall_metadata *entry; | 116 | struct syscall_metadata *entry; |
| 117 | int i, ret, syscall; | 117 | int i, syscall; |
| 118 | 118 | ||
| 119 | trace = (typeof(trace))ent; | 119 | trace = (typeof(trace))ent; |
| 120 | syscall = trace->nr; | 120 | syscall = trace->nr; |
| @@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags, | |||
| 128 | goto end; | 128 | goto end; |
| 129 | } | 129 | } |
| 130 | 130 | ||
| 131 | ret = trace_seq_printf(s, "%s(", entry->name); | 131 | trace_seq_printf(s, "%s(", entry->name); |
| 132 | if (!ret) | ||
| 133 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 134 | 132 | ||
| 135 | for (i = 0; i < entry->nb_args; i++) { | 133 | for (i = 0; i < entry->nb_args; i++) { |
| 134 | |||
| 135 | if (trace_seq_has_overflowed(s)) | ||
| 136 | goto end; | ||
| 137 | |||
| 136 | /* parameter types */ | 138 | /* parameter types */ |
| 137 | if (trace_flags & TRACE_ITER_VERBOSE) { | 139 | if (trace_flags & TRACE_ITER_VERBOSE) |
| 138 | ret = trace_seq_printf(s, "%s ", entry->types[i]); | 140 | trace_seq_printf(s, "%s ", entry->types[i]); |
| 139 | if (!ret) | 141 | |
| 140 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 141 | } | ||
| 142 | /* parameter values */ | 142 | /* parameter values */ |
| 143 | ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], | 143 | trace_seq_printf(s, "%s: %lx%s", entry->args[i], |
| 144 | trace->args[i], | 144 | trace->args[i], |
| 145 | i == entry->nb_args - 1 ? "" : ", "); | 145 | i == entry->nb_args - 1 ? "" : ", "); |
| 146 | if (!ret) | ||
| 147 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 148 | } | 146 | } |
| 149 | 147 | ||
| 150 | ret = trace_seq_putc(s, ')'); | 148 | trace_seq_putc(s, ')'); |
| 151 | if (!ret) | ||
| 152 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 153 | |||
| 154 | end: | 149 | end: |
| 155 | ret = trace_seq_putc(s, '\n'); | 150 | trace_seq_putc(s, '\n'); |
| 156 | if (!ret) | ||
| 157 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 158 | 151 | ||
| 159 | return TRACE_TYPE_HANDLED; | 152 | return trace_handle_return(s); |
| 160 | } | 153 | } |
| 161 | 154 | ||
| 162 | static enum print_line_t | 155 | static enum print_line_t |
| @@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags, | |||
| 168 | struct syscall_trace_exit *trace; | 161 | struct syscall_trace_exit *trace; |
| 169 | int syscall; | 162 | int syscall; |
| 170 | struct syscall_metadata *entry; | 163 | struct syscall_metadata *entry; |
| 171 | int ret; | ||
| 172 | 164 | ||
| 173 | trace = (typeof(trace))ent; | 165 | trace = (typeof(trace))ent; |
| 174 | syscall = trace->nr; | 166 | syscall = trace->nr; |
| @@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags, | |||
| 176 | 168 | ||
| 177 | if (!entry) { | 169 | if (!entry) { |
| 178 | trace_seq_putc(s, '\n'); | 170 | trace_seq_putc(s, '\n'); |
| 179 | return TRACE_TYPE_HANDLED; | 171 | goto out; |
| 180 | } | 172 | } |
| 181 | 173 | ||
| 182 | if (entry->exit_event->event.type != ent->type) { | 174 | if (entry->exit_event->event.type != ent->type) { |
| @@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags, | |||
| 184 | return TRACE_TYPE_UNHANDLED; | 176 | return TRACE_TYPE_UNHANDLED; |
| 185 | } | 177 | } |
| 186 | 178 | ||
| 187 | ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, | 179 | trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, |
| 188 | trace->ret); | 180 | trace->ret); |
| 189 | if (!ret) | ||
| 190 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 191 | 181 | ||
| 192 | return TRACE_TYPE_HANDLED; | 182 | out: |
| 183 | return trace_handle_return(s); | ||
| 193 | } | 184 | } |
| 194 | 185 | ||
| 195 | extern char *__bad_type_size(void); | 186 | extern char *__bad_type_size(void); |
| @@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) | |||
| 523 | return (unsigned long)sys_call_table[nr]; | 514 | return (unsigned long)sys_call_table[nr]; |
| 524 | } | 515 | } |
| 525 | 516 | ||
| 526 | static int __init init_ftrace_syscalls(void) | 517 | void __init init_ftrace_syscalls(void) |
| 527 | { | 518 | { |
| 528 | struct syscall_metadata *meta; | 519 | struct syscall_metadata *meta; |
| 529 | unsigned long addr; | 520 | unsigned long addr; |
| @@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void) | |||
| 533 | GFP_KERNEL); | 524 | GFP_KERNEL); |
| 534 | if (!syscalls_metadata) { | 525 | if (!syscalls_metadata) { |
| 535 | WARN_ON(1); | 526 | WARN_ON(1); |
| 536 | return -ENOMEM; | 527 | return; |
| 537 | } | 528 | } |
| 538 | 529 | ||
| 539 | for (i = 0; i < NR_syscalls; i++) { | 530 | for (i = 0; i < NR_syscalls; i++) { |
| @@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void) | |||
| 545 | meta->syscall_nr = i; | 536 | meta->syscall_nr = i; |
| 546 | syscalls_metadata[i] = meta; | 537 | syscalls_metadata[i] = meta; |
| 547 | } | 538 | } |
| 548 | |||
| 549 | return 0; | ||
| 550 | } | 539 | } |
| 551 | early_initcall(init_ftrace_syscalls); | ||
| 552 | 540 | ||
| 553 | #ifdef CONFIG_PERF_EVENTS | 541 | #ifdef CONFIG_PERF_EVENTS |
| 554 | 542 | ||
| @@ -586,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
| 586 | size -= sizeof(u32); | 574 | size -= sizeof(u32); |
| 587 | 575 | ||
| 588 | rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, | 576 | rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, |
| 589 | sys_data->enter_event->event.type, regs, &rctx); | 577 | sys_data->enter_event->event.type, NULL, &rctx); |
| 590 | if (!rec) | 578 | if (!rec) |
| 591 | return; | 579 | return; |
| 592 | 580 | ||
| @@ -659,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
| 659 | size -= sizeof(u32); | 647 | size -= sizeof(u32); |
| 660 | 648 | ||
| 661 | rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, | 649 | rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, |
| 662 | sys_data->exit_event->event.type, regs, &rctx); | 650 | sys_data->exit_event->event.type, NULL, &rctx); |
| 663 | if (!rec) | 651 | if (!rec) |
| 664 | return; | 652 | return; |
| 665 | 653 | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 33ff6a24b802..b11441321e7a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -552,8 +552,7 @@ error: | |||
| 552 | return ret; | 552 | return ret; |
| 553 | 553 | ||
| 554 | fail_address_parse: | 554 | fail_address_parse: |
| 555 | if (inode) | 555 | iput(inode); |
| 556 | iput(inode); | ||
| 557 | 556 | ||
| 558 | pr_info("Failed to parse address or file.\n"); | 557 | pr_info("Failed to parse address or file.\n"); |
| 559 | 558 | ||
| @@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v) | |||
| 606 | for (i = 0; i < tu->tp.nr_args; i++) | 605 | for (i = 0; i < tu->tp.nr_args; i++) |
| 607 | seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); | 606 | seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); |
| 608 | 607 | ||
| 609 | seq_printf(m, "\n"); | 608 | seq_putc(m, '\n'); |
| 610 | return 0; | 609 | return 0; |
| 611 | } | 610 | } |
| 612 | 611 | ||
| @@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e | |||
| 852 | tu = container_of(event, struct trace_uprobe, tp.call.event); | 851 | tu = container_of(event, struct trace_uprobe, tp.call.event); |
| 853 | 852 | ||
| 854 | if (is_ret_probe(tu)) { | 853 | if (is_ret_probe(tu)) { |
| 855 | if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", | 854 | trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", |
| 856 | ftrace_event_name(&tu->tp.call), | 855 | ftrace_event_name(&tu->tp.call), |
| 857 | entry->vaddr[1], entry->vaddr[0])) | 856 | entry->vaddr[1], entry->vaddr[0]); |
| 858 | goto partial; | ||
| 859 | data = DATAOF_TRACE_ENTRY(entry, true); | 857 | data = DATAOF_TRACE_ENTRY(entry, true); |
| 860 | } else { | 858 | } else { |
| 861 | if (!trace_seq_printf(s, "%s: (0x%lx)", | 859 | trace_seq_printf(s, "%s: (0x%lx)", |
| 862 | ftrace_event_name(&tu->tp.call), | 860 | ftrace_event_name(&tu->tp.call), |
| 863 | entry->vaddr[0])) | 861 | entry->vaddr[0]); |
| 864 | goto partial; | ||
| 865 | data = DATAOF_TRACE_ENTRY(entry, false); | 862 | data = DATAOF_TRACE_ENTRY(entry, false); |
| 866 | } | 863 | } |
| 867 | 864 | ||
| @@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e | |||
| 869 | struct probe_arg *parg = &tu->tp.args[i]; | 866 | struct probe_arg *parg = &tu->tp.args[i]; |
| 870 | 867 | ||
| 871 | if (!parg->type->print(s, parg->name, data + parg->offset, entry)) | 868 | if (!parg->type->print(s, parg->name, data + parg->offset, entry)) |
| 872 | goto partial; | 869 | goto out; |
| 873 | } | 870 | } |
| 874 | 871 | ||
| 875 | if (trace_seq_puts(s, "\n")) | 872 | trace_seq_putc(s, '\n'); |
| 876 | return TRACE_TYPE_HANDLED; | ||
| 877 | 873 | ||
| 878 | partial: | 874 | out: |
| 879 | return TRACE_TYPE_PARTIAL_LINE; | 875 | return trace_handle_return(s); |
| 880 | } | 876 | } |
| 881 | 877 | ||
| 882 | typedef bool (*filter_func_t)(struct uprobe_consumer *self, | 878 | typedef bool (*filter_func_t)(struct uprobe_consumer *self, |
| @@ -1115,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, | |||
| 1115 | if (hlist_empty(head)) | 1111 | if (hlist_empty(head)) |
| 1116 | goto out; | 1112 | goto out; |
| 1117 | 1113 | ||
| 1118 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1114 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
| 1119 | if (!entry) | 1115 | if (!entry) |
| 1120 | goto out; | 1116 | goto out; |
| 1121 | 1117 | ||
diff --git a/kernel/uid16.c b/kernel/uid16.c index 602e5bbbceff..d58cc4d8f0d1 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
| @@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) | |||
| 176 | struct group_info *group_info; | 176 | struct group_info *group_info; |
| 177 | int retval; | 177 | int retval; |
| 178 | 178 | ||
| 179 | if (!ns_capable(current_user_ns(), CAP_SETGID)) | 179 | if (!may_setgroups()) |
| 180 | return -EPERM; | 180 | return -EPERM; |
| 181 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 181 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
| 182 | return -EINVAL; | 182 | return -EINVAL; |
diff --git a/kernel/user.c b/kernel/user.c index 4efa39350e44..b069ccbfb0b0 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -50,7 +50,11 @@ struct user_namespace init_user_ns = { | |||
| 50 | .count = ATOMIC_INIT(3), | 50 | .count = ATOMIC_INIT(3), |
| 51 | .owner = GLOBAL_ROOT_UID, | 51 | .owner = GLOBAL_ROOT_UID, |
| 52 | .group = GLOBAL_ROOT_GID, | 52 | .group = GLOBAL_ROOT_GID, |
| 53 | .proc_inum = PROC_USER_INIT_INO, | 53 | .ns.inum = PROC_USER_INIT_INO, |
| 54 | #ifdef CONFIG_USER_NS | ||
| 55 | .ns.ops = &userns_operations, | ||
| 56 | #endif | ||
| 57 | .flags = USERNS_INIT_FLAGS, | ||
| 54 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 58 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
| 55 | .persistent_keyring_register_sem = | 59 | .persistent_keyring_register_sem = |
| 56 | __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), | 60 | __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index aa312b0dc3ec..4109f8320684 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | #include <linux/fs_struct.h> | 24 | #include <linux/fs_struct.h> |
| 25 | 25 | ||
| 26 | static struct kmem_cache *user_ns_cachep __read_mostly; | 26 | static struct kmem_cache *user_ns_cachep __read_mostly; |
| 27 | static DEFINE_MUTEX(userns_state_mutex); | ||
| 27 | 28 | ||
| 28 | static bool new_idmap_permitted(const struct file *file, | 29 | static bool new_idmap_permitted(const struct file *file, |
| 29 | struct user_namespace *ns, int cap_setid, | 30 | struct user_namespace *ns, int cap_setid, |
| @@ -86,11 +87,12 @@ int create_user_ns(struct cred *new) | |||
| 86 | if (!ns) | 87 | if (!ns) |
| 87 | return -ENOMEM; | 88 | return -ENOMEM; |
| 88 | 89 | ||
| 89 | ret = proc_alloc_inum(&ns->proc_inum); | 90 | ret = ns_alloc_inum(&ns->ns); |
| 90 | if (ret) { | 91 | if (ret) { |
| 91 | kmem_cache_free(user_ns_cachep, ns); | 92 | kmem_cache_free(user_ns_cachep, ns); |
| 92 | return ret; | 93 | return ret; |
| 93 | } | 94 | } |
| 95 | ns->ns.ops = &userns_operations; | ||
| 94 | 96 | ||
| 95 | atomic_set(&ns->count, 1); | 97 | atomic_set(&ns->count, 1); |
| 96 | /* Leave the new->user_ns reference with the new user namespace. */ | 98 | /* Leave the new->user_ns reference with the new user namespace. */ |
| @@ -99,6 +101,11 @@ int create_user_ns(struct cred *new) | |||
| 99 | ns->owner = owner; | 101 | ns->owner = owner; |
| 100 | ns->group = group; | 102 | ns->group = group; |
| 101 | 103 | ||
| 104 | /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ | ||
| 105 | mutex_lock(&userns_state_mutex); | ||
| 106 | ns->flags = parent_ns->flags; | ||
| 107 | mutex_unlock(&userns_state_mutex); | ||
| 108 | |||
| 102 | set_cred_user_ns(new, ns); | 109 | set_cred_user_ns(new, ns); |
| 103 | 110 | ||
| 104 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 111 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
| @@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns) | |||
| 136 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 143 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
| 137 | key_put(ns->persistent_keyring_register); | 144 | key_put(ns->persistent_keyring_register); |
| 138 | #endif | 145 | #endif |
| 139 | proc_free_inum(ns->proc_inum); | 146 | ns_free_inum(&ns->ns); |
| 140 | kmem_cache_free(user_ns_cachep, ns); | 147 | kmem_cache_free(user_ns_cachep, ns); |
| 141 | ns = parent; | 148 | ns = parent; |
| 142 | } while (atomic_dec_and_test(&parent->count)); | 149 | } while (atomic_dec_and_test(&parent->count)); |
| @@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map, | |||
| 583 | return false; | 590 | return false; |
| 584 | } | 591 | } |
| 585 | 592 | ||
| 586 | |||
| 587 | static DEFINE_MUTEX(id_map_mutex); | ||
| 588 | |||
| 589 | static ssize_t map_write(struct file *file, const char __user *buf, | 593 | static ssize_t map_write(struct file *file, const char __user *buf, |
| 590 | size_t count, loff_t *ppos, | 594 | size_t count, loff_t *ppos, |
| 591 | int cap_setid, | 595 | int cap_setid, |
| @@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 602 | ssize_t ret = -EINVAL; | 606 | ssize_t ret = -EINVAL; |
| 603 | 607 | ||
| 604 | /* | 608 | /* |
| 605 | * The id_map_mutex serializes all writes to any given map. | 609 | * The userns_state_mutex serializes all writes to any given map. |
| 606 | * | 610 | * |
| 607 | * Any map is only ever written once. | 611 | * Any map is only ever written once. |
| 608 | * | 612 | * |
| @@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 620 | * order and smp_rmb() is guaranteed that we don't have crazy | 624 | * order and smp_rmb() is guaranteed that we don't have crazy |
| 621 | * architectures returning stale data. | 625 | * architectures returning stale data. |
| 622 | */ | 626 | */ |
| 623 | mutex_lock(&id_map_mutex); | 627 | mutex_lock(&userns_state_mutex); |
| 624 | 628 | ||
| 625 | ret = -EPERM; | 629 | ret = -EPERM; |
| 626 | /* Only allow one successful write to the map */ | 630 | /* Only allow one successful write to the map */ |
| @@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 640 | if (!page) | 644 | if (!page) |
| 641 | goto out; | 645 | goto out; |
| 642 | 646 | ||
| 643 | /* Only allow <= page size writes at the beginning of the file */ | 647 | /* Only allow < page size writes at the beginning of the file */ |
| 644 | ret = -EINVAL; | 648 | ret = -EINVAL; |
| 645 | if ((*ppos != 0) || (count >= PAGE_SIZE)) | 649 | if ((*ppos != 0) || (count >= PAGE_SIZE)) |
| 646 | goto out; | 650 | goto out; |
| @@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 750 | *ppos = count; | 754 | *ppos = count; |
| 751 | ret = count; | 755 | ret = count; |
| 752 | out: | 756 | out: |
| 753 | mutex_unlock(&id_map_mutex); | 757 | mutex_unlock(&userns_state_mutex); |
| 754 | if (page) | 758 | if (page) |
| 755 | free_page(page); | 759 | free_page(page); |
| 756 | return ret; | 760 | return ret; |
| @@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file, | |||
| 812 | struct user_namespace *ns, int cap_setid, | 816 | struct user_namespace *ns, int cap_setid, |
| 813 | struct uid_gid_map *new_map) | 817 | struct uid_gid_map *new_map) |
| 814 | { | 818 | { |
| 815 | /* Allow mapping to your own filesystem ids */ | 819 | const struct cred *cred = file->f_cred; |
| 816 | if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { | 820 | /* Don't allow mappings that would allow anything that wouldn't |
| 821 | * be allowed without the establishment of unprivileged mappings. | ||
| 822 | */ | ||
| 823 | if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && | ||
| 824 | uid_eq(ns->owner, cred->euid)) { | ||
| 817 | u32 id = new_map->extent[0].lower_first; | 825 | u32 id = new_map->extent[0].lower_first; |
| 818 | if (cap_setid == CAP_SETUID) { | 826 | if (cap_setid == CAP_SETUID) { |
| 819 | kuid_t uid = make_kuid(ns->parent, id); | 827 | kuid_t uid = make_kuid(ns->parent, id); |
| 820 | if (uid_eq(uid, file->f_cred->fsuid)) | 828 | if (uid_eq(uid, cred->euid)) |
| 821 | return true; | 829 | return true; |
| 822 | } else if (cap_setid == CAP_SETGID) { | 830 | } else if (cap_setid == CAP_SETGID) { |
| 823 | kgid_t gid = make_kgid(ns->parent, id); | 831 | kgid_t gid = make_kgid(ns->parent, id); |
| 824 | if (gid_eq(gid, file->f_cred->fsgid)) | 832 | if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && |
| 833 | gid_eq(gid, cred->egid)) | ||
| 825 | return true; | 834 | return true; |
| 826 | } | 835 | } |
| 827 | } | 836 | } |
| @@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file, | |||
| 841 | return false; | 850 | return false; |
| 842 | } | 851 | } |
| 843 | 852 | ||
| 844 | static void *userns_get(struct task_struct *task) | 853 | int proc_setgroups_show(struct seq_file *seq, void *v) |
| 854 | { | ||
| 855 | struct user_namespace *ns = seq->private; | ||
| 856 | unsigned long userns_flags = ACCESS_ONCE(ns->flags); | ||
| 857 | |||
| 858 | seq_printf(seq, "%s\n", | ||
| 859 | (userns_flags & USERNS_SETGROUPS_ALLOWED) ? | ||
| 860 | "allow" : "deny"); | ||
| 861 | return 0; | ||
| 862 | } | ||
| 863 | |||
| 864 | ssize_t proc_setgroups_write(struct file *file, const char __user *buf, | ||
| 865 | size_t count, loff_t *ppos) | ||
| 866 | { | ||
| 867 | struct seq_file *seq = file->private_data; | ||
| 868 | struct user_namespace *ns = seq->private; | ||
| 869 | char kbuf[8], *pos; | ||
| 870 | bool setgroups_allowed; | ||
| 871 | ssize_t ret; | ||
| 872 | |||
| 873 | /* Only allow a very narrow range of strings to be written */ | ||
| 874 | ret = -EINVAL; | ||
| 875 | if ((*ppos != 0) || (count >= sizeof(kbuf))) | ||
| 876 | goto out; | ||
| 877 | |||
| 878 | /* What was written? */ | ||
| 879 | ret = -EFAULT; | ||
| 880 | if (copy_from_user(kbuf, buf, count)) | ||
| 881 | goto out; | ||
| 882 | kbuf[count] = '\0'; | ||
| 883 | pos = kbuf; | ||
| 884 | |||
| 885 | /* What is being requested? */ | ||
| 886 | ret = -EINVAL; | ||
| 887 | if (strncmp(pos, "allow", 5) == 0) { | ||
| 888 | pos += 5; | ||
| 889 | setgroups_allowed = true; | ||
| 890 | } | ||
| 891 | else if (strncmp(pos, "deny", 4) == 0) { | ||
| 892 | pos += 4; | ||
| 893 | setgroups_allowed = false; | ||
| 894 | } | ||
| 895 | else | ||
| 896 | goto out; | ||
| 897 | |||
| 898 | /* Verify there is no trailing junk on the line */ | ||
| 899 | pos = skip_spaces(pos); | ||
| 900 | if (*pos != '\0') | ||
| 901 | goto out; | ||
| 902 | |||
| 903 | ret = -EPERM; | ||
| 904 | mutex_lock(&userns_state_mutex); | ||
| 905 | if (setgroups_allowed) { | ||
| 906 | /* Enabling setgroups after setgroups has been disabled | ||
| 907 | * is not allowed. | ||
| 908 | */ | ||
| 909 | if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) | ||
| 910 | goto out_unlock; | ||
| 911 | } else { | ||
| 912 | /* Permanently disabling setgroups after setgroups has | ||
| 913 | * been enabled by writing the gid_map is not allowed. | ||
| 914 | */ | ||
| 915 | if (ns->gid_map.nr_extents != 0) | ||
| 916 | goto out_unlock; | ||
| 917 | ns->flags &= ~USERNS_SETGROUPS_ALLOWED; | ||
| 918 | } | ||
| 919 | mutex_unlock(&userns_state_mutex); | ||
| 920 | |||
| 921 | /* Report a successful write */ | ||
| 922 | *ppos = count; | ||
| 923 | ret = count; | ||
| 924 | out: | ||
| 925 | return ret; | ||
| 926 | out_unlock: | ||
| 927 | mutex_unlock(&userns_state_mutex); | ||
| 928 | goto out; | ||
| 929 | } | ||
| 930 | |||
| 931 | bool userns_may_setgroups(const struct user_namespace *ns) | ||
| 932 | { | ||
| 933 | bool allowed; | ||
| 934 | |||
| 935 | mutex_lock(&userns_state_mutex); | ||
| 936 | /* It is not safe to use setgroups until a gid mapping in | ||
| 937 | * the user namespace has been established. | ||
| 938 | */ | ||
| 939 | allowed = ns->gid_map.nr_extents != 0; | ||
| 940 | /* Is setgroups allowed? */ | ||
| 941 | allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); | ||
| 942 | mutex_unlock(&userns_state_mutex); | ||
| 943 | |||
| 944 | return allowed; | ||
| 945 | } | ||
| 946 | |||
| 947 | static inline struct user_namespace *to_user_ns(struct ns_common *ns) | ||
| 948 | { | ||
| 949 | return container_of(ns, struct user_namespace, ns); | ||
| 950 | } | ||
| 951 | |||
| 952 | static struct ns_common *userns_get(struct task_struct *task) | ||
| 845 | { | 953 | { |
| 846 | struct user_namespace *user_ns; | 954 | struct user_namespace *user_ns; |
| 847 | 955 | ||
| @@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task) | |||
| 849 | user_ns = get_user_ns(__task_cred(task)->user_ns); | 957 | user_ns = get_user_ns(__task_cred(task)->user_ns); |
| 850 | rcu_read_unlock(); | 958 | rcu_read_unlock(); |
| 851 | 959 | ||
| 852 | return user_ns; | 960 | return user_ns ? &user_ns->ns : NULL; |
| 853 | } | 961 | } |
| 854 | 962 | ||
| 855 | static void userns_put(void *ns) | 963 | static void userns_put(struct ns_common *ns) |
| 856 | { | 964 | { |
| 857 | put_user_ns(ns); | 965 | put_user_ns(to_user_ns(ns)); |
| 858 | } | 966 | } |
| 859 | 967 | ||
| 860 | static int userns_install(struct nsproxy *nsproxy, void *ns) | 968 | static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) |
| 861 | { | 969 | { |
| 862 | struct user_namespace *user_ns = ns; | 970 | struct user_namespace *user_ns = to_user_ns(ns); |
| 863 | struct cred *cred; | 971 | struct cred *cred; |
| 864 | 972 | ||
| 865 | /* Don't allow gaining capabilities by reentering | 973 | /* Don't allow gaining capabilities by reentering |
| @@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) | |||
| 888 | return commit_creds(cred); | 996 | return commit_creds(cred); |
| 889 | } | 997 | } |
| 890 | 998 | ||
| 891 | static unsigned int userns_inum(void *ns) | ||
| 892 | { | ||
| 893 | struct user_namespace *user_ns = ns; | ||
| 894 | return user_ns->proc_inum; | ||
| 895 | } | ||
| 896 | |||
| 897 | const struct proc_ns_operations userns_operations = { | 999 | const struct proc_ns_operations userns_operations = { |
| 898 | .name = "user", | 1000 | .name = "user", |
| 899 | .type = CLONE_NEWUSER, | 1001 | .type = CLONE_NEWUSER, |
| 900 | .get = userns_get, | 1002 | .get = userns_get, |
| 901 | .put = userns_put, | 1003 | .put = userns_put, |
| 902 | .install = userns_install, | 1004 | .install = userns_install, |
| 903 | .inum = userns_inum, | ||
| 904 | }; | 1005 | }; |
| 905 | 1006 | ||
| 906 | static __init int user_namespaces_init(void) | 1007 | static __init int user_namespaces_init(void) |
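From userspace, the setgroups gate added above changes the setup order for unprivileged user namespaces: "deny" must be written to /proc/self/setgroups before an unprivileged process may write a gid_map that maps its own egid. A hedged userspace sketch (not part of this patch), with error handling trimmed:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	char map[64];

	if (unshare(CLONE_NEWUSER)) {			/* new user namespace */
		perror("unshare");
		return 1;
	}
	write_file("/proc/self/setgroups", "deny");	/* must precede gid_map */
	snprintf(map, sizeof(map), "0 %u 1", (unsigned)getgid());
	write_file("/proc/self/gid_map", map);		/* now permitted */
	snprintf(map, sizeof(map), "0 %u 1", (unsigned)getuid());
	write_file("/proc/self/uid_map", map);
	return 0;
}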
diff --git a/kernel/utsname.c b/kernel/utsname.c index 883aaaa7de8a..831ea7108232 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
| @@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, | |||
| 42 | if (!ns) | 42 | if (!ns) |
| 43 | return ERR_PTR(-ENOMEM); | 43 | return ERR_PTR(-ENOMEM); |
| 44 | 44 | ||
| 45 | err = proc_alloc_inum(&ns->proc_inum); | 45 | err = ns_alloc_inum(&ns->ns); |
| 46 | if (err) { | 46 | if (err) { |
| 47 | kfree(ns); | 47 | kfree(ns); |
| 48 | return ERR_PTR(err); | 48 | return ERR_PTR(err); |
| 49 | } | 49 | } |
| 50 | 50 | ||
| 51 | ns->ns.ops = &utsns_operations; | ||
| 52 | |||
| 51 | down_read(&uts_sem); | 53 | down_read(&uts_sem); |
| 52 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 54 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
| 53 | ns->user_ns = get_user_ns(user_ns); | 55 | ns->user_ns = get_user_ns(user_ns); |
| @@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref) | |||
| 84 | 86 | ||
| 85 | ns = container_of(kref, struct uts_namespace, kref); | 87 | ns = container_of(kref, struct uts_namespace, kref); |
| 86 | put_user_ns(ns->user_ns); | 88 | put_user_ns(ns->user_ns); |
| 87 | proc_free_inum(ns->proc_inum); | 89 | ns_free_inum(&ns->ns); |
| 88 | kfree(ns); | 90 | kfree(ns); |
| 89 | } | 91 | } |
| 90 | 92 | ||
| 91 | static void *utsns_get(struct task_struct *task) | 93 | static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) |
| 94 | { | ||
| 95 | return container_of(ns, struct uts_namespace, ns); | ||
| 96 | } | ||
| 97 | |||
| 98 | static struct ns_common *utsns_get(struct task_struct *task) | ||
| 92 | { | 99 | { |
| 93 | struct uts_namespace *ns = NULL; | 100 | struct uts_namespace *ns = NULL; |
| 94 | struct nsproxy *nsproxy; | 101 | struct nsproxy *nsproxy; |
| @@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task) | |||
| 101 | } | 108 | } |
| 102 | task_unlock(task); | 109 | task_unlock(task); |
| 103 | 110 | ||
| 104 | return ns; | 111 | return ns ? &ns->ns : NULL; |
| 105 | } | 112 | } |
| 106 | 113 | ||
| 107 | static void utsns_put(void *ns) | 114 | static void utsns_put(struct ns_common *ns) |
| 108 | { | 115 | { |
| 109 | put_uts_ns(ns); | 116 | put_uts_ns(to_uts_ns(ns)); |
| 110 | } | 117 | } |
| 111 | 118 | ||
| 112 | static int utsns_install(struct nsproxy *nsproxy, void *new) | 119 | static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) |
| 113 | { | 120 | { |
| 114 | struct uts_namespace *ns = new; | 121 | struct uts_namespace *ns = to_uts_ns(new); |
| 115 | 122 | ||
| 116 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || | 123 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || |
| 117 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) | 124 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
| @@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) | |||
| 123 | return 0; | 130 | return 0; |
| 124 | } | 131 | } |
| 125 | 132 | ||
| 126 | static unsigned int utsns_inum(void *vp) | ||
| 127 | { | ||
| 128 | struct uts_namespace *ns = vp; | ||
| 129 | |||
| 130 | return ns->proc_inum; | ||
| 131 | } | ||
| 132 | |||
| 133 | const struct proc_ns_operations utsns_operations = { | 133 | const struct proc_ns_operations utsns_operations = { |
| 134 | .name = "uts", | 134 | .name = "uts", |
| 135 | .type = CLONE_NEWUTS, | 135 | .type = CLONE_NEWUTS, |
| 136 | .get = utsns_get, | 136 | .get = utsns_get, |
| 137 | .put = utsns_put, | 137 | .put = utsns_put, |
| 138 | .install = utsns_install, | 138 | .install = utsns_install, |
| 139 | .inum = utsns_inum, | ||
| 140 | }; | 139 | }; |
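The uts conversion above follows the same shape every namespace type gets in this series: embed a struct ns_common, hand out a pointer to it from .get, and recover the outer type with container_of() in .put/.install. A hedged sketch of that pattern with purely illustrative names (put_example_ns() is a hypothetical refcount helper); kernel-internal headers are assumed:

struct example_namespace {
	struct kref kref;
	struct ns_common ns;		/* common part embedded in each ns type */
};

static inline struct example_namespace *to_example_ns(struct ns_common *ns)
{
	return container_of(ns, struct example_namespace, ns);
}

static struct ns_common *examplens_get(struct task_struct *task)
{
	struct example_namespace *e = NULL;	/* look up and grab a ref from @task */

	return e ? &e->ns : NULL;
}

static void examplens_put(struct ns_common *ns)
{
	put_example_ns(to_example_ns(ns));	/* hypothetical: drop the ref from .get */
}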
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 09b685daee3d..beeeac9e0e3e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
| 1804 | struct worker_pool *pool = (void *)__pool; | 1804 | struct worker_pool *pool = (void *)__pool; |
| 1805 | struct work_struct *work; | 1805 | struct work_struct *work; |
| 1806 | 1806 | ||
| 1807 | spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ | 1807 | spin_lock_irq(&pool->lock); |
| 1808 | spin_lock(&pool->lock); | 1808 | spin_lock(&wq_mayday_lock); /* for wq->maydays */ |
| 1809 | 1809 | ||
| 1810 | if (need_to_create_worker(pool)) { | 1810 | if (need_to_create_worker(pool)) { |
| 1811 | /* | 1811 | /* |
| @@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
| 1818 | send_mayday(work); | 1818 | send_mayday(work); |
| 1819 | } | 1819 | } |
| 1820 | 1820 | ||
| 1821 | spin_unlock(&pool->lock); | 1821 | spin_unlock(&wq_mayday_lock); |
| 1822 | spin_unlock_irq(&wq_mayday_lock); | 1822 | spin_unlock_irq(&pool->lock); |
| 1823 | 1823 | ||
| 1824 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); | 1824 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); |
| 1825 | } | 1825 | } |
| @@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
| 1841 | * spin_lock_irq(pool->lock) which may be released and regrabbed | 1841 | * spin_lock_irq(pool->lock) which may be released and regrabbed |
| 1842 | * multiple times. Does GFP_KERNEL allocations. Called only from | 1842 | * multiple times. Does GFP_KERNEL allocations. Called only from |
| 1843 | * manager. | 1843 | * manager. |
| 1844 | * | ||
| 1845 | * Return: | ||
| 1846 | * %false if no action was taken and pool->lock stayed locked, %true | ||
| 1847 | * otherwise. | ||
| 1848 | */ | 1844 | */ |
| 1849 | static bool maybe_create_worker(struct worker_pool *pool) | 1845 | static void maybe_create_worker(struct worker_pool *pool) |
| 1850 | __releases(&pool->lock) | 1846 | __releases(&pool->lock) |
| 1851 | __acquires(&pool->lock) | 1847 | __acquires(&pool->lock) |
| 1852 | { | 1848 | { |
| 1853 | if (!need_to_create_worker(pool)) | ||
| 1854 | return false; | ||
| 1855 | restart: | 1849 | restart: |
| 1856 | spin_unlock_irq(&pool->lock); | 1850 | spin_unlock_irq(&pool->lock); |
| 1857 | 1851 | ||
| @@ -1877,7 +1871,6 @@ restart: | |||
| 1877 | */ | 1871 | */ |
| 1878 | if (need_to_create_worker(pool)) | 1872 | if (need_to_create_worker(pool)) |
| 1879 | goto restart; | 1873 | goto restart; |
| 1880 | return true; | ||
| 1881 | } | 1874 | } |
| 1882 | 1875 | ||
| 1883 | /** | 1876 | /** |
| @@ -1897,16 +1890,14 @@ restart: | |||
| 1897 | * multiple times. Does GFP_KERNEL allocations. | 1890 | * multiple times. Does GFP_KERNEL allocations. |
| 1898 | * | 1891 | * |
| 1899 | * Return: | 1892 | * Return: |
| 1900 | * %false if the pool don't need management and the caller can safely start | 1893 | * %false if the pool doesn't need management and the caller can safely |
| 1901 | * processing works, %true indicates that the function released pool->lock | 1894 | * start processing works, %true if management function was performed and |
| 1902 | * and reacquired it to perform some management function and that the | 1895 | * the conditions that the caller verified before calling the function may |
| 1903 | * conditions that the caller verified while holding the lock before | 1896 | * no longer be true. |
| 1904 | * calling the function might no longer be true. | ||
| 1905 | */ | 1897 | */ |
| 1906 | static bool manage_workers(struct worker *worker) | 1898 | static bool manage_workers(struct worker *worker) |
| 1907 | { | 1899 | { |
| 1908 | struct worker_pool *pool = worker->pool; | 1900 | struct worker_pool *pool = worker->pool; |
| 1909 | bool ret = false; | ||
| 1910 | 1901 | ||
| 1911 | /* | 1902 | /* |
| 1912 | * Anyone who successfully grabs manager_arb wins the arbitration | 1903 | * Anyone who successfully grabs manager_arb wins the arbitration |
| @@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker) | |||
| 1919 | * actual management, the pool may stall indefinitely. | 1910 | * actual management, the pool may stall indefinitely. |
| 1920 | */ | 1911 | */ |
| 1921 | if (!mutex_trylock(&pool->manager_arb)) | 1912 | if (!mutex_trylock(&pool->manager_arb)) |
| 1922 | return ret; | 1913 | return false; |
| 1923 | 1914 | ||
| 1924 | ret |= maybe_create_worker(pool); | 1915 | maybe_create_worker(pool); |
| 1925 | 1916 | ||
| 1926 | mutex_unlock(&pool->manager_arb); | 1917 | mutex_unlock(&pool->manager_arb); |
| 1927 | return ret; | 1918 | return true; |
| 1928 | } | 1919 | } |
| 1929 | 1920 | ||
| 1930 | /** | 1921 | /** |
| @@ -2248,12 +2239,30 @@ repeat: | |||
| 2248 | * Slurp in all works issued via this workqueue and | 2239 | * Slurp in all works issued via this workqueue and |
| 2249 | * process'em. | 2240 | * process'em. |
| 2250 | */ | 2241 | */ |
| 2251 | WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); | 2242 | WARN_ON_ONCE(!list_empty(scheduled)); |
| 2252 | list_for_each_entry_safe(work, n, &pool->worklist, entry) | 2243 | list_for_each_entry_safe(work, n, &pool->worklist, entry) |
| 2253 | if (get_work_pwq(work) == pwq) | 2244 | if (get_work_pwq(work) == pwq) |
| 2254 | move_linked_works(work, scheduled, &n); | 2245 | move_linked_works(work, scheduled, &n); |
| 2255 | 2246 | ||
| 2256 | process_scheduled_works(rescuer); | 2247 | if (!list_empty(scheduled)) { |
| 2248 | process_scheduled_works(rescuer); | ||
| 2249 | |||
| 2250 | /* | ||
| 2251 | * The above execution of rescued work items could | ||
| 2252 | * have created more to rescue through | ||
| 2253 | * pwq_activate_first_delayed() or chained | ||
| 2254 | * queueing. Let's put @pwq back on mayday list so | ||
| 2255 | * that such back-to-back work items, which may be | ||
| 2256 | * being used to relieve memory pressure, don't | ||
| 2257 | * incur MAYDAY_INTERVAL delay inbetween. | ||
| 2258 | */ | ||
| 2259 | if (need_to_create_worker(pool)) { | ||
| 2260 | spin_lock(&wq_mayday_lock); | ||
| 2261 | get_pwq(pwq); | ||
| 2262 | list_move_tail(&pwq->mayday_node, &wq->maydays); | ||
| 2263 | spin_unlock(&wq_mayday_lock); | ||
| 2264 | } | ||
| 2265 | } | ||
| 2257 | 2266 | ||
| 2258 | /* | 2267 | /* |
| 2259 | * Put the reference grabbed by send_mayday(). @pool won't | 2268 | * Put the reference grabbed by send_mayday(). @pool won't |
