| author | Ingo Molnar <mingo@kernel.org> | 2012-07-25 15:40:40 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2012-07-25 15:40:40 -0400 |
| commit | d431adfbc9b7de651f3164c6b7ffcad75805d7e4 | |
| tree | 29bce222c81a3a392e51c11e2188659aa6d1bded /kernel | |
| parent | d6250a3f12edb3a86db9598ffeca3de8b4a219e9 | |
| parent | e2b34e311be3a57c9abcb927e37a57e38913714c | |
Merge branch 'linus' into x86/urgent
Merge in Linus's tree to avoid a conflict.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
34 files changed, 1838 insertions, 1417 deletions
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 67b847dfa2bb..1f91413edb87 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
| 15 | #include <linux/string.h> | 15 | #include <linux/string.h> |
| 16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
| 17 | #include <linux/kmsg_dump.h> | ||
| 17 | #include <linux/reboot.h> | 18 | #include <linux/reboot.h> |
| 18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
| 19 | #include <linux/sysrq.h> | 20 | #include <linux/sysrq.h> |
| @@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv) | |||
| 2040 | */ | 2041 | */ |
| 2041 | static int kdb_dmesg(int argc, const char **argv) | 2042 | static int kdb_dmesg(int argc, const char **argv) |
| 2042 | { | 2043 | { |
| 2043 | char *syslog_data[4], *start, *end, c = '\0', *p; | 2044 | int diag; |
| 2044 | int diag, logging, logsize, lines = 0, adjust = 0, n; | 2045 | int logging; |
| 2046 | int lines = 0; | ||
| 2047 | int adjust = 0; | ||
| 2048 | int n = 0; | ||
| 2049 | int skip = 0; | ||
| 2050 | struct kmsg_dumper dumper = { .active = 1 }; | ||
| 2051 | size_t len; | ||
| 2052 | char buf[201]; | ||
| 2045 | 2053 | ||
| 2046 | if (argc > 2) | 2054 | if (argc > 2) |
| 2047 | return KDB_ARGCOUNT; | 2055 | return KDB_ARGCOUNT; |
| @@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv) | |||
| 2064 | kdb_set(2, setargs); | 2072 | kdb_set(2, setargs); |
| 2065 | } | 2073 | } |
| 2066 | 2074 | ||
| 2067 | /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] | 2075 | kmsg_dump_rewind_nolock(&dumper); |
| 2068 | * logical start, end+1. */ | 2076 | while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) |
| 2069 | kdb_syslog_data(syslog_data); | 2077 | n++; |
| 2070 | if (syslog_data[2] == syslog_data[3]) | 2078 | |
| 2071 | return 0; | ||
| 2072 | logsize = syslog_data[1] - syslog_data[0]; | ||
| 2073 | start = syslog_data[2]; | ||
| 2074 | end = syslog_data[3]; | ||
| 2075 | #define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) | ||
| 2076 | for (n = 0, p = start; p < end; ++p) { | ||
| 2077 | c = *KDB_WRAP(p); | ||
| 2078 | if (c == '\n') | ||
| 2079 | ++n; | ||
| 2080 | } | ||
| 2081 | if (c != '\n') | ||
| 2082 | ++n; | ||
| 2083 | if (lines < 0) { | 2079 | if (lines < 0) { |
| 2084 | if (adjust >= n) | 2080 | if (adjust >= n) |
| 2085 | kdb_printf("buffer only contains %d lines, nothing " | 2081 | kdb_printf("buffer only contains %d lines, nothing " |
| @@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv) | |||
| 2087 | else if (adjust - lines >= n) | 2083 | else if (adjust - lines >= n) |
| 2088 | kdb_printf("buffer only contains %d lines, last %d " | 2084 | kdb_printf("buffer only contains %d lines, last %d " |
| 2089 | "lines printed\n", n, n - adjust); | 2085 | "lines printed\n", n, n - adjust); |
| 2090 | if (adjust) { | 2086 | skip = adjust; |
| 2091 | for (; start < end && adjust; ++start) { | 2087 | lines = abs(lines); |
| 2092 | if (*KDB_WRAP(start) == '\n') | ||
| 2093 | --adjust; | ||
| 2094 | } | ||
| 2095 | if (start < end) | ||
| 2096 | ++start; | ||
| 2097 | } | ||
| 2098 | for (p = start; p < end && lines; ++p) { | ||
| 2099 | if (*KDB_WRAP(p) == '\n') | ||
| 2100 | ++lines; | ||
| 2101 | } | ||
| 2102 | end = p; | ||
| 2103 | } else if (lines > 0) { | 2088 | } else if (lines > 0) { |
| 2104 | int skip = n - (adjust + lines); | 2089 | skip = n - lines - adjust; |
| 2090 | lines = abs(lines); | ||
| 2105 | if (adjust >= n) { | 2091 | if (adjust >= n) { |
| 2106 | kdb_printf("buffer only contains %d lines, " | 2092 | kdb_printf("buffer only contains %d lines, " |
| 2107 | "nothing printed\n", n); | 2093 | "nothing printed\n", n); |
| @@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv) | |||
| 2112 | kdb_printf("buffer only contains %d lines, first " | 2098 | kdb_printf("buffer only contains %d lines, first " |
| 2113 | "%d lines printed\n", n, lines); | 2099 | "%d lines printed\n", n, lines); |
| 2114 | } | 2100 | } |
| 2115 | for (; start < end && skip; ++start) { | 2101 | } else { |
| 2116 | if (*KDB_WRAP(start) == '\n') | 2102 | lines = n; |
| 2117 | --skip; | ||
| 2118 | } | ||
| 2119 | for (p = start; p < end && lines; ++p) { | ||
| 2120 | if (*KDB_WRAP(p) == '\n') | ||
| 2121 | --lines; | ||
| 2122 | } | ||
| 2123 | end = p; | ||
| 2124 | } | 2103 | } |
| 2125 | /* Do a line at a time (max 200 chars) to reduce protocol overhead */ | 2104 | |
| 2126 | c = '\n'; | 2105 | if (skip >= n || skip < 0) |
| 2127 | while (start != end) { | 2106 | return 0; |
| 2128 | char buf[201]; | 2107 | |
| 2129 | p = buf; | 2108 | kmsg_dump_rewind_nolock(&dumper); |
| 2130 | if (KDB_FLAG(CMD_INTERRUPT)) | 2109 | while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { |
| 2131 | return 0; | 2110 | if (skip) { |
| 2132 | while (start < end && (c = *KDB_WRAP(start)) && | 2111 | skip--; |
| 2133 | (p - buf) < sizeof(buf)-1) { | 2112 | continue; |
| 2134 | ++start; | ||
| 2135 | *p++ = c; | ||
| 2136 | if (c == '\n') | ||
| 2137 | break; | ||
| 2138 | } | 2113 | } |
| 2139 | *p = '\0'; | 2114 | if (!lines--) |
| 2140 | kdb_printf("%s", buf); | 2115 | break; |
| 2116 | |||
| 2117 | kdb_printf("%.*s\n", (int)len - 1, buf); | ||
| 2141 | } | 2118 | } |
| 2142 | if (c != '\n') | ||
| 2143 | kdb_printf("\n"); | ||
| 2144 | 2119 | ||
| 2145 | return 0; | 2120 | return 0; |
| 2146 | } | 2121 | } |
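
The rewritten kdb_dmesg() above stops walking the raw syslog buffer via KDB_WRAP() and instead makes two passes over the kmsg_dump iterator: one pass to count records, then a second that skips `skip` records and prints at most `lines` of them. Below is a small userspace model of that windowing arithmetic; rewind_log() and get_line() are stand-ins for kmsg_dump_rewind_nolock() and kmsg_dump_get_line_nolock(), so this is an illustration of the flow, not kernel code.

```c
#include <stdio.h>
#include <stdlib.h>

static const char *log_lines[] = { "one", "two", "three", "four", "five" };
static int pos;

static void rewind_log(void) { pos = 0; }        /* kmsg_dump_rewind_nolock() */
static const char *get_line(void)                /* kmsg_dump_get_line_nolock() */
{
	return pos < (int)(sizeof(log_lines) / sizeof(log_lines[0]))
		? log_lines[pos++] : NULL;
}

/* lines > 0: newest `lines` records, shifted back by `adjust`.
 * lines < 0: oldest |lines| records, after skipping `adjust`.
 * lines == 0: everything. */
static void dmesg_window(int lines, int adjust)
{
	int n = 0, skip = 0;
	const char *s;

	rewind_log();                      /* pass 1: count records */
	while (get_line())
		n++;

	if (lines < 0) {
		skip = adjust;
		lines = abs(lines);
	} else if (lines > 0) {
		skip = n - lines - adjust;
		lines = abs(lines);
	} else {
		lines = n;
	}
	if (skip >= n || skip < 0)
		return;

	rewind_log();                      /* pass 2: skip, then print */
	while ((s = get_line())) {
		if (skip) {
			skip--;
			continue;
		}
		if (!lines--)
			break;
		printf("%s\n", s);
	}
}

int main(void)
{
	dmesg_window(2, 0);                /* newest two records: "four", "five" */
	return 0;
}
```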
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 47c4e56e513b..392ec6a25844 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
| @@ -205,7 +205,6 @@ extern char kdb_grep_string[]; | |||
| 205 | extern int kdb_grep_leading; | 205 | extern int kdb_grep_leading; |
| 206 | extern int kdb_grep_trailing; | 206 | extern int kdb_grep_trailing; |
| 207 | extern char *kdb_cmds[]; | 207 | extern char *kdb_cmds[]; |
| 208 | extern void kdb_syslog_data(char *syslog_data[]); | ||
| 209 | extern unsigned long kdb_task_state_string(const char *); | 208 | extern unsigned long kdb_task_state_string(const char *); |
| 210 | extern char kdb_task_state_char (const struct task_struct *); | 209 | extern char kdb_task_state_char (const struct task_struct *); |
| 211 | extern unsigned long kdb_task_state(const struct task_struct *p, | 210 | extern unsigned long kdb_task_state(const struct task_struct *p, |
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7d71d6ec972..f1cf0edeb39a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
| @@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
| 1645 | lockdep_assert_held(&ctx->mutex); | 1645 | lockdep_assert_held(&ctx->mutex); |
| 1646 | 1646 | ||
| 1647 | event->ctx = ctx; | 1647 | event->ctx = ctx; |
| 1648 | if (event->cpu != -1) | ||
| 1649 | event->cpu = cpu; | ||
| 1648 | 1650 | ||
| 1649 | if (!task) { | 1651 | if (!task) { |
| 1650 | /* | 1652 | /* |
| @@ -6252,6 +6254,8 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 6252 | } | 6254 | } |
| 6253 | } | 6255 | } |
| 6254 | 6256 | ||
| 6257 | get_online_cpus(); | ||
| 6258 | |||
| 6255 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, | 6259 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, |
| 6256 | NULL, NULL); | 6260 | NULL, NULL); |
| 6257 | if (IS_ERR(event)) { | 6261 | if (IS_ERR(event)) { |
| @@ -6304,7 +6308,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 6304 | /* | 6308 | /* |
| 6305 | * Get the target context (task or percpu): | 6309 | * Get the target context (task or percpu): |
| 6306 | */ | 6310 | */ |
| 6307 | ctx = find_get_context(pmu, task, cpu); | 6311 | ctx = find_get_context(pmu, task, event->cpu); |
| 6308 | if (IS_ERR(ctx)) { | 6312 | if (IS_ERR(ctx)) { |
| 6309 | err = PTR_ERR(ctx); | 6313 | err = PTR_ERR(ctx); |
| 6310 | goto err_alloc; | 6314 | goto err_alloc; |
| @@ -6377,20 +6381,23 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 6377 | mutex_lock(&ctx->mutex); | 6381 | mutex_lock(&ctx->mutex); |
| 6378 | 6382 | ||
| 6379 | if (move_group) { | 6383 | if (move_group) { |
| 6380 | perf_install_in_context(ctx, group_leader, cpu); | 6384 | synchronize_rcu(); |
| 6385 | perf_install_in_context(ctx, group_leader, event->cpu); | ||
| 6381 | get_ctx(ctx); | 6386 | get_ctx(ctx); |
| 6382 | list_for_each_entry(sibling, &group_leader->sibling_list, | 6387 | list_for_each_entry(sibling, &group_leader->sibling_list, |
| 6383 | group_entry) { | 6388 | group_entry) { |
| 6384 | perf_install_in_context(ctx, sibling, cpu); | 6389 | perf_install_in_context(ctx, sibling, event->cpu); |
| 6385 | get_ctx(ctx); | 6390 | get_ctx(ctx); |
| 6386 | } | 6391 | } |
| 6387 | } | 6392 | } |
| 6388 | 6393 | ||
| 6389 | perf_install_in_context(ctx, event, cpu); | 6394 | perf_install_in_context(ctx, event, event->cpu); |
| 6390 | ++ctx->generation; | 6395 | ++ctx->generation; |
| 6391 | perf_unpin_context(ctx); | 6396 | perf_unpin_context(ctx); |
| 6392 | mutex_unlock(&ctx->mutex); | 6397 | mutex_unlock(&ctx->mutex); |
| 6393 | 6398 | ||
| 6399 | put_online_cpus(); | ||
| 6400 | |||
| 6394 | event->owner = current; | 6401 | event->owner = current; |
| 6395 | 6402 | ||
| 6396 | mutex_lock(¤t->perf_event_mutex); | 6403 | mutex_lock(¤t->perf_event_mutex); |
| @@ -6419,6 +6426,7 @@ err_context: | |||
| 6419 | err_alloc: | 6426 | err_alloc: |
| 6420 | free_event(event); | 6427 | free_event(event); |
| 6421 | err_task: | 6428 | err_task: |
| 6429 | put_online_cpus(); | ||
| 6422 | if (task) | 6430 | if (task) |
| 6423 | put_task_struct(task); | 6431 | put_task_struct(task); |
| 6424 | err_group_fd: | 6432 | err_group_fd: |
| @@ -6479,6 +6487,39 @@ err: | |||
| 6479 | } | 6487 | } |
| 6480 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); | 6488 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); |
| 6481 | 6489 | ||
| 6490 | void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | ||
| 6491 | { | ||
| 6492 | struct perf_event_context *src_ctx; | ||
| 6493 | struct perf_event_context *dst_ctx; | ||
| 6494 | struct perf_event *event, *tmp; | ||
| 6495 | LIST_HEAD(events); | ||
| 6496 | |||
| 6497 | src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; | ||
| 6498 | dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; | ||
| 6499 | |||
| 6500 | mutex_lock(&src_ctx->mutex); | ||
| 6501 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, | ||
| 6502 | event_entry) { | ||
| 6503 | perf_remove_from_context(event); | ||
| 6504 | put_ctx(src_ctx); | ||
| 6505 | list_add(&event->event_entry, &events); | ||
| 6506 | } | ||
| 6507 | mutex_unlock(&src_ctx->mutex); | ||
| 6508 | |||
| 6509 | synchronize_rcu(); | ||
| 6510 | |||
| 6511 | mutex_lock(&dst_ctx->mutex); | ||
| 6512 | list_for_each_entry_safe(event, tmp, &events, event_entry) { | ||
| 6513 | list_del(&event->event_entry); | ||
| 6514 | if (event->state >= PERF_EVENT_STATE_OFF) | ||
| 6515 | event->state = PERF_EVENT_STATE_INACTIVE; | ||
| 6516 | perf_install_in_context(dst_ctx, event, dst_cpu); | ||
| 6517 | get_ctx(dst_ctx); | ||
| 6518 | } | ||
| 6519 | mutex_unlock(&dst_ctx->mutex); | ||
| 6520 | } | ||
| 6521 | EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); | ||
| 6522 | |||
| 6482 | static void sync_child_event(struct perf_event *child_event, | 6523 | static void sync_child_event(struct perf_event *child_event, |
| 6483 | struct task_struct *child) | 6524 | struct task_struct *child) |
| 6484 | { | 6525 | { |
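
perf_pmu_migrate_context() is exported so a PMU driver can re-home the events of a departing CPU onto another CPU's context, which is also why perf_event_open() now pins hotplug state with get_online_cpus()/put_online_cpus(). The fragment below is a hypothetical hotplug callback sketched around the new helper; my_pmu and my_pmu_cpu_offline() are invented names and the surrounding driver is assumed, only the perf_pmu_migrate_context() call itself comes from this patch.

```c
#include <linux/perf_event.h>
#include <linux/cpumask.h>

static struct pmu my_pmu;	/* assumed: the driver's registered PMU */

/* Hypothetical CPU-offline handler for a per-socket/uncore-style PMU. */
static void my_pmu_cpu_offline(int dying_cpu)
{
	int target = cpumask_any_but(cpu_online_mask, dying_cpu);

	if (target >= nr_cpu_ids)
		return;		/* no online CPU left to take the events */

	/* Move every event from the dying CPU's context to the target CPU. */
	perf_pmu_migrate_context(&my_pmu, dying_cpu, target);
}
```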
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 985be4d80fe8..f93532748bca 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
| @@ -38,13 +38,29 @@ | |||
| 38 | #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) | 38 | #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) |
| 39 | #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE | 39 | #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE |
| 40 | 40 | ||
| 41 | static struct srcu_struct uprobes_srcu; | ||
| 42 | static struct rb_root uprobes_tree = RB_ROOT; | 41 | static struct rb_root uprobes_tree = RB_ROOT; |
| 43 | 42 | ||
| 44 | static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ | 43 | static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ |
| 45 | 44 | ||
| 46 | #define UPROBES_HASH_SZ 13 | 45 | #define UPROBES_HASH_SZ 13 |
| 47 | 46 | ||
| 47 | /* | ||
| 48 | * We need separate register/unregister and mmap/munmap lock hashes because | ||
| 49 | * of mmap_sem nesting. | ||
| 50 | * | ||
| 51 | * uprobe_register() needs to install probes on (potentially) all processes | ||
| 52 | * and thus needs to acquire multiple mmap_sems (consequtively, not | ||
| 53 | * concurrently), whereas uprobe_mmap() is called while holding mmap_sem | ||
| 54 | * for the particular process doing the mmap. | ||
| 55 | * | ||
| 56 | * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem | ||
| 57 | * because of lock order against i_mmap_mutex. This means there's a hole in | ||
| 58 | * the register vma iteration where a mmap() can happen. | ||
| 59 | * | ||
| 60 | * Thus uprobe_register() can race with uprobe_mmap() and we can try and | ||
| 61 | * install a probe where one is already installed. | ||
| 62 | */ | ||
| 63 | |||
| 48 | /* serialize (un)register */ | 64 | /* serialize (un)register */ |
| 49 | static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; | 65 | static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; |
| 50 | 66 | ||
| @@ -61,17 +77,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; | |||
| 61 | */ | 77 | */ |
| 62 | static atomic_t uprobe_events = ATOMIC_INIT(0); | 78 | static atomic_t uprobe_events = ATOMIC_INIT(0); |
| 63 | 79 | ||
| 64 | /* | ||
| 65 | * Maintain a temporary per vma info that can be used to search if a vma | ||
| 66 | * has already been handled. This structure is introduced since extending | ||
| 67 | * vm_area_struct wasnt recommended. | ||
| 68 | */ | ||
| 69 | struct vma_info { | ||
| 70 | struct list_head probe_list; | ||
| 71 | struct mm_struct *mm; | ||
| 72 | loff_t vaddr; | ||
| 73 | }; | ||
| 74 | |||
| 75 | struct uprobe { | 80 | struct uprobe { |
| 76 | struct rb_node rb_node; /* node in the rb tree */ | 81 | struct rb_node rb_node; /* node in the rb tree */ |
| 77 | atomic_t ref; | 82 | atomic_t ref; |
| @@ -100,7 +105,8 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) | |||
| 100 | if (!is_register) | 105 | if (!is_register) |
| 101 | return true; | 106 | return true; |
| 102 | 107 | ||
| 103 | if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) | 108 | if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) |
| 109 | == (VM_READ|VM_EXEC)) | ||
| 104 | return true; | 110 | return true; |
| 105 | 111 | ||
| 106 | return false; | 112 | return false; |
| @@ -129,33 +135,17 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) | |||
| 129 | static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) | 135 | static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) |
| 130 | { | 136 | { |
| 131 | struct mm_struct *mm = vma->vm_mm; | 137 | struct mm_struct *mm = vma->vm_mm; |
| 132 | pgd_t *pgd; | ||
| 133 | pud_t *pud; | ||
| 134 | pmd_t *pmd; | ||
| 135 | pte_t *ptep; | ||
| 136 | spinlock_t *ptl; | ||
| 137 | unsigned long addr; | 138 | unsigned long addr; |
| 138 | int err = -EFAULT; | 139 | spinlock_t *ptl; |
| 140 | pte_t *ptep; | ||
| 139 | 141 | ||
| 140 | addr = page_address_in_vma(page, vma); | 142 | addr = page_address_in_vma(page, vma); |
| 141 | if (addr == -EFAULT) | 143 | if (addr == -EFAULT) |
| 142 | goto out; | 144 | return -EFAULT; |
| 143 | |||
| 144 | pgd = pgd_offset(mm, addr); | ||
| 145 | if (!pgd_present(*pgd)) | ||
| 146 | goto out; | ||
| 147 | |||
| 148 | pud = pud_offset(pgd, addr); | ||
| 149 | if (!pud_present(*pud)) | ||
| 150 | goto out; | ||
| 151 | |||
| 152 | pmd = pmd_offset(pud, addr); | ||
| 153 | if (!pmd_present(*pmd)) | ||
| 154 | goto out; | ||
| 155 | 145 | ||
| 156 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | 146 | ptep = page_check_address(page, mm, addr, &ptl, 0); |
| 157 | if (!ptep) | 147 | if (!ptep) |
| 158 | goto out; | 148 | return -EAGAIN; |
| 159 | 149 | ||
| 160 | get_page(kpage); | 150 | get_page(kpage); |
| 161 | page_add_new_anon_rmap(kpage, vma, addr); | 151 | page_add_new_anon_rmap(kpage, vma, addr); |
| @@ -174,10 +164,8 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct | |||
| 174 | try_to_free_swap(page); | 164 | try_to_free_swap(page); |
| 175 | put_page(page); | 165 | put_page(page); |
| 176 | pte_unmap_unlock(ptep, ptl); | 166 | pte_unmap_unlock(ptep, ptl); |
| 177 | err = 0; | ||
| 178 | 167 | ||
| 179 | out: | 168 | return 0; |
| 180 | return err; | ||
| 181 | } | 169 | } |
| 182 | 170 | ||
| 183 | /** | 171 | /** |
| @@ -222,9 +210,8 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, | |||
| 222 | void *vaddr_old, *vaddr_new; | 210 | void *vaddr_old, *vaddr_new; |
| 223 | struct vm_area_struct *vma; | 211 | struct vm_area_struct *vma; |
| 224 | struct uprobe *uprobe; | 212 | struct uprobe *uprobe; |
| 225 | loff_t addr; | ||
| 226 | int ret; | 213 | int ret; |
| 227 | 214 | retry: | |
| 228 | /* Read the page with vaddr into memory */ | 215 | /* Read the page with vaddr into memory */ |
| 229 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); | 216 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); |
| 230 | if (ret <= 0) | 217 | if (ret <= 0) |
| @@ -246,10 +233,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, | |||
| 246 | if (mapping != vma->vm_file->f_mapping) | 233 | if (mapping != vma->vm_file->f_mapping) |
| 247 | goto put_out; | 234 | goto put_out; |
| 248 | 235 | ||
| 249 | addr = vma_address(vma, uprobe->offset); | ||
| 250 | if (vaddr != (unsigned long)addr) | ||
| 251 | goto put_out; | ||
| 252 | |||
| 253 | ret = -ENOMEM; | 236 | ret = -ENOMEM; |
| 254 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); | 237 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); |
| 255 | if (!new_page) | 238 | if (!new_page) |
| @@ -267,11 +250,7 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, | |||
| 267 | vaddr_new = kmap_atomic(new_page); | 250 | vaddr_new = kmap_atomic(new_page); |
| 268 | 251 | ||
| 269 | memcpy(vaddr_new, vaddr_old, PAGE_SIZE); | 252 | memcpy(vaddr_new, vaddr_old, PAGE_SIZE); |
| 270 | 253 | memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); | |
| 271 | /* poke the new insn in, ASSUMES we don't cross page boundary */ | ||
| 272 | vaddr &= ~PAGE_MASK; | ||
| 273 | BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | ||
| 274 | memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); | ||
| 275 | 254 | ||
| 276 | kunmap_atomic(vaddr_new); | 255 | kunmap_atomic(vaddr_new); |
| 277 | kunmap_atomic(vaddr_old); | 256 | kunmap_atomic(vaddr_old); |
| @@ -291,6 +270,8 @@ unlock_out: | |||
| 291 | put_out: | 270 | put_out: |
| 292 | put_page(old_page); | 271 | put_page(old_page); |
| 293 | 272 | ||
| 273 | if (unlikely(ret == -EAGAIN)) | ||
| 274 | goto retry; | ||
| 294 | return ret; | 275 | return ret; |
| 295 | } | 276 | } |
| 296 | 277 | ||
| @@ -312,7 +293,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ | |||
| 312 | void *vaddr_new; | 293 | void *vaddr_new; |
| 313 | int ret; | 294 | int ret; |
| 314 | 295 | ||
| 315 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); | 296 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); |
| 316 | if (ret <= 0) | 297 | if (ret <= 0) |
| 317 | return ret; | 298 | return ret; |
| 318 | 299 | ||
| @@ -333,10 +314,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) | |||
| 333 | uprobe_opcode_t opcode; | 314 | uprobe_opcode_t opcode; |
| 334 | int result; | 315 | int result; |
| 335 | 316 | ||
| 317 | if (current->mm == mm) { | ||
| 318 | pagefault_disable(); | ||
| 319 | result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, | ||
| 320 | sizeof(opcode)); | ||
| 321 | pagefault_enable(); | ||
| 322 | |||
| 323 | if (likely(result == 0)) | ||
| 324 | goto out; | ||
| 325 | } | ||
| 326 | |||
| 336 | result = read_opcode(mm, vaddr, &opcode); | 327 | result = read_opcode(mm, vaddr, &opcode); |
| 337 | if (result) | 328 | if (result) |
| 338 | return result; | 329 | return result; |
| 339 | 330 | out: | |
| 340 | if (is_swbp_insn(&opcode)) | 331 | if (is_swbp_insn(&opcode)) |
| 341 | return 1; | 332 | return 1; |
| 342 | 333 | ||
| @@ -355,7 +346,9 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) | |||
| 355 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 346 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
| 356 | { | 347 | { |
| 357 | int result; | 348 | int result; |
| 358 | 349 | /* | |
| 350 | * See the comment near uprobes_hash(). | ||
| 351 | */ | ||
| 359 | result = is_swbp_at_addr(mm, vaddr); | 352 | result = is_swbp_at_addr(mm, vaddr); |
| 360 | if (result == 1) | 353 | if (result == 1) |
| 361 | return -EEXIST; | 354 | return -EEXIST; |
| @@ -520,7 +513,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) | |||
| 520 | uprobe->inode = igrab(inode); | 513 | uprobe->inode = igrab(inode); |
| 521 | uprobe->offset = offset; | 514 | uprobe->offset = offset; |
| 522 | init_rwsem(&uprobe->consumer_rwsem); | 515 | init_rwsem(&uprobe->consumer_rwsem); |
| 523 | INIT_LIST_HEAD(&uprobe->pending_list); | ||
| 524 | 516 | ||
| 525 | /* add to uprobes_tree, sorted on inode:offset */ | 517 | /* add to uprobes_tree, sorted on inode:offset */ |
| 526 | cur_uprobe = insert_uprobe(uprobe); | 518 | cur_uprobe = insert_uprobe(uprobe); |
| @@ -588,20 +580,22 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) | |||
| 588 | } | 580 | } |
| 589 | 581 | ||
| 590 | static int | 582 | static int |
| 591 | __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, | 583 | __copy_insn(struct address_space *mapping, struct file *filp, char *insn, |
| 592 | unsigned long nbytes, unsigned long offset) | 584 | unsigned long nbytes, loff_t offset) |
| 593 | { | 585 | { |
| 594 | struct file *filp = vma->vm_file; | ||
| 595 | struct page *page; | 586 | struct page *page; |
| 596 | void *vaddr; | 587 | void *vaddr; |
| 597 | unsigned long off1; | 588 | unsigned long off; |
| 598 | unsigned long idx; | 589 | pgoff_t idx; |
| 599 | 590 | ||
| 600 | if (!filp) | 591 | if (!filp) |
| 601 | return -EINVAL; | 592 | return -EINVAL; |
| 602 | 593 | ||
| 603 | idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); | 594 | if (!mapping->a_ops->readpage) |
| 604 | off1 = offset &= ~PAGE_MASK; | 595 | return -EIO; |
| 596 | |||
| 597 | idx = offset >> PAGE_CACHE_SHIFT; | ||
| 598 | off = offset & ~PAGE_MASK; | ||
| 605 | 599 | ||
| 606 | /* | 600 | /* |
| 607 | * Ensure that the page that has the original instruction is | 601 | * Ensure that the page that has the original instruction is |
| @@ -612,22 +606,20 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins | |||
| 612 | return PTR_ERR(page); | 606 | return PTR_ERR(page); |
| 613 | 607 | ||
| 614 | vaddr = kmap_atomic(page); | 608 | vaddr = kmap_atomic(page); |
| 615 | memcpy(insn, vaddr + off1, nbytes); | 609 | memcpy(insn, vaddr + off, nbytes); |
| 616 | kunmap_atomic(vaddr); | 610 | kunmap_atomic(vaddr); |
| 617 | page_cache_release(page); | 611 | page_cache_release(page); |
| 618 | 612 | ||
| 619 | return 0; | 613 | return 0; |
| 620 | } | 614 | } |
| 621 | 615 | ||
| 622 | static int | 616 | static int copy_insn(struct uprobe *uprobe, struct file *filp) |
| 623 | copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) | ||
| 624 | { | 617 | { |
| 625 | struct address_space *mapping; | 618 | struct address_space *mapping; |
| 626 | unsigned long nbytes; | 619 | unsigned long nbytes; |
| 627 | int bytes; | 620 | int bytes; |
| 628 | 621 | ||
| 629 | addr &= ~PAGE_MASK; | 622 | nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); |
| 630 | nbytes = PAGE_SIZE - addr; | ||
| 631 | mapping = uprobe->inode->i_mapping; | 623 | mapping = uprobe->inode->i_mapping; |
| 632 | 624 | ||
| 633 | /* Instruction at end of binary; copy only available bytes */ | 625 | /* Instruction at end of binary; copy only available bytes */ |
| @@ -638,13 +630,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) | |||
| 638 | 630 | ||
| 639 | /* Instruction at the page-boundary; copy bytes in second page */ | 631 | /* Instruction at the page-boundary; copy bytes in second page */ |
| 640 | if (nbytes < bytes) { | 632 | if (nbytes < bytes) { |
| 641 | if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, | 633 | int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, |
| 642 | bytes - nbytes, uprobe->offset + nbytes)) | 634 | bytes - nbytes, uprobe->offset + nbytes); |
| 643 | return -ENOMEM; | 635 | if (err) |
| 644 | 636 | return err; | |
| 645 | bytes = nbytes; | 637 | bytes = nbytes; |
| 646 | } | 638 | } |
| 647 | return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); | 639 | return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); |
| 648 | } | 640 | } |
| 649 | 641 | ||
| 650 | /* | 642 | /* |
| @@ -672,9 +664,8 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) | |||
| 672 | */ | 664 | */ |
| 673 | static int | 665 | static int |
| 674 | install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | 666 | install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, |
| 675 | struct vm_area_struct *vma, loff_t vaddr) | 667 | struct vm_area_struct *vma, unsigned long vaddr) |
| 676 | { | 668 | { |
| 677 | unsigned long addr; | ||
| 678 | int ret; | 669 | int ret; |
| 679 | 670 | ||
| 680 | /* | 671 | /* |
| @@ -687,20 +678,22 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
| 687 | if (!uprobe->consumers) | 678 | if (!uprobe->consumers) |
| 688 | return -EEXIST; | 679 | return -EEXIST; |
| 689 | 680 | ||
| 690 | addr = (unsigned long)vaddr; | ||
| 691 | |||
| 692 | if (!(uprobe->flags & UPROBE_COPY_INSN)) { | 681 | if (!(uprobe->flags & UPROBE_COPY_INSN)) { |
| 693 | ret = copy_insn(uprobe, vma, addr); | 682 | ret = copy_insn(uprobe, vma->vm_file); |
| 694 | if (ret) | 683 | if (ret) |
| 695 | return ret; | 684 | return ret; |
| 696 | 685 | ||
| 697 | if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) | 686 | if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) |
| 698 | return -EEXIST; | 687 | return -ENOTSUPP; |
| 699 | 688 | ||
| 700 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); | 689 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); |
| 701 | if (ret) | 690 | if (ret) |
| 702 | return ret; | 691 | return ret; |
| 703 | 692 | ||
| 693 | /* write_opcode() assumes we don't cross page boundary */ | ||
| 694 | BUG_ON((uprobe->offset & ~PAGE_MASK) + | ||
| 695 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | ||
| 696 | |||
| 704 | uprobe->flags |= UPROBE_COPY_INSN; | 697 | uprobe->flags |= UPROBE_COPY_INSN; |
| 705 | } | 698 | } |
| 706 | 699 | ||
| @@ -713,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
| 713 | * Hence increment before and decrement on failure. | 706 | * Hence increment before and decrement on failure. |
| 714 | */ | 707 | */ |
| 715 | atomic_inc(&mm->uprobes_state.count); | 708 | atomic_inc(&mm->uprobes_state.count); |
| 716 | ret = set_swbp(&uprobe->arch, mm, addr); | 709 | ret = set_swbp(&uprobe->arch, mm, vaddr); |
| 717 | if (ret) | 710 | if (ret) |
| 718 | atomic_dec(&mm->uprobes_state.count); | 711 | atomic_dec(&mm->uprobes_state.count); |
| 719 | 712 | ||
| @@ -721,27 +714,21 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
| 721 | } | 714 | } |
| 722 | 715 | ||
| 723 | static void | 716 | static void |
| 724 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) | 717 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) |
| 725 | { | 718 | { |
| 726 | if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) | 719 | if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) |
| 727 | atomic_dec(&mm->uprobes_state.count); | 720 | atomic_dec(&mm->uprobes_state.count); |
| 728 | } | 721 | } |
| 729 | 722 | ||
| 730 | /* | 723 | /* |
| 731 | * There could be threads that have hit the breakpoint and are entering the | 724 | * There could be threads that have already hit the breakpoint. They |
| 732 | * notifier code and trying to acquire the uprobes_treelock. The thread | 725 | * will recheck the current insn and restart if find_uprobe() fails. |
| 733 | * calling delete_uprobe() that is removing the uprobe from the rb_tree can | 726 | * See find_active_uprobe(). |
| 734 | * race with these threads and might acquire the uprobes_treelock compared | ||
| 735 | * to some of the breakpoint hit threads. In such a case, the breakpoint | ||
| 736 | * hit threads will not find the uprobe. The current unregistering thread | ||
| 737 | * waits till all other threads have hit a breakpoint, to acquire the | ||
| 738 | * uprobes_treelock before the uprobe is removed from the rbtree. | ||
| 739 | */ | 727 | */ |
| 740 | static void delete_uprobe(struct uprobe *uprobe) | 728 | static void delete_uprobe(struct uprobe *uprobe) |
| 741 | { | 729 | { |
| 742 | unsigned long flags; | 730 | unsigned long flags; |
| 743 | 731 | ||
| 744 | synchronize_srcu(&uprobes_srcu); | ||
| 745 | spin_lock_irqsave(&uprobes_treelock, flags); | 732 | spin_lock_irqsave(&uprobes_treelock, flags); |
| 746 | rb_erase(&uprobe->rb_node, &uprobes_tree); | 733 | rb_erase(&uprobe->rb_node, &uprobes_tree); |
| 747 | spin_unlock_irqrestore(&uprobes_treelock, flags); | 734 | spin_unlock_irqrestore(&uprobes_treelock, flags); |
| @@ -750,139 +737,135 @@ static void delete_uprobe(struct uprobe *uprobe) | |||
| 750 | atomic_dec(&uprobe_events); | 737 | atomic_dec(&uprobe_events); |
| 751 | } | 738 | } |
| 752 | 739 | ||
| 753 | static struct vma_info * | 740 | struct map_info { |
| 754 | __find_next_vma_info(struct address_space *mapping, struct list_head *head, | 741 | struct map_info *next; |
| 755 | struct vma_info *vi, loff_t offset, bool is_register) | 742 | struct mm_struct *mm; |
| 743 | unsigned long vaddr; | ||
| 744 | }; | ||
| 745 | |||
| 746 | static inline struct map_info *free_map_info(struct map_info *info) | ||
| 747 | { | ||
| 748 | struct map_info *next = info->next; | ||
| 749 | kfree(info); | ||
| 750 | return next; | ||
| 751 | } | ||
| 752 | |||
| 753 | static struct map_info * | ||
| 754 | build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | ||
| 756 | { | 755 | { |
| 756 | unsigned long pgoff = offset >> PAGE_SHIFT; | ||
| 757 | struct prio_tree_iter iter; | 757 | struct prio_tree_iter iter; |
| 758 | struct vm_area_struct *vma; | 758 | struct vm_area_struct *vma; |
| 759 | struct vma_info *tmpvi; | 759 | struct map_info *curr = NULL; |
| 760 | unsigned long pgoff; | 760 | struct map_info *prev = NULL; |
| 761 | int existing_vma; | 761 | struct map_info *info; |
| 762 | loff_t vaddr; | 762 | int more = 0; |
| 763 | |||
| 764 | pgoff = offset >> PAGE_SHIFT; | ||
| 765 | 763 | ||
| 764 | again: | ||
| 765 | mutex_lock(&mapping->i_mmap_mutex); | ||
| 766 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 766 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
| 767 | if (!valid_vma(vma, is_register)) | 767 | if (!valid_vma(vma, is_register)) |
| 768 | continue; | 768 | continue; |
| 769 | 769 | ||
| 770 | existing_vma = 0; | 770 | if (!prev && !more) { |
| 771 | vaddr = vma_address(vma, offset); | 771 | /* |
| 772 | 772 | * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through | |
| 773 | list_for_each_entry(tmpvi, head, probe_list) { | 773 | * reclaim. This is optimistic, no harm done if it fails. |
| 774 | if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { | 774 | */ |
| 775 | existing_vma = 1; | 775 | prev = kmalloc(sizeof(struct map_info), |
| 776 | break; | 776 | GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); |
| 777 | } | 777 | if (prev) |
| 778 | prev->next = NULL; | ||
| 778 | } | 779 | } |
| 779 | 780 | if (!prev) { | |
| 780 | /* | 781 | more++; |
| 781 | * Another vma needs a probe to be installed. However skip | 782 | continue; |
| 782 | * installing the probe if the vma is about to be unlinked. | ||
| 783 | */ | ||
| 784 | if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { | ||
| 785 | vi->mm = vma->vm_mm; | ||
| 786 | vi->vaddr = vaddr; | ||
| 787 | list_add(&vi->probe_list, head); | ||
| 788 | |||
| 789 | return vi; | ||
| 790 | } | 783 | } |
| 791 | } | ||
| 792 | 784 | ||
| 793 | return NULL; | 785 | if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) |
| 794 | } | 786 | continue; |
| 795 | |||
| 796 | /* | ||
| 797 | * Iterate in the rmap prio tree and find a vma where a probe has not | ||
| 798 | * yet been inserted. | ||
| 799 | */ | ||
| 800 | static struct vma_info * | ||
| 801 | find_next_vma_info(struct address_space *mapping, struct list_head *head, | ||
| 802 | loff_t offset, bool is_register) | ||
| 803 | { | ||
| 804 | struct vma_info *vi, *retvi; | ||
| 805 | 787 | ||
| 806 | vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); | 788 | info = prev; |
| 807 | if (!vi) | 789 | prev = prev->next; |
| 808 | return ERR_PTR(-ENOMEM); | 790 | info->next = curr; |
| 791 | curr = info; | ||
| 809 | 792 | ||
| 810 | mutex_lock(&mapping->i_mmap_mutex); | 793 | info->mm = vma->vm_mm; |
| 811 | retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); | 794 | info->vaddr = vma_address(vma, offset); |
| 795 | } | ||
| 812 | mutex_unlock(&mapping->i_mmap_mutex); | 796 | mutex_unlock(&mapping->i_mmap_mutex); |
| 813 | 797 | ||
| 814 | if (!retvi) | 798 | if (!more) |
| 815 | kfree(vi); | 799 | goto out; |
| 800 | |||
| 801 | prev = curr; | ||
| 802 | while (curr) { | ||
| 803 | mmput(curr->mm); | ||
| 804 | curr = curr->next; | ||
| 805 | } | ||
| 816 | 806 | ||
| 817 | return retvi; | 807 | do { |
| 808 | info = kmalloc(sizeof(struct map_info), GFP_KERNEL); | ||
| 809 | if (!info) { | ||
| 810 | curr = ERR_PTR(-ENOMEM); | ||
| 811 | goto out; | ||
| 812 | } | ||
| 813 | info->next = prev; | ||
| 814 | prev = info; | ||
| 815 | } while (--more); | ||
| 816 | |||
| 817 | goto again; | ||
| 818 | out: | ||
| 819 | while (prev) | ||
| 820 | prev = free_map_info(prev); | ||
| 821 | return curr; | ||
| 818 | } | 822 | } |
| 819 | 823 | ||
| 820 | static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | 824 | static int register_for_each_vma(struct uprobe *uprobe, bool is_register) |
| 821 | { | 825 | { |
| 822 | struct list_head try_list; | 826 | struct map_info *info; |
| 823 | struct vm_area_struct *vma; | 827 | int err = 0; |
| 824 | struct address_space *mapping; | ||
| 825 | struct vma_info *vi, *tmpvi; | ||
| 826 | struct mm_struct *mm; | ||
| 827 | loff_t vaddr; | ||
| 828 | int ret; | ||
| 829 | 828 | ||
| 830 | mapping = uprobe->inode->i_mapping; | 829 | info = build_map_info(uprobe->inode->i_mapping, |
| 831 | INIT_LIST_HEAD(&try_list); | 830 | uprobe->offset, is_register); |
| 831 | if (IS_ERR(info)) | ||
| 832 | return PTR_ERR(info); | ||
| 832 | 833 | ||
| 833 | ret = 0; | 834 | while (info) { |
| 835 | struct mm_struct *mm = info->mm; | ||
| 836 | struct vm_area_struct *vma; | ||
| 834 | 837 | ||
| 835 | for (;;) { | 838 | if (err) |
| 836 | vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); | 839 | goto free; |
| 837 | if (!vi) | ||
| 838 | break; | ||
| 839 | 840 | ||
| 840 | if (IS_ERR(vi)) { | 841 | down_write(&mm->mmap_sem); |
| 841 | ret = PTR_ERR(vi); | 842 | vma = find_vma(mm, (unsigned long)info->vaddr); |
| 842 | break; | 843 | if (!vma || !valid_vma(vma, is_register)) |
| 843 | } | 844 | goto unlock; |
| 844 | 845 | ||
| 845 | mm = vi->mm; | ||
| 846 | down_read(&mm->mmap_sem); | ||
| 847 | vma = find_vma(mm, (unsigned long)vi->vaddr); | ||
| 848 | if (!vma || !valid_vma(vma, is_register)) { | ||
| 849 | list_del(&vi->probe_list); | ||
| 850 | kfree(vi); | ||
| 851 | up_read(&mm->mmap_sem); | ||
| 852 | mmput(mm); | ||
| 853 | continue; | ||
| 854 | } | ||
| 855 | vaddr = vma_address(vma, uprobe->offset); | ||
| 856 | if (vma->vm_file->f_mapping->host != uprobe->inode || | 846 | if (vma->vm_file->f_mapping->host != uprobe->inode || |
| 857 | vaddr != vi->vaddr) { | 847 | vma_address(vma, uprobe->offset) != info->vaddr) |
| 858 | list_del(&vi->probe_list); | 848 | goto unlock; |
| 859 | kfree(vi); | ||
| 860 | up_read(&mm->mmap_sem); | ||
| 861 | mmput(mm); | ||
| 862 | continue; | ||
| 863 | } | ||
| 864 | |||
| 865 | if (is_register) | ||
| 866 | ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); | ||
| 867 | else | ||
| 868 | remove_breakpoint(uprobe, mm, vi->vaddr); | ||
| 869 | 849 | ||
| 870 | up_read(&mm->mmap_sem); | ||
| 871 | mmput(mm); | ||
| 872 | if (is_register) { | 850 | if (is_register) { |
| 873 | if (ret && ret == -EEXIST) | 851 | err = install_breakpoint(uprobe, mm, vma, info->vaddr); |
| 874 | ret = 0; | 852 | /* |
| 875 | if (ret) | 853 | * We can race against uprobe_mmap(), see the |
| 876 | break; | 854 | * comment near uprobe_hash(). |
| 855 | */ | ||
| 856 | if (err == -EEXIST) | ||
| 857 | err = 0; | ||
| 858 | } else { | ||
| 859 | remove_breakpoint(uprobe, mm, info->vaddr); | ||
| 877 | } | 860 | } |
| 861 | unlock: | ||
| 862 | up_write(&mm->mmap_sem); | ||
| 863 | free: | ||
| 864 | mmput(mm); | ||
| 865 | info = free_map_info(info); | ||
| 878 | } | 866 | } |
| 879 | 867 | ||
| 880 | list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { | 868 | return err; |
| 881 | list_del(&vi->probe_list); | ||
| 882 | kfree(vi); | ||
| 883 | } | ||
| 884 | |||
| 885 | return ret; | ||
| 886 | } | 869 | } |
| 887 | 870 | ||
| 888 | static int __uprobe_register(struct uprobe *uprobe) | 871 | static int __uprobe_register(struct uprobe *uprobe) |
| @@ -1048,7 +1031,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head) | |||
| 1048 | int uprobe_mmap(struct vm_area_struct *vma) | 1031 | int uprobe_mmap(struct vm_area_struct *vma) |
| 1049 | { | 1032 | { |
| 1050 | struct list_head tmp_list; | 1033 | struct list_head tmp_list; |
| 1051 | struct uprobe *uprobe, *u; | 1034 | struct uprobe *uprobe; |
| 1052 | struct inode *inode; | 1035 | struct inode *inode; |
| 1053 | int ret, count; | 1036 | int ret, count; |
| 1054 | 1037 | ||
| @@ -1066,12 +1049,9 @@ int uprobe_mmap(struct vm_area_struct *vma) | |||
| 1066 | ret = 0; | 1049 | ret = 0; |
| 1067 | count = 0; | 1050 | count = 0; |
| 1068 | 1051 | ||
| 1069 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | 1052 | list_for_each_entry(uprobe, &tmp_list, pending_list) { |
| 1070 | loff_t vaddr; | ||
| 1071 | |||
| 1072 | list_del(&uprobe->pending_list); | ||
| 1073 | if (!ret) { | 1053 | if (!ret) { |
| 1074 | vaddr = vma_address(vma, uprobe->offset); | 1054 | loff_t vaddr = vma_address(vma, uprobe->offset); |
| 1075 | 1055 | ||
| 1076 | if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { | 1056 | if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { |
| 1077 | put_uprobe(uprobe); | 1057 | put_uprobe(uprobe); |
| @@ -1079,8 +1059,10 @@ int uprobe_mmap(struct vm_area_struct *vma) | |||
| 1079 | } | 1059 | } |
| 1080 | 1060 | ||
| 1081 | ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); | 1061 | ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); |
| 1082 | 1062 | /* | |
| 1083 | /* Ignore double add: */ | 1063 | * We can race against uprobe_register(), see the |
| 1064 | * comment near uprobe_hash(). | ||
| 1065 | */ | ||
| 1084 | if (ret == -EEXIST) { | 1066 | if (ret == -EEXIST) { |
| 1085 | ret = 0; | 1067 | ret = 0; |
| 1086 | 1068 | ||
| @@ -1115,7 +1097,7 @@ int uprobe_mmap(struct vm_area_struct *vma) | |||
| 1115 | void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) | 1097 | void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) |
| 1116 | { | 1098 | { |
| 1117 | struct list_head tmp_list; | 1099 | struct list_head tmp_list; |
| 1118 | struct uprobe *uprobe, *u; | 1100 | struct uprobe *uprobe; |
| 1119 | struct inode *inode; | 1101 | struct inode *inode; |
| 1120 | 1102 | ||
| 1121 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) | 1103 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) |
| @@ -1132,11 +1114,8 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon | |||
| 1132 | mutex_lock(uprobes_mmap_hash(inode)); | 1114 | mutex_lock(uprobes_mmap_hash(inode)); |
| 1133 | build_probe_list(inode, &tmp_list); | 1115 | build_probe_list(inode, &tmp_list); |
| 1134 | 1116 | ||
| 1135 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | 1117 | list_for_each_entry(uprobe, &tmp_list, pending_list) { |
| 1136 | loff_t vaddr; | 1118 | loff_t vaddr = vma_address(vma, uprobe->offset); |
| 1137 | |||
| 1138 | list_del(&uprobe->pending_list); | ||
| 1139 | vaddr = vma_address(vma, uprobe->offset); | ||
| 1140 | 1119 | ||
| 1141 | if (vaddr >= start && vaddr < end) { | 1120 | if (vaddr >= start && vaddr < end) { |
| 1142 | /* | 1121 | /* |
| @@ -1378,9 +1357,6 @@ void uprobe_free_utask(struct task_struct *t) | |||
| 1378 | { | 1357 | { |
| 1379 | struct uprobe_task *utask = t->utask; | 1358 | struct uprobe_task *utask = t->utask; |
| 1380 | 1359 | ||
| 1381 | if (t->uprobe_srcu_id != -1) | ||
| 1382 | srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); | ||
| 1383 | |||
| 1384 | if (!utask) | 1360 | if (!utask) |
| 1385 | return; | 1361 | return; |
| 1386 | 1362 | ||
| @@ -1398,7 +1374,6 @@ void uprobe_free_utask(struct task_struct *t) | |||
| 1398 | void uprobe_copy_process(struct task_struct *t) | 1374 | void uprobe_copy_process(struct task_struct *t) |
| 1399 | { | 1375 | { |
| 1400 | t->utask = NULL; | 1376 | t->utask = NULL; |
| 1401 | t->uprobe_srcu_id = -1; | ||
| 1402 | } | 1377 | } |
| 1403 | 1378 | ||
| 1404 | /* | 1379 | /* |
| @@ -1417,7 +1392,6 @@ static struct uprobe_task *add_utask(void) | |||
| 1417 | if (unlikely(!utask)) | 1392 | if (unlikely(!utask)) |
| 1418 | return NULL; | 1393 | return NULL; |
| 1419 | 1394 | ||
| 1420 | utask->active_uprobe = NULL; | ||
| 1421 | current->utask = utask; | 1395 | current->utask = utask; |
| 1422 | return utask; | 1396 | return utask; |
| 1423 | } | 1397 | } |
| @@ -1479,41 +1453,64 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) | |||
| 1479 | return false; | 1453 | return false; |
| 1480 | } | 1454 | } |
| 1481 | 1455 | ||
| 1456 | static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) | ||
| 1457 | { | ||
| 1458 | struct mm_struct *mm = current->mm; | ||
| 1459 | struct uprobe *uprobe = NULL; | ||
| 1460 | struct vm_area_struct *vma; | ||
| 1461 | |||
| 1462 | down_read(&mm->mmap_sem); | ||
| 1463 | vma = find_vma(mm, bp_vaddr); | ||
| 1464 | if (vma && vma->vm_start <= bp_vaddr) { | ||
| 1465 | if (valid_vma(vma, false)) { | ||
| 1466 | struct inode *inode; | ||
| 1467 | loff_t offset; | ||
| 1468 | |||
| 1469 | inode = vma->vm_file->f_mapping->host; | ||
| 1470 | offset = bp_vaddr - vma->vm_start; | ||
| 1471 | offset += (vma->vm_pgoff << PAGE_SHIFT); | ||
| 1472 | uprobe = find_uprobe(inode, offset); | ||
| 1473 | } | ||
| 1474 | |||
| 1475 | if (!uprobe) | ||
| 1476 | *is_swbp = is_swbp_at_addr(mm, bp_vaddr); | ||
| 1477 | } else { | ||
| 1478 | *is_swbp = -EFAULT; | ||
| 1479 | } | ||
| 1480 | up_read(&mm->mmap_sem); | ||
| 1481 | |||
| 1482 | return uprobe; | ||
| 1483 | } | ||
| 1484 | |||
| 1482 | /* | 1485 | /* |
| 1483 | * Run handler and ask thread to singlestep. | 1486 | * Run handler and ask thread to singlestep. |
| 1484 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. | 1487 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. |
| 1485 | */ | 1488 | */ |
| 1486 | static void handle_swbp(struct pt_regs *regs) | 1489 | static void handle_swbp(struct pt_regs *regs) |
| 1487 | { | 1490 | { |
| 1488 | struct vm_area_struct *vma; | ||
| 1489 | struct uprobe_task *utask; | 1491 | struct uprobe_task *utask; |
| 1490 | struct uprobe *uprobe; | 1492 | struct uprobe *uprobe; |
| 1491 | struct mm_struct *mm; | ||
| 1492 | unsigned long bp_vaddr; | 1493 | unsigned long bp_vaddr; |
| 1494 | int uninitialized_var(is_swbp); | ||
| 1493 | 1495 | ||
| 1494 | uprobe = NULL; | ||
| 1495 | bp_vaddr = uprobe_get_swbp_addr(regs); | 1496 | bp_vaddr = uprobe_get_swbp_addr(regs); |
| 1496 | mm = current->mm; | 1497 | uprobe = find_active_uprobe(bp_vaddr, &is_swbp); |
| 1497 | down_read(&mm->mmap_sem); | ||
| 1498 | vma = find_vma(mm, bp_vaddr); | ||
| 1499 | |||
| 1500 | if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { | ||
| 1501 | struct inode *inode; | ||
| 1502 | loff_t offset; | ||
| 1503 | |||
| 1504 | inode = vma->vm_file->f_mapping->host; | ||
| 1505 | offset = bp_vaddr - vma->vm_start; | ||
| 1506 | offset += (vma->vm_pgoff << PAGE_SHIFT); | ||
| 1507 | uprobe = find_uprobe(inode, offset); | ||
| 1508 | } | ||
| 1509 | |||
| 1510 | srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); | ||
| 1511 | current->uprobe_srcu_id = -1; | ||
| 1512 | up_read(&mm->mmap_sem); | ||
| 1513 | 1498 | ||
| 1514 | if (!uprobe) { | 1499 | if (!uprobe) { |
| 1515 | /* No matching uprobe; signal SIGTRAP. */ | 1500 | if (is_swbp > 0) { |
| 1516 | send_sig(SIGTRAP, current, 0); | 1501 | /* No matching uprobe; signal SIGTRAP. */ |
| 1502 | send_sig(SIGTRAP, current, 0); | ||
| 1503 | } else { | ||
| 1504 | /* | ||
| 1505 | * Either we raced with uprobe_unregister() or we can't | ||
| 1506 | * access this memory. The latter is only possible if | ||
| 1507 | * another thread plays with our ->mm. In both cases | ||
| 1508 | * we can simply restart. If this vma was unmapped we | ||
| 1509 | * can pretend this insn was not executed yet and get | ||
| 1510 | * the (correct) SIGSEGV after restart. | ||
| 1511 | */ | ||
| 1512 | instruction_pointer_set(regs, bp_vaddr); | ||
| 1513 | } | ||
| 1517 | return; | 1514 | return; |
| 1518 | } | 1515 | } |
| 1519 | 1516 | ||
| @@ -1620,7 +1617,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) | |||
| 1620 | utask->state = UTASK_BP_HIT; | 1617 | utask->state = UTASK_BP_HIT; |
| 1621 | 1618 | ||
| 1622 | set_thread_flag(TIF_UPROBE); | 1619 | set_thread_flag(TIF_UPROBE); |
| 1623 | current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); | ||
| 1624 | 1620 | ||
| 1625 | return 1; | 1621 | return 1; |
| 1626 | } | 1622 | } |
| @@ -1655,7 +1651,6 @@ static int __init init_uprobes(void) | |||
| 1655 | mutex_init(&uprobes_mutex[i]); | 1651 | mutex_init(&uprobes_mutex[i]); |
| 1656 | mutex_init(&uprobes_mmap_mutex[i]); | 1652 | mutex_init(&uprobes_mmap_mutex[i]); |
| 1657 | } | 1653 | } |
| 1658 | init_srcu_struct(&uprobes_srcu); | ||
| 1659 | 1654 | ||
| 1660 | return register_die_notifier(&uprobe_exception_nb); | 1655 | return register_die_notifier(&uprobe_exception_nb); |
| 1661 | } | 1656 | } |
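
Of the uprobes changes, build_map_info() is the most intricate: it may not sleep while holding i_mmap_mutex, so during the vma walk it consumes preallocated map_info nodes (GFP_NOWAIT), counts how many it was short, drops the lock, allocates the shortfall with GFP_KERNEL, and rescans. The userspace sketch below models that prealloc-and-retry pattern with a plain free list; the lock()/unlock() markers, get_node() and collect() are stand-ins, not the kernel function.

```c
#include <stdlib.h>

struct node { struct node *next; int item; };

static struct node *freelist;		/* nodes usable without sleeping */

static struct node *get_node(void)	/* models a GFP_NOWAIT kmalloc() */
{
	struct node *n = freelist;

	if (n)
		freelist = n->next;
	return n;
}

static struct node *collect(int nitems)
{
	struct node *out, *n;
	int i, more;

again:
	out = NULL;
	more = 0;
	/* lock(); -- no blocking allocations allowed from here on */
	for (i = 0; i < nitems; i++) {
		n = get_node();
		if (!n) {
			more++;			/* remember the shortfall */
			continue;
		}
		n->item = i;			/* record the match under the lock */
		n->next = out;
		out = n;
	}
	/* unlock(); */

	if (!more)
		return out;

	/* recycle partial results, then allocate the shortfall while unlocked */
	while (out) {
		n = out;
		out = out->next;
		n->next = freelist;
		freelist = n;
	}
	while (more--) {
		n = malloc(sizeof(*n));		/* models a GFP_KERNEL kmalloc() */
		if (!n)
			return NULL;		/* kernel returns ERR_PTR(-ENOMEM) */
		n->next = freelist;
		freelist = n;
	}
	goto again;				/* rescan with enough nodes in hand */
}
```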
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b9e622..f00e319d8376 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 304 | } | 304 | } |
| 305 | 305 | ||
| 306 | err = arch_dup_task_struct(tsk, orig); | 306 | err = arch_dup_task_struct(tsk, orig); |
| 307 | if (err) | ||
| 308 | goto out; | ||
| 309 | 307 | ||
| 308 | /* | ||
| 309 | * We defer looking at err, because we will need this setup | ||
| 310 | * for the clean up path to work correctly. | ||
| 311 | */ | ||
| 310 | tsk->stack = ti; | 312 | tsk->stack = ti; |
| 311 | |||
| 312 | setup_thread_stack(tsk, orig); | 313 | setup_thread_stack(tsk, orig); |
| 314 | |||
| 315 | if (err) | ||
| 316 | goto out; | ||
| 317 | |||
| 313 | clear_user_return_notifier(tsk); | 318 | clear_user_return_notifier(tsk); |
| 314 | clear_tsk_need_resched(tsk); | 319 | clear_tsk_need_resched(tsk); |
| 315 | stackend = end_of_stack(tsk); | 320 | stackend = end_of_stack(tsk); |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
| @@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
| 657 | return 0; | 657 | return 0; |
| 658 | } | 658 | } |
| 659 | 659 | ||
| 660 | static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | ||
| 661 | { | ||
| 662 | ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; | ||
| 663 | ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; | ||
| 664 | |||
| 665 | return ktime_get_update_offsets(offs_real, offs_boot); | ||
| 666 | } | ||
| 667 | |||
| 660 | /* | 668 | /* |
| 661 | * Retrigger next event is called after clock was set | 669 | * Retrigger next event is called after clock was set |
| 662 | * | 670 | * |
| @@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
| 665 | static void retrigger_next_event(void *arg) | 673 | static void retrigger_next_event(void *arg) |
| 666 | { | 674 | { |
| 667 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | 675 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); |
| 668 | struct timespec realtime_offset, xtim, wtm, sleep; | ||
| 669 | 676 | ||
| 670 | if (!hrtimer_hres_active()) | 677 | if (!hrtimer_hres_active()) |
| 671 | return; | 678 | return; |
| 672 | 679 | ||
| 673 | /* Optimized out for !HIGH_RES */ | ||
| 674 | get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); | ||
| 675 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
| 676 | |||
| 677 | /* Adjust CLOCK_REALTIME offset */ | ||
| 678 | raw_spin_lock(&base->lock); | 680 | raw_spin_lock(&base->lock); |
| 679 | base->clock_base[HRTIMER_BASE_REALTIME].offset = | 681 | hrtimer_update_base(base); |
| 680 | timespec_to_ktime(realtime_offset); | ||
| 681 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
| 682 | timespec_to_ktime(sleep); | ||
| 683 | |||
| 684 | hrtimer_force_reprogram(base, 0); | 682 | hrtimer_force_reprogram(base, 0); |
| 685 | raw_spin_unlock(&base->lock); | 683 | raw_spin_unlock(&base->lock); |
| 686 | } | 684 | } |
| @@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void) | |||
| 710 | base->clock_base[i].resolution = KTIME_HIGH_RES; | 708 | base->clock_base[i].resolution = KTIME_HIGH_RES; |
| 711 | 709 | ||
| 712 | tick_setup_sched_timer(); | 710 | tick_setup_sched_timer(); |
| 713 | |||
| 714 | /* "Retrigger" the interrupt to get things going */ | 711 | /* "Retrigger" the interrupt to get things going */ |
| 715 | retrigger_next_event(NULL); | 712 | retrigger_next_event(NULL); |
| 716 | local_irq_restore(flags); | 713 | local_irq_restore(flags); |
| 717 | return 1; | 714 | return 1; |
| 718 | } | 715 | } |
| 719 | 716 | ||
| 717 | /* | ||
| 718 | * Called from timekeeping code to reprogramm the hrtimer interrupt | ||
| 719 | * device. If called from the timer interrupt context we defer it to | ||
| 720 | * softirq context. | ||
| 721 | */ | ||
| 722 | void clock_was_set_delayed(void) | ||
| 723 | { | ||
| 724 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
| 725 | |||
| 726 | cpu_base->clock_was_set = 1; | ||
| 727 | __raise_softirq_irqoff(HRTIMER_SOFTIRQ); | ||
| 728 | } | ||
| 729 | |||
| 720 | #else | 730 | #else |
| 721 | 731 | ||
| 722 | static inline int hrtimer_hres_active(void) { return 0; } | 732 | static inline int hrtimer_hres_active(void) { return 0; } |
| @@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
| 1250 | cpu_base->nr_events++; | 1260 | cpu_base->nr_events++; |
| 1251 | dev->next_event.tv64 = KTIME_MAX; | 1261 | dev->next_event.tv64 = KTIME_MAX; |
| 1252 | 1262 | ||
| 1253 | entry_time = now = ktime_get(); | 1263 | raw_spin_lock(&cpu_base->lock); |
| 1264 | entry_time = now = hrtimer_update_base(cpu_base); | ||
| 1254 | retry: | 1265 | retry: |
| 1255 | expires_next.tv64 = KTIME_MAX; | 1266 | expires_next.tv64 = KTIME_MAX; |
| 1256 | |||
| 1257 | raw_spin_lock(&cpu_base->lock); | ||
| 1258 | /* | 1267 | /* |
| 1259 | * We set expires_next to KTIME_MAX here with cpu_base->lock | 1268 | * We set expires_next to KTIME_MAX here with cpu_base->lock |
| 1260 | * held to prevent that a timer is enqueued in our queue via | 1269 | * held to prevent that a timer is enqueued in our queue via |
| @@ -1330,8 +1339,12 @@ retry: | |||
| 1330 | * We need to prevent that we loop forever in the hrtimer | 1339 | * We need to prevent that we loop forever in the hrtimer |
| 1331 | * interrupt routine. We give it 3 attempts to avoid | 1340 | * interrupt routine. We give it 3 attempts to avoid |
| 1332 | * overreacting on some spurious event. | 1341 | * overreacting on some spurious event. |
| 1342 | * | ||
| 1343 | * Acquire base lock for updating the offsets and retrieving | ||
| 1344 | * the current time. | ||
| 1333 | */ | 1345 | */ |
| 1334 | now = ktime_get(); | 1346 | raw_spin_lock(&cpu_base->lock); |
| 1347 | now = hrtimer_update_base(cpu_base); | ||
| 1335 | cpu_base->nr_retries++; | 1348 | cpu_base->nr_retries++; |
| 1336 | if (++retries < 3) | 1349 | if (++retries < 3) |
| 1337 | goto retry; | 1350 | goto retry; |
| @@ -1343,6 +1356,7 @@ retry: | |||
| 1343 | */ | 1356 | */ |
| 1344 | cpu_base->nr_hangs++; | 1357 | cpu_base->nr_hangs++; |
| 1345 | cpu_base->hang_detected = 1; | 1358 | cpu_base->hang_detected = 1; |
| 1359 | raw_spin_unlock(&cpu_base->lock); | ||
| 1346 | delta = ktime_sub(now, entry_time); | 1360 | delta = ktime_sub(now, entry_time); |
| 1347 | if (delta.tv64 > cpu_base->max_hang_time.tv64) | 1361 | if (delta.tv64 > cpu_base->max_hang_time.tv64) |
| 1348 | cpu_base->max_hang_time = delta; | 1362 | cpu_base->max_hang_time = delta; |
| @@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void) | |||
| 1395 | 1409 | ||
| 1396 | static void run_hrtimer_softirq(struct softirq_action *h) | 1410 | static void run_hrtimer_softirq(struct softirq_action *h) |
| 1397 | { | 1411 | { |
| 1412 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
| 1413 | |||
| 1414 | if (cpu_base->clock_was_set) { | ||
| 1415 | cpu_base->clock_was_set = 0; | ||
| 1416 | clock_was_set(); | ||
| 1417 | } | ||
| 1418 | |||
| 1398 | hrtimer_peek_ahead_timers(); | 1419 | hrtimer_peek_ahead_timers(); |
| 1399 | } | 1420 | } |
| 1400 | 1421 | ||
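
Taken together, the hrtimer hunks add a deferral path for clock_was_set(): per the new comment, a caller in timer interrupt context may not invoke it directly, so it sets cpu_base->clock_was_set and raises HRTIMER_SOFTIRQ, and the softirq handler issues the real call. Condensed from the hunks above for readability; the eventual caller of clock_was_set_delayed() lives in the timekeeping code and is outside this diff.

```c
/* Record the request from a context where clock_was_set() is unsafe ... */
void clock_was_set_delayed(void)
{
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);

	cpu_base->clock_was_set = 1;
	__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}

/* ... and let the hrtimer softirq perform the real notification later. */
static void run_hrtimer_softirq(struct softirq_action *h)
{
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);

	if (cpu_base->clock_was_set) {
		cpu_base->clock_was_set = 0;
		clock_was_set();
	}

	hrtimer_peek_ahead_timers();
}
```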
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8b53db38a279..238025f5472e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
| @@ -27,7 +27,6 @@ | |||
| 27 | #include <linux/syscore_ops.h> | 27 | #include <linux/syscore_ops.h> |
| 28 | #include <linux/ctype.h> | 28 | #include <linux/ctype.h> |
| 29 | #include <linux/genhd.h> | 29 | #include <linux/genhd.h> |
| 30 | #include <scsi/scsi_scan.h> | ||
| 31 | 30 | ||
| 32 | #include "power.h" | 31 | #include "power.h" |
| 33 | 32 | ||
| @@ -748,13 +747,6 @@ static int software_resume(void) | |||
| 748 | async_synchronize_full(); | 747 | async_synchronize_full(); |
| 749 | } | 748 | } |
| 750 | 749 | ||
| 751 | /* | ||
| 752 | * We can't depend on SCSI devices being available after loading | ||
| 753 | * one of their modules until scsi_complete_async_scans() is | ||
| 754 | * called and the resume device usually is a SCSI one. | ||
| 755 | */ | ||
| 756 | scsi_complete_async_scans(); | ||
| 757 | |||
| 758 | swsusp_resume_device = name_to_dev_t(resume_file); | 750 | swsusp_resume_device = name_to_dev_t(resume_file); |
| 759 | if (!swsusp_resume_device) { | 751 | if (!swsusp_resume_device) { |
| 760 | error = -ENODEV; | 752 | error = -ENODEV; |
diff --git a/kernel/power/user.c b/kernel/power/user.c index 91b0fd021a95..4ed81e74f86f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
| @@ -24,7 +24,6 @@ | |||
| 24 | #include <linux/console.h> | 24 | #include <linux/console.h> |
| 25 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
| 26 | #include <linux/freezer.h> | 26 | #include <linux/freezer.h> |
| 27 | #include <scsi/scsi_scan.h> | ||
| 28 | 27 | ||
| 29 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
| 30 | 29 | ||
| @@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
| 84 | * appear. | 83 | * appear. |
| 85 | */ | 84 | */ |
| 86 | wait_for_device_probe(); | 85 | wait_for_device_probe(); |
| 87 | scsi_complete_async_scans(); | ||
| 88 | 86 | ||
| 89 | data->swap = -1; | 87 | data->swap = -1; |
| 90 | data->mode = O_WRONLY; | 88 | data->mode = O_WRONLY; |
diff --git a/kernel/printk.c b/kernel/printk.c index dba18211685e..ac4bc9e79465 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -194,8 +194,10 @@ static int console_may_schedule; | |||
| 194 | */ | 194 | */ |
| 195 | 195 | ||
| 196 | enum log_flags { | 196 | enum log_flags { |
| 197 | LOG_DEFAULT = 0, | 197 | LOG_NOCONS = 1, /* already flushed, do not print to console */ |
| 198 | LOG_NOCONS = 1, /* already flushed, do not print to console */ | 198 | LOG_NEWLINE = 2, /* text ended with a newline */ |
| 199 | LOG_PREFIX = 4, /* text started with a prefix */ | ||
| 200 | LOG_CONT = 8, /* text is a fragment of a continuation line */ | ||
| 199 | }; | 201 | }; |
| 200 | 202 | ||
| 201 | struct log { | 203 | struct log { |
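The new record flags are plain bit values, so one record can carry several at once. A tiny standalone illustration of combining and testing them (ordinary C, not the kernel structures):

```c
#include <stdio.h>

enum log_flags_model {
	LOG_NOCONS  = 1,	/* already flushed, skip the console */
	LOG_NEWLINE = 2,	/* text ended with a newline */
	LOG_PREFIX  = 4,	/* text started with a prefix */
	LOG_CONT    = 8,	/* fragment of a continuation line */
};

int main(void)
{
	/* a continuation fragment that did not end in '\n' */
	unsigned int flags = LOG_CONT;

	if (flags & LOG_CONT)
		printf("record continues the previous line\n");
	if (!(flags & LOG_NEWLINE))
		printf("record is an open-ended fragment\n");
	return 0;
}
```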
| @@ -217,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock); | |||
| 217 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ | 219 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ |
| 218 | static u64 syslog_seq; | 220 | static u64 syslog_seq; |
| 219 | static u32 syslog_idx; | 221 | static u32 syslog_idx; |
| 222 | static enum log_flags syslog_prev; | ||
| 223 | static size_t syslog_partial; | ||
| 220 | 224 | ||
| 221 | /* index and sequence number of the first record stored in the buffer */ | 225 | /* index and sequence number of the first record stored in the buffer */ |
| 222 | static u64 log_first_seq; | 226 | static u64 log_first_seq; |
| @@ -430,20 +434,20 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 430 | ret = mutex_lock_interruptible(&user->lock); | 434 | ret = mutex_lock_interruptible(&user->lock); |
| 431 | if (ret) | 435 | if (ret) |
| 432 | return ret; | 436 | return ret; |
| 433 | raw_spin_lock(&logbuf_lock); | 437 | raw_spin_lock_irq(&logbuf_lock); |
| 434 | while (user->seq == log_next_seq) { | 438 | while (user->seq == log_next_seq) { |
| 435 | if (file->f_flags & O_NONBLOCK) { | 439 | if (file->f_flags & O_NONBLOCK) { |
| 436 | ret = -EAGAIN; | 440 | ret = -EAGAIN; |
| 437 | raw_spin_unlock(&logbuf_lock); | 441 | raw_spin_unlock_irq(&logbuf_lock); |
| 438 | goto out; | 442 | goto out; |
| 439 | } | 443 | } |
| 440 | 444 | ||
| 441 | raw_spin_unlock(&logbuf_lock); | 445 | raw_spin_unlock_irq(&logbuf_lock); |
| 442 | ret = wait_event_interruptible(log_wait, | 446 | ret = wait_event_interruptible(log_wait, |
| 443 | user->seq != log_next_seq); | 447 | user->seq != log_next_seq); |
| 444 | if (ret) | 448 | if (ret) |
| 445 | goto out; | 449 | goto out; |
| 446 | raw_spin_lock(&logbuf_lock); | 450 | raw_spin_lock_irq(&logbuf_lock); |
| 447 | } | 451 | } |
| 448 | 452 | ||
| 449 | if (user->seq < log_first_seq) { | 453 | if (user->seq < log_first_seq) { |
| @@ -451,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 451 | user->idx = log_first_idx; | 455 | user->idx = log_first_idx; |
| 452 | user->seq = log_first_seq; | 456 | user->seq = log_first_seq; |
| 453 | ret = -EPIPE; | 457 | ret = -EPIPE; |
| 454 | raw_spin_unlock(&logbuf_lock); | 458 | raw_spin_unlock_irq(&logbuf_lock); |
| 455 | goto out; | 459 | goto out; |
| 456 | } | 460 | } |
| 457 | 461 | ||
| @@ -465,7 +469,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 465 | for (i = 0; i < msg->text_len; i++) { | 469 | for (i = 0; i < msg->text_len; i++) { |
| 466 | unsigned char c = log_text(msg)[i]; | 470 | unsigned char c = log_text(msg)[i]; |
| 467 | 471 | ||
| 468 | if (c < ' ' || c >= 128) | 472 | if (c < ' ' || c >= 127 || c == '\\') |
| 469 | len += sprintf(user->buf + len, "\\x%02x", c); | 473 | len += sprintf(user->buf + len, "\\x%02x", c); |
| 470 | else | 474 | else |
| 471 | user->buf[len++] = c; | 475 | user->buf[len++] = c; |
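The /dev/kmsg reader now also escapes the backslash itself and treats 0x7f as non-printable, so the emitted line stays unambiguous. A small standalone sketch of that escaping rule:

```c
#include <stdio.h>

/* Escape control characters, DEL/non-ASCII and '\' as \xNN. */
static int escape_char(unsigned char c, char *out)
{
	if (c < ' ' || c >= 127 || c == '\\')
		return sprintf(out, "\\x%02x", c);
	out[0] = c;
	return 1;
}

int main(void)
{
	const unsigned char msg[] = "tab:\there\\done\n";
	char buf[128];
	int len = 0;

	for (int i = 0; msg[i]; i++)
		len += escape_char(msg[i], buf + len);
	buf[len] = '\0';
	printf("%s\n", buf);	/* tab:\x09here\x5cdone\x0a */
	return 0;
}
```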
| @@ -489,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 489 | continue; | 493 | continue; |
| 490 | } | 494 | } |
| 491 | 495 | ||
| 492 | if (c < ' ' || c >= 128) { | 496 | if (c < ' ' || c >= 127 || c == '\\') { |
| 493 | len += sprintf(user->buf + len, "\\x%02x", c); | 497 | len += sprintf(user->buf + len, "\\x%02x", c); |
| 494 | continue; | 498 | continue; |
| 495 | } | 499 | } |
| @@ -501,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 501 | 505 | ||
| 502 | user->idx = log_next(user->idx); | 506 | user->idx = log_next(user->idx); |
| 503 | user->seq++; | 507 | user->seq++; |
| 504 | raw_spin_unlock(&logbuf_lock); | 508 | raw_spin_unlock_irq(&logbuf_lock); |
| 505 | 509 | ||
| 506 | if (len > count) { | 510 | if (len > count) { |
| 507 | ret = -EINVAL; | 511 | ret = -EINVAL; |
| @@ -528,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | |||
| 528 | if (offset) | 532 | if (offset) |
| 529 | return -ESPIPE; | 533 | return -ESPIPE; |
| 530 | 534 | ||
| 531 | raw_spin_lock(&logbuf_lock); | 535 | raw_spin_lock_irq(&logbuf_lock); |
| 532 | switch (whence) { | 536 | switch (whence) { |
| 533 | case SEEK_SET: | 537 | case SEEK_SET: |
| 534 | /* the first record */ | 538 | /* the first record */ |
| @@ -552,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | |||
| 552 | default: | 556 | default: |
| 553 | ret = -EINVAL; | 557 | ret = -EINVAL; |
| 554 | } | 558 | } |
| 555 | raw_spin_unlock(&logbuf_lock); | 559 | raw_spin_unlock_irq(&logbuf_lock); |
| 556 | return ret; | 560 | return ret; |
| 557 | } | 561 | } |
| 558 | 562 | ||
| @@ -566,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | |||
| 566 | 570 | ||
| 567 | poll_wait(file, &log_wait, wait); | 571 | poll_wait(file, &log_wait, wait); |
| 568 | 572 | ||
| 569 | raw_spin_lock(&logbuf_lock); | 573 | raw_spin_lock_irq(&logbuf_lock); |
| 570 | if (user->seq < log_next_seq) { | 574 | if (user->seq < log_next_seq) { |
| 571 | /* return error when data has vanished underneath us */ | 575 | /* return error when data has vanished underneath us */ |
| 572 | if (user->seq < log_first_seq) | 576 | if (user->seq < log_first_seq) |
| 573 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; | 577 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; |
| 574 | ret = POLLIN|POLLRDNORM; | 578 | ret = POLLIN|POLLRDNORM; |
| 575 | } | 579 | } |
| 576 | raw_spin_unlock(&logbuf_lock); | 580 | raw_spin_unlock_irq(&logbuf_lock); |
| 577 | 581 | ||
| 578 | return ret; | 582 | return ret; |
| 579 | } | 583 | } |
| @@ -597,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) | |||
| 597 | 601 | ||
| 598 | mutex_init(&user->lock); | 602 | mutex_init(&user->lock); |
| 599 | 603 | ||
| 600 | raw_spin_lock(&logbuf_lock); | 604 | raw_spin_lock_irq(&logbuf_lock); |
| 601 | user->idx = log_first_idx; | 605 | user->idx = log_first_idx; |
| 602 | user->seq = log_first_seq; | 606 | user->seq = log_first_seq; |
| 603 | raw_spin_unlock(&logbuf_lock); | 607 | raw_spin_unlock_irq(&logbuf_lock); |
| 604 | 608 | ||
| 605 | file->private_data = user; | 609 | file->private_data = user; |
| 606 | return 0; | 610 | return 0; |
| @@ -818,15 +822,18 @@ static size_t print_time(u64 ts, char *buf) | |||
| 818 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | 822 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) |
| 819 | { | 823 | { |
| 820 | size_t len = 0; | 824 | size_t len = 0; |
| 825 | unsigned int prefix = (msg->facility << 3) | msg->level; | ||
| 821 | 826 | ||
| 822 | if (syslog) { | 827 | if (syslog) { |
| 823 | if (buf) { | 828 | if (buf) { |
| 824 | len += sprintf(buf, "<%u>", msg->level); | 829 | len += sprintf(buf, "<%u>", prefix); |
| 825 | } else { | 830 | } else { |
| 826 | len += 3; | 831 | len += 3; |
| 827 | if (msg->level > 9) | 832 | if (prefix > 999) |
| 828 | len++; | 833 | len += 3; |
| 829 | if (msg->level > 99) | 834 | else if (prefix > 99) |
| 835 | len += 2; | ||
| 836 | else if (prefix > 9) | ||
| 830 | len++; | 837 | len++; |
| 831 | } | 838 | } |
| 832 | } | 839 | } |
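print_prefix() now emits the full syslog priority value, facility shifted left by three OR'ed with the severity, and sizes the buffer for up to four decimal digits instead of two. A quick standalone check of that computation:

```c
#include <stdio.h>

/* Syslog priority: facility in the high bits, severity in the low 3. */
static unsigned int syslog_prefix(unsigned int facility, unsigned int level)
{
	return (facility << 3) | level;
}

/* Length of "<N>" for the size-only pass: 3 chars plus extra digits. */
static int prefix_len(unsigned int prefix)
{
	int len = 3;

	if (prefix > 999)
		len += 3;
	else if (prefix > 99)
		len += 2;
	else if (prefix > 9)
		len += 1;
	return len;
}

int main(void)
{
	unsigned int p = syslog_prefix(1 /* user */, 6 /* info */);

	printf("<%u> needs %d bytes\n", p, prefix_len(p));	/* <14> needs 4 */
	return 0;
}
```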
| @@ -835,13 +842,26 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | |||
| 835 | return len; | 842 | return len; |
| 836 | } | 843 | } |
| 837 | 844 | ||
| 838 | static size_t msg_print_text(const struct log *msg, bool syslog, | 845 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, |
| 839 | char *buf, size_t size) | 846 | bool syslog, char *buf, size_t size) |
| 840 | { | 847 | { |
| 841 | const char *text = log_text(msg); | 848 | const char *text = log_text(msg); |
| 842 | size_t text_size = msg->text_len; | 849 | size_t text_size = msg->text_len; |
| 850 | bool prefix = true; | ||
| 851 | bool newline = true; | ||
| 843 | size_t len = 0; | 852 | size_t len = 0; |
| 844 | 853 | ||
| 854 | if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) | ||
| 855 | prefix = false; | ||
| 856 | |||
| 857 | if (msg->flags & LOG_CONT) { | ||
| 858 | if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) | ||
| 859 | prefix = false; | ||
| 860 | |||
| 861 | if (!(msg->flags & LOG_NEWLINE)) | ||
| 862 | newline = false; | ||
| 863 | } | ||
| 864 | |||
| 845 | do { | 865 | do { |
| 846 | const char *next = memchr(text, '\n', text_size); | 866 | const char *next = memchr(text, '\n', text_size); |
| 847 | size_t text_len; | 867 | size_t text_len; |
| @@ -859,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog, | |||
| 859 | text_len + 1>= size - len) | 879 | text_len + 1>= size - len) |
| 860 | break; | 880 | break; |
| 861 | 881 | ||
| 862 | len += print_prefix(msg, syslog, buf + len); | 882 | if (prefix) |
| 883 | len += print_prefix(msg, syslog, buf + len); | ||
| 863 | memcpy(buf + len, text, text_len); | 884 | memcpy(buf + len, text, text_len); |
| 864 | len += text_len; | 885 | len += text_len; |
| 865 | buf[len++] = '\n'; | 886 | if (next || newline) |
| 887 | buf[len++] = '\n'; | ||
| 866 | } else { | 888 | } else { |
| 867 | /* SYSLOG_ACTION_* buffer size only calculation */ | 889 | /* SYSLOG_ACTION_* buffer size only calculation */ |
| 868 | len += print_prefix(msg, syslog, NULL); | 890 | if (prefix) |
| 869 | len += text_len + 1; | 891 | len += print_prefix(msg, syslog, NULL); |
| 892 | len += text_len; | ||
| 893 | if (next || newline) | ||
| 894 | len++; | ||
| 870 | } | 895 | } |
| 871 | 896 | ||
| 897 | prefix = true; | ||
| 872 | text = next; | 898 | text = next; |
| 873 | } while (text); | 899 | } while (text); |
| 874 | 900 | ||
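msg_print_text() now looks at the flags of the previous record to decide whether a fragment gets its own prefix and trailing newline. The decision can be modeled on plain integers like this (flag names abbreviated, values as in the enum above):

```c
#include <stdbool.h>
#include <stdio.h>

enum { F_NEWLINE = 2, F_PREFIX = 4, F_CONT = 8 };

/* Same idea as the prefix/newline booleans in msg_print_text(). */
static void render_decision(unsigned int prev, unsigned int cur,
			    bool *prefix, bool *newline)
{
	*prefix = true;
	*newline = true;

	if ((prev & F_CONT) && !(cur & F_PREFIX))
		*prefix = false;

	if (cur & F_CONT) {
		if ((prev & F_CONT) && !(prev & F_NEWLINE))
			*prefix = false;
		if (!(cur & F_NEWLINE))
			*newline = false;
	}
}

int main(void)
{
	bool prefix, newline;

	/* middle fragment of a continuation line */
	render_decision(F_CONT, F_CONT, &prefix, &newline);
	printf("fragment:  prefix=%d newline=%d\n", prefix, newline);

	/* ordinary, self-contained record */
	render_decision(F_NEWLINE, F_NEWLINE | F_PREFIX, &prefix, &newline);
	printf("full line: prefix=%d newline=%d\n", prefix, newline);
	return 0;
}
```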
| @@ -887,22 +913,35 @@ static int syslog_print(char __user *buf, int size) | |||
| 887 | 913 | ||
| 888 | while (size > 0) { | 914 | while (size > 0) { |
| 889 | size_t n; | 915 | size_t n; |
| 916 | size_t skip; | ||
| 890 | 917 | ||
| 891 | raw_spin_lock_irq(&logbuf_lock); | 918 | raw_spin_lock_irq(&logbuf_lock); |
| 892 | if (syslog_seq < log_first_seq) { | 919 | if (syslog_seq < log_first_seq) { |
| 893 | /* messages are gone, move to first one */ | 920 | /* messages are gone, move to first one */ |
| 894 | syslog_seq = log_first_seq; | 921 | syslog_seq = log_first_seq; |
| 895 | syslog_idx = log_first_idx; | 922 | syslog_idx = log_first_idx; |
| 923 | syslog_prev = 0; | ||
| 924 | syslog_partial = 0; | ||
| 896 | } | 925 | } |
| 897 | if (syslog_seq == log_next_seq) { | 926 | if (syslog_seq == log_next_seq) { |
| 898 | raw_spin_unlock_irq(&logbuf_lock); | 927 | raw_spin_unlock_irq(&logbuf_lock); |
| 899 | break; | 928 | break; |
| 900 | } | 929 | } |
| 930 | |||
| 931 | skip = syslog_partial; | ||
| 901 | msg = log_from_idx(syslog_idx); | 932 | msg = log_from_idx(syslog_idx); |
| 902 | n = msg_print_text(msg, true, text, LOG_LINE_MAX); | 933 | n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); |
| 903 | if (n <= size) { | 934 | if (n - syslog_partial <= size) { |
| 935 | /* message fits into buffer, move forward */ | ||
| 904 | syslog_idx = log_next(syslog_idx); | 936 | syslog_idx = log_next(syslog_idx); |
| 905 | syslog_seq++; | 937 | syslog_seq++; |
| 938 | syslog_prev = msg->flags; | ||
| 939 | n -= syslog_partial; | ||
| 940 | syslog_partial = 0; | ||
| 941 | } else if (!len){ | ||
| 942 | /* partial read(), remember position */ | ||
| 943 | n = size; | ||
| 944 | syslog_partial += n; | ||
| 906 | } else | 945 | } else |
| 907 | n = 0; | 946 | n = 0; |
| 908 | raw_spin_unlock_irq(&logbuf_lock); | 947 | raw_spin_unlock_irq(&logbuf_lock); |
| @@ -910,17 +949,15 @@ static int syslog_print(char __user *buf, int size) | |||
| 910 | if (!n) | 949 | if (!n) |
| 911 | break; | 950 | break; |
| 912 | 951 | ||
| 913 | len += n; | 952 | if (copy_to_user(buf, text + skip, n)) { |
| 914 | size -= n; | ||
| 915 | buf += n; | ||
| 916 | n = copy_to_user(buf - n, text, n); | ||
| 917 | |||
| 918 | if (n) { | ||
| 919 | len -= n; | ||
| 920 | if (!len) | 953 | if (!len) |
| 921 | len = -EFAULT; | 954 | len = -EFAULT; |
| 922 | break; | 955 | break; |
| 923 | } | 956 | } |
| 957 | |||
| 958 | len += n; | ||
| 959 | size -= n; | ||
| 960 | buf += n; | ||
| 924 | } | 961 | } |
| 925 | 962 | ||
| 926 | kfree(text); | 963 | kfree(text); |
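The syslog_partial bookkeeping lets a read() that is smaller than one formatted record hand out what fits and resume inside the same record on the next call, instead of returning nothing. A simplified single-record model of that behaviour:

```c
#include <stdio.h>
#include <string.h>

static const char record[] = "<6>one fairly long formatted record\n";
static size_t partial;	/* bytes of the record already handed out */
static int consumed;	/* the single record in this model is done */

static size_t read_some(char *buf, size_t size)
{
	size_t total = strlen(record);
	size_t n;

	if (consumed)
		return 0;
	n = total - partial;
	if (n > size) {
		n = size;		/* partial read, remember the position */
		memcpy(buf, record + partial, n);
		partial += n;
		return n;
	}
	memcpy(buf, record + partial, n);
	partial = 0;
	consumed = 1;			/* record fully copied, move to the next */
	return n;
}

int main(void)
{
	char buf[16];
	size_t n;

	while ((n = read_some(buf, sizeof(buf))) > 0)
		printf("read %zu bytes: %.*s\n", n, (int)n, buf);
	return 0;
}
```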
| @@ -941,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 941 | u64 next_seq; | 978 | u64 next_seq; |
| 942 | u64 seq; | 979 | u64 seq; |
| 943 | u32 idx; | 980 | u32 idx; |
| 981 | enum log_flags prev; | ||
| 944 | 982 | ||
| 945 | if (clear_seq < log_first_seq) { | 983 | if (clear_seq < log_first_seq) { |
| 946 | /* messages are gone, move to first available one */ | 984 | /* messages are gone, move to first available one */ |
| @@ -954,10 +992,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 954 | */ | 992 | */ |
| 955 | seq = clear_seq; | 993 | seq = clear_seq; |
| 956 | idx = clear_idx; | 994 | idx = clear_idx; |
| 995 | prev = 0; | ||
| 957 | while (seq < log_next_seq) { | 996 | while (seq < log_next_seq) { |
| 958 | struct log *msg = log_from_idx(idx); | 997 | struct log *msg = log_from_idx(idx); |
| 959 | 998 | ||
| 960 | len += msg_print_text(msg, true, NULL, 0); | 999 | len += msg_print_text(msg, prev, true, NULL, 0); |
| 961 | idx = log_next(idx); | 1000 | idx = log_next(idx); |
| 962 | seq++; | 1001 | seq++; |
| 963 | } | 1002 | } |
| @@ -965,10 +1004,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 965 | /* move first record forward until length fits into the buffer */ | 1004 | /* move first record forward until length fits into the buffer */ |
| 966 | seq = clear_seq; | 1005 | seq = clear_seq; |
| 967 | idx = clear_idx; | 1006 | idx = clear_idx; |
| 1007 | prev = 0; | ||
| 968 | while (len > size && seq < log_next_seq) { | 1008 | while (len > size && seq < log_next_seq) { |
| 969 | struct log *msg = log_from_idx(idx); | 1009 | struct log *msg = log_from_idx(idx); |
| 970 | 1010 | ||
| 971 | len -= msg_print_text(msg, true, NULL, 0); | 1011 | len -= msg_print_text(msg, prev, true, NULL, 0); |
| 972 | idx = log_next(idx); | 1012 | idx = log_next(idx); |
| 973 | seq++; | 1013 | seq++; |
| 974 | } | 1014 | } |
| @@ -977,17 +1017,19 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 977 | next_seq = log_next_seq; | 1017 | next_seq = log_next_seq; |
| 978 | 1018 | ||
| 979 | len = 0; | 1019 | len = 0; |
| 1020 | prev = 0; | ||
| 980 | while (len >= 0 && seq < next_seq) { | 1021 | while (len >= 0 && seq < next_seq) { |
| 981 | struct log *msg = log_from_idx(idx); | 1022 | struct log *msg = log_from_idx(idx); |
| 982 | int textlen; | 1023 | int textlen; |
| 983 | 1024 | ||
| 984 | textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); | 1025 | textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); |
| 985 | if (textlen < 0) { | 1026 | if (textlen < 0) { |
| 986 | len = textlen; | 1027 | len = textlen; |
| 987 | break; | 1028 | break; |
| 988 | } | 1029 | } |
| 989 | idx = log_next(idx); | 1030 | idx = log_next(idx); |
| 990 | seq++; | 1031 | seq++; |
| 1032 | prev = msg->flags; | ||
| 991 | 1033 | ||
| 992 | raw_spin_unlock_irq(&logbuf_lock); | 1034 | raw_spin_unlock_irq(&logbuf_lock); |
| 993 | if (copy_to_user(buf + len, text, textlen)) | 1035 | if (copy_to_user(buf + len, text, textlen)) |
| @@ -1000,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 1000 | /* messages are gone, move to next one */ | 1042 | /* messages are gone, move to next one */ |
| 1001 | seq = log_first_seq; | 1043 | seq = log_first_seq; |
| 1002 | idx = log_first_idx; | 1044 | idx = log_first_idx; |
| 1045 | prev = 0; | ||
| 1003 | } | 1046 | } |
| 1004 | } | 1047 | } |
| 1005 | } | 1048 | } |
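syslog_print_all() keeps its three-pass structure, now threading the previous record's flags through each pass: add up every record, drop records from the front until the remainder fits the user buffer, then copy what is left. The sizing logic boils down to this:

```c
#include <stdio.h>

/* Model of the size calculation in syslog_print_all(). */
int main(void)
{
	const int rec_len[] = { 40, 25, 80, 10, 55 };	/* formatted sizes */
	const int nrec = 5;
	const int bufsize = 120;
	int len = 0, first = 0, i;

	for (i = 0; i < nrec; i++)		/* pass 1: total length */
		len += rec_len[i];

	while (len > bufsize && first < nrec)	/* pass 2: trim the front */
		len -= rec_len[first++];

	printf("copy records %d..%d, %d bytes into a %d byte buffer\n",
	       first, nrec - 1, len, bufsize);	/* pass 3 would copy them */
	return 0;
}
```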
| @@ -1018,7 +1061,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1018 | { | 1061 | { |
| 1019 | bool clear = false; | 1062 | bool clear = false; |
| 1020 | static int saved_console_loglevel = -1; | 1063 | static int saved_console_loglevel = -1; |
| 1021 | static DEFINE_MUTEX(syslog_mutex); | ||
| 1022 | int error; | 1064 | int error; |
| 1023 | 1065 | ||
| 1024 | error = check_syslog_permissions(type, from_file); | 1066 | error = check_syslog_permissions(type, from_file); |
| @@ -1045,17 +1087,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1045 | error = -EFAULT; | 1087 | error = -EFAULT; |
| 1046 | goto out; | 1088 | goto out; |
| 1047 | } | 1089 | } |
| 1048 | error = mutex_lock_interruptible(&syslog_mutex); | ||
| 1049 | if (error) | ||
| 1050 | goto out; | ||
| 1051 | error = wait_event_interruptible(log_wait, | 1090 | error = wait_event_interruptible(log_wait, |
| 1052 | syslog_seq != log_next_seq); | 1091 | syslog_seq != log_next_seq); |
| 1053 | if (error) { | 1092 | if (error) |
| 1054 | mutex_unlock(&syslog_mutex); | ||
| 1055 | goto out; | 1093 | goto out; |
| 1056 | } | ||
| 1057 | error = syslog_print(buf, len); | 1094 | error = syslog_print(buf, len); |
| 1058 | mutex_unlock(&syslog_mutex); | ||
| 1059 | break; | 1095 | break; |
| 1060 | /* Read/clear last kernel messages */ | 1096 | /* Read/clear last kernel messages */ |
| 1061 | case SYSLOG_ACTION_READ_CLEAR: | 1097 | case SYSLOG_ACTION_READ_CLEAR: |
| @@ -1111,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1111 | /* messages are gone, move to first one */ | 1147 | /* messages are gone, move to first one */ |
| 1112 | syslog_seq = log_first_seq; | 1148 | syslog_seq = log_first_seq; |
| 1113 | syslog_idx = log_first_idx; | 1149 | syslog_idx = log_first_idx; |
| 1150 | syslog_prev = 0; | ||
| 1151 | syslog_partial = 0; | ||
| 1114 | } | 1152 | } |
| 1115 | if (from_file) { | 1153 | if (from_file) { |
| 1116 | /* | 1154 | /* |
| @@ -1120,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1120 | */ | 1158 | */ |
| 1121 | error = log_next_idx - syslog_idx; | 1159 | error = log_next_idx - syslog_idx; |
| 1122 | } else { | 1160 | } else { |
| 1123 | u64 seq; | 1161 | u64 seq = syslog_seq; |
| 1124 | u32 idx; | 1162 | u32 idx = syslog_idx; |
| 1163 | enum log_flags prev = syslog_prev; | ||
| 1125 | 1164 | ||
| 1126 | error = 0; | 1165 | error = 0; |
| 1127 | seq = syslog_seq; | ||
| 1128 | idx = syslog_idx; | ||
| 1129 | while (seq < log_next_seq) { | 1166 | while (seq < log_next_seq) { |
| 1130 | struct log *msg = log_from_idx(idx); | 1167 | struct log *msg = log_from_idx(idx); |
| 1131 | 1168 | ||
| 1132 | error += msg_print_text(msg, true, NULL, 0); | 1169 | error += msg_print_text(msg, prev, true, NULL, 0); |
| 1133 | idx = log_next(idx); | 1170 | idx = log_next(idx); |
| 1134 | seq++; | 1171 | seq++; |
| 1172 | prev = msg->flags; | ||
| 1135 | } | 1173 | } |
| 1174 | error -= syslog_partial; | ||
| 1136 | } | 1175 | } |
| 1137 | raw_spin_unlock_irq(&logbuf_lock); | 1176 | raw_spin_unlock_irq(&logbuf_lock); |
| 1138 | break; | 1177 | break; |
| @@ -1153,21 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | |||
| 1153 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); | 1192 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
| 1154 | } | 1193 | } |
| 1155 | 1194 | ||
| 1156 | #ifdef CONFIG_KGDB_KDB | ||
| 1157 | /* kdb dmesg command needs access to the syslog buffer. do_syslog() | ||
| 1158 | * uses locks so it cannot be used during debugging. Just tell kdb | ||
| 1159 | * where the start and end of the physical and logical logs are. This | ||
| 1160 | * is equivalent to do_syslog(3). | ||
| 1161 | */ | ||
| 1162 | void kdb_syslog_data(char *syslog_data[4]) | ||
| 1163 | { | ||
| 1164 | syslog_data[0] = log_buf; | ||
| 1165 | syslog_data[1] = log_buf + log_buf_len; | ||
| 1166 | syslog_data[2] = log_buf + log_first_idx; | ||
| 1167 | syslog_data[3] = log_buf + log_next_idx; | ||
| 1168 | } | ||
| 1169 | #endif /* CONFIG_KGDB_KDB */ | ||
| 1170 | |||
| 1171 | static bool __read_mostly ignore_loglevel; | 1195 | static bool __read_mostly ignore_loglevel; |
| 1172 | 1196 | ||
| 1173 | static int __init ignore_loglevel_setup(char *str) | 1197 | static int __init ignore_loglevel_setup(char *str) |
| @@ -1400,10 +1424,9 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1400 | static char textbuf[LOG_LINE_MAX]; | 1424 | static char textbuf[LOG_LINE_MAX]; |
| 1401 | char *text = textbuf; | 1425 | char *text = textbuf; |
| 1402 | size_t text_len; | 1426 | size_t text_len; |
| 1427 | enum log_flags lflags = 0; | ||
| 1403 | unsigned long flags; | 1428 | unsigned long flags; |
| 1404 | int this_cpu; | 1429 | int this_cpu; |
| 1405 | bool newline = false; | ||
| 1406 | bool prefix = false; | ||
| 1407 | int printed_len = 0; | 1430 | int printed_len = 0; |
| 1408 | 1431 | ||
| 1409 | boot_delay_msec(); | 1432 | boot_delay_msec(); |
| @@ -1442,7 +1465,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1442 | recursion_bug = 0; | 1465 | recursion_bug = 0; |
| 1443 | printed_len += strlen(recursion_msg); | 1466 | printed_len += strlen(recursion_msg); |
| 1444 | /* emit KERN_CRIT message */ | 1467 | /* emit KERN_CRIT message */ |
| 1445 | log_store(0, 2, LOG_DEFAULT, 0, | 1468 | log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, |
| 1446 | NULL, 0, recursion_msg, printed_len); | 1469 | NULL, 0, recursion_msg, printed_len); |
| 1447 | } | 1470 | } |
| 1448 | 1471 | ||
| @@ -1455,7 +1478,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1455 | /* mark and strip a trailing newline */ | 1478 | /* mark and strip a trailing newline */ |
| 1456 | if (text_len && text[text_len-1] == '\n') { | 1479 | if (text_len && text[text_len-1] == '\n') { |
| 1457 | text_len--; | 1480 | text_len--; |
| 1458 | newline = true; | 1481 | lflags |= LOG_NEWLINE; |
| 1459 | } | 1482 | } |
| 1460 | 1483 | ||
| 1461 | /* strip syslog prefix and extract log level or control flags */ | 1484 | /* strip syslog prefix and extract log level or control flags */ |
| @@ -1465,7 +1488,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1465 | if (level == -1) | 1488 | if (level == -1) |
| 1466 | level = text[1] - '0'; | 1489 | level = text[1] - '0'; |
| 1467 | case 'd': /* KERN_DEFAULT */ | 1490 | case 'd': /* KERN_DEFAULT */ |
| 1468 | prefix = true; | 1491 | lflags |= LOG_PREFIX; |
| 1469 | case 'c': /* KERN_CONT */ | 1492 | case 'c': /* KERN_CONT */ |
| 1470 | text += 3; | 1493 | text += 3; |
| 1471 | text_len -= 3; | 1494 | text_len -= 3; |
| @@ -1475,22 +1498,20 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1475 | if (level == -1) | 1498 | if (level == -1) |
| 1476 | level = default_message_loglevel; | 1499 | level = default_message_loglevel; |
| 1477 | 1500 | ||
| 1478 | if (dict) { | 1501 | if (dict) |
| 1479 | prefix = true; | 1502 | lflags |= LOG_PREFIX|LOG_NEWLINE; |
| 1480 | newline = true; | ||
| 1481 | } | ||
| 1482 | 1503 | ||
| 1483 | if (!newline) { | 1504 | if (!(lflags & LOG_NEWLINE)) { |
| 1484 | /* | 1505 | /* |
| 1485 | * Flush the conflicting buffer. An earlier newline was missing, | 1506 | * Flush the conflicting buffer. An earlier newline was missing, |
| 1486 | * or another task also prints continuation lines. | 1507 | * or another task also prints continuation lines. |
| 1487 | */ | 1508 | */ |
| 1488 | if (cont.len && (prefix || cont.owner != current)) | 1509 | if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) |
| 1489 | cont_flush(); | 1510 | cont_flush(); |
| 1490 | 1511 | ||
| 1491 | /* buffer line if possible, otherwise store it right away */ | 1512 | /* buffer line if possible, otherwise store it right away */ |
| 1492 | if (!cont_add(facility, level, text, text_len)) | 1513 | if (!cont_add(facility, level, text, text_len)) |
| 1493 | log_store(facility, level, LOG_DEFAULT, 0, | 1514 | log_store(facility, level, lflags | LOG_CONT, 0, |
| 1494 | dict, dictlen, text, text_len); | 1515 | dict, dictlen, text, text_len); |
| 1495 | } else { | 1516 | } else { |
| 1496 | bool stored = false; | 1517 | bool stored = false; |
| @@ -1502,13 +1523,13 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1502 | * flush it out and store this line separately. | 1523 | * flush it out and store this line separately. |
| 1503 | */ | 1524 | */ |
| 1504 | if (cont.len && cont.owner == current) { | 1525 | if (cont.len && cont.owner == current) { |
| 1505 | if (!prefix) | 1526 | if (!(lflags & LOG_PREFIX)) |
| 1506 | stored = cont_add(facility, level, text, text_len); | 1527 | stored = cont_add(facility, level, text, text_len); |
| 1507 | cont_flush(); | 1528 | cont_flush(); |
| 1508 | } | 1529 | } |
| 1509 | 1530 | ||
| 1510 | if (!stored) | 1531 | if (!stored) |
| 1511 | log_store(facility, level, LOG_DEFAULT, 0, | 1532 | log_store(facility, level, lflags, 0, |
| 1512 | dict, dictlen, text, text_len); | 1533 | dict, dictlen, text, text_len); |
| 1513 | } | 1534 | } |
| 1514 | printed_len += text_len; | 1535 | printed_len += text_len; |
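The vprintk_emit() hunks replace the separate prefix/newline booleans with the lflags bitmask derived from the message header. A rough standalone model of that header handling, illustrative only and not the kernel parser:

```c
#include <stdio.h>

enum { LF_NEWLINE = 2, LF_PREFIX = 4, LF_CONT = 8 };

/* Mark and strip a trailing '\n', then strip an optional "<x>" marker
 * and derive the level and LOG_* style flags from it. */
static const char *classify(char *text, size_t *len, int *level,
			    unsigned int *lflags)
{
	if (*len && text[*len - 1] == '\n') {
		(*len)--;
		*lflags |= LF_NEWLINE;
	}

	if (*len >= 3 && text[0] == '<' && text[2] == '>') {
		char c = text[1];

		if (c >= '0' && c <= '7') {
			*level = c - '0';
			*lflags |= LF_PREFIX;
		} else if (c == 'd') {		/* KERN_DEFAULT */
			*lflags |= LF_PREFIX;
		} else if (c != 'c') {		/* not KERN_CONT either */
			return text;		/* leave unknown markers alone */
		}
		text += 3;
		*len -= 3;
	}
	return text;
}

int main(void)
{
	char msg[] = "<6>hello world\n";
	size_t len = sizeof(msg) - 1;
	int level = -1;
	unsigned int lflags = 0;
	const char *body = classify(msg, &len, &level, &lflags);

	printf("level=%d flags=%#x body=%.*s\n", level, lflags, (int)len, body);
	return 0;
}
```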
| @@ -1607,8 +1628,8 @@ static struct cont { | |||
| 1607 | static struct log *log_from_idx(u32 idx) { return NULL; } | 1628 | static struct log *log_from_idx(u32 idx) { return NULL; } |
| 1608 | static u32 log_next(u32 idx) { return 0; } | 1629 | static u32 log_next(u32 idx) { return 0; } |
| 1609 | static void call_console_drivers(int level, const char *text, size_t len) {} | 1630 | static void call_console_drivers(int level, const char *text, size_t len) {} |
| 1610 | static size_t msg_print_text(const struct log *msg, bool syslog, | 1631 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, |
| 1611 | char *buf, size_t size) { return 0; } | 1632 | bool syslog, char *buf, size_t size) { return 0; } |
| 1612 | static size_t cont_print_text(char *text, size_t size) { return 0; } | 1633 | static size_t cont_print_text(char *text, size_t size) { return 0; } |
| 1613 | 1634 | ||
| 1614 | #endif /* CONFIG_PRINTK */ | 1635 | #endif /* CONFIG_PRINTK */ |
| @@ -1884,6 +1905,7 @@ void wake_up_klogd(void) | |||
| 1884 | /* the next printk record to write to the console */ | 1905 | /* the next printk record to write to the console */ |
| 1885 | static u64 console_seq; | 1906 | static u64 console_seq; |
| 1886 | static u32 console_idx; | 1907 | static u32 console_idx; |
| 1908 | static enum log_flags console_prev; | ||
| 1887 | 1909 | ||
| 1888 | /** | 1910 | /** |
| 1889 | * console_unlock - unlock the console system | 1911 | * console_unlock - unlock the console system |
| @@ -1944,6 +1966,7 @@ again: | |||
| 1944 | /* messages are gone, move to first one */ | 1966 | /* messages are gone, move to first one */ |
| 1945 | console_seq = log_first_seq; | 1967 | console_seq = log_first_seq; |
| 1946 | console_idx = log_first_idx; | 1968 | console_idx = log_first_idx; |
| 1969 | console_prev = 0; | ||
| 1947 | } | 1970 | } |
| 1948 | skip: | 1971 | skip: |
| 1949 | if (console_seq == log_next_seq) | 1972 | if (console_seq == log_next_seq) |
| @@ -1957,14 +1980,21 @@ skip: | |||
| 1957 | */ | 1980 | */ |
| 1958 | console_idx = log_next(console_idx); | 1981 | console_idx = log_next(console_idx); |
| 1959 | console_seq++; | 1982 | console_seq++; |
| 1983 | /* | ||
| 1984 | * We will get here again when we register a new | ||
| 1985 | * CON_PRINTBUFFER console. Clear the flag so we | ||
| 1986 | * will properly dump everything later. | ||
| 1987 | */ | ||
| 1988 | msg->flags &= ~LOG_NOCONS; | ||
| 1960 | goto skip; | 1989 | goto skip; |
| 1961 | } | 1990 | } |
| 1962 | 1991 | ||
| 1963 | level = msg->level; | 1992 | level = msg->level; |
| 1964 | len = msg_print_text(msg, false, text, sizeof(text)); | 1993 | len = msg_print_text(msg, console_prev, false, |
| 1965 | 1994 | text, sizeof(text)); | |
| 1966 | console_idx = log_next(console_idx); | 1995 | console_idx = log_next(console_idx); |
| 1967 | console_seq++; | 1996 | console_seq++; |
| 1997 | console_prev = msg->flags; | ||
| 1968 | raw_spin_unlock(&logbuf_lock); | 1998 | raw_spin_unlock(&logbuf_lock); |
| 1969 | 1999 | ||
| 1970 | stop_critical_timings(); /* don't trace print latency */ | 2000 | stop_critical_timings(); /* don't trace print latency */ |
| @@ -2227,6 +2257,7 @@ void register_console(struct console *newcon) | |||
| 2227 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2257 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
| 2228 | console_seq = syslog_seq; | 2258 | console_seq = syslog_seq; |
| 2229 | console_idx = syslog_idx; | 2259 | console_idx = syslog_idx; |
| 2260 | console_prev = syslog_prev; | ||
| 2230 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2261 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
| 2231 | /* | 2262 | /* |
| 2232 | * We're about to replay the log buffer. Only do this to the | 2263 | * We're about to replay the log buffer. Only do this to the |
| @@ -2479,7 +2510,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
| 2479 | } | 2510 | } |
| 2480 | 2511 | ||
| 2481 | /** | 2512 | /** |
| 2482 | * kmsg_dump_get_line - retrieve one kmsg log line | 2513 | * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) |
| 2483 | * @dumper: registered kmsg dumper | 2514 | * @dumper: registered kmsg dumper |
| 2484 | * @syslog: include the "<4>" prefixes | 2515 | * @syslog: include the "<4>" prefixes |
| 2485 | * @line: buffer to copy the line to | 2516 | * @line: buffer to copy the line to |
| @@ -2494,11 +2525,12 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
| 2494 | * | 2525 | * |
| 2495 | * A return value of FALSE indicates that there are no more records to | 2526 | * A return value of FALSE indicates that there are no more records to |
| 2496 | * read. | 2527 | * read. |
| 2528 | * | ||
| 2529 | * The function is similar to kmsg_dump_get_line(), but grabs no locks. | ||
| 2497 | */ | 2530 | */ |
| 2498 | bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, | 2531 | bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, |
| 2499 | char *line, size_t size, size_t *len) | 2532 | char *line, size_t size, size_t *len) |
| 2500 | { | 2533 | { |
| 2501 | unsigned long flags; | ||
| 2502 | struct log *msg; | 2534 | struct log *msg; |
| 2503 | size_t l = 0; | 2535 | size_t l = 0; |
| 2504 | bool ret = false; | 2536 | bool ret = false; |
| @@ -2506,7 +2538,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, | |||
| 2506 | if (!dumper->active) | 2538 | if (!dumper->active) |
| 2507 | goto out; | 2539 | goto out; |
| 2508 | 2540 | ||
| 2509 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
| 2510 | if (dumper->cur_seq < log_first_seq) { | 2541 | if (dumper->cur_seq < log_first_seq) { |
| 2511 | /* messages are gone, move to first available one */ | 2542 | /* messages are gone, move to first available one */ |
| 2512 | dumper->cur_seq = log_first_seq; | 2543 | dumper->cur_seq = log_first_seq; |
| @@ -2514,24 +2545,50 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, | |||
| 2514 | } | 2545 | } |
| 2515 | 2546 | ||
| 2516 | /* last entry */ | 2547 | /* last entry */ |
| 2517 | if (dumper->cur_seq >= log_next_seq) { | 2548 | if (dumper->cur_seq >= log_next_seq) |
| 2518 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
| 2519 | goto out; | 2549 | goto out; |
| 2520 | } | ||
| 2521 | 2550 | ||
| 2522 | msg = log_from_idx(dumper->cur_idx); | 2551 | msg = log_from_idx(dumper->cur_idx); |
| 2523 | l = msg_print_text(msg, syslog, | 2552 | l = msg_print_text(msg, 0, syslog, line, size); |
| 2524 | line, size); | ||
| 2525 | 2553 | ||
| 2526 | dumper->cur_idx = log_next(dumper->cur_idx); | 2554 | dumper->cur_idx = log_next(dumper->cur_idx); |
| 2527 | dumper->cur_seq++; | 2555 | dumper->cur_seq++; |
| 2528 | ret = true; | 2556 | ret = true; |
| 2529 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
| 2530 | out: | 2557 | out: |
| 2531 | if (len) | 2558 | if (len) |
| 2532 | *len = l; | 2559 | *len = l; |
| 2533 | return ret; | 2560 | return ret; |
| 2534 | } | 2561 | } |
| 2562 | |||
| 2563 | /** | ||
| 2564 | * kmsg_dump_get_line - retrieve one kmsg log line | ||
| 2565 | * @dumper: registered kmsg dumper | ||
| 2566 | * @syslog: include the "<4>" prefixes | ||
| 2567 | * @line: buffer to copy the line to | ||
| 2568 | * @size: maximum size of the buffer | ||
| 2569 | * @len: length of line placed into buffer | ||
| 2570 | * | ||
| 2571 | * Start at the beginning of the kmsg buffer, with the oldest kmsg | ||
| 2572 | * record, and copy one record into the provided buffer. | ||
| 2573 | * | ||
| 2574 | * Consecutive calls will return the next available record moving | ||
| 2575 | * towards the end of the buffer with the youngest messages. | ||
| 2576 | * | ||
| 2577 | * A return value of FALSE indicates that there are no more records to | ||
| 2578 | * read. | ||
| 2579 | */ | ||
| 2580 | bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, | ||
| 2581 | char *line, size_t size, size_t *len) | ||
| 2582 | { | ||
| 2583 | unsigned long flags; | ||
| 2584 | bool ret; | ||
| 2585 | |||
| 2586 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
| 2587 | ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); | ||
| 2588 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
| 2589 | |||
| 2590 | return ret; | ||
| 2591 | } | ||
| 2535 | EXPORT_SYMBOL_GPL(kmsg_dump_get_line); | 2592 | EXPORT_SYMBOL_GPL(kmsg_dump_get_line); |
| 2536 | 2593 | ||
| 2537 | /** | 2594 | /** |
| @@ -2561,6 +2618,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | |||
| 2561 | u32 idx; | 2618 | u32 idx; |
| 2562 | u64 next_seq; | 2619 | u64 next_seq; |
| 2563 | u32 next_idx; | 2620 | u32 next_idx; |
| 2621 | enum log_flags prev; | ||
| 2564 | size_t l = 0; | 2622 | size_t l = 0; |
| 2565 | bool ret = false; | 2623 | bool ret = false; |
| 2566 | 2624 | ||
| @@ -2583,23 +2641,27 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | |||
| 2583 | /* calculate length of entire buffer */ | 2641 | /* calculate length of entire buffer */ |
| 2584 | seq = dumper->cur_seq; | 2642 | seq = dumper->cur_seq; |
| 2585 | idx = dumper->cur_idx; | 2643 | idx = dumper->cur_idx; |
| 2644 | prev = 0; | ||
| 2586 | while (seq < dumper->next_seq) { | 2645 | while (seq < dumper->next_seq) { |
| 2587 | struct log *msg = log_from_idx(idx); | 2646 | struct log *msg = log_from_idx(idx); |
| 2588 | 2647 | ||
| 2589 | l += msg_print_text(msg, true, NULL, 0); | 2648 | l += msg_print_text(msg, prev, true, NULL, 0); |
| 2590 | idx = log_next(idx); | 2649 | idx = log_next(idx); |
| 2591 | seq++; | 2650 | seq++; |
| 2651 | prev = msg->flags; | ||
| 2592 | } | 2652 | } |
| 2593 | 2653 | ||
| 2594 | /* move first record forward until length fits into the buffer */ | 2654 | /* move first record forward until length fits into the buffer */ |
| 2595 | seq = dumper->cur_seq; | 2655 | seq = dumper->cur_seq; |
| 2596 | idx = dumper->cur_idx; | 2656 | idx = dumper->cur_idx; |
| 2657 | prev = 0; | ||
| 2597 | while (l > size && seq < dumper->next_seq) { | 2658 | while (l > size && seq < dumper->next_seq) { |
| 2598 | struct log *msg = log_from_idx(idx); | 2659 | struct log *msg = log_from_idx(idx); |
| 2599 | 2660 | ||
| 2600 | l -= msg_print_text(msg, true, NULL, 0); | 2661 | l -= msg_print_text(msg, prev, true, NULL, 0); |
| 2601 | idx = log_next(idx); | 2662 | idx = log_next(idx); |
| 2602 | seq++; | 2663 | seq++; |
| 2664 | prev = msg->flags; | ||
| 2603 | } | 2665 | } |
| 2604 | 2666 | ||
| 2605 | /* last message in next interation */ | 2667 | /* last message in next interation */ |
| @@ -2607,14 +2669,14 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | |||
| 2607 | next_idx = idx; | 2669 | next_idx = idx; |
| 2608 | 2670 | ||
| 2609 | l = 0; | 2671 | l = 0; |
| 2672 | prev = 0; | ||
| 2610 | while (seq < dumper->next_seq) { | 2673 | while (seq < dumper->next_seq) { |
| 2611 | struct log *msg = log_from_idx(idx); | 2674 | struct log *msg = log_from_idx(idx); |
| 2612 | 2675 | ||
| 2613 | l += msg_print_text(msg, syslog, | 2676 | l += msg_print_text(msg, prev, syslog, buf + l, size - l); |
| 2614 | buf + l, size - l); | ||
| 2615 | |||
| 2616 | idx = log_next(idx); | 2677 | idx = log_next(idx); |
| 2617 | seq++; | 2678 | seq++; |
| 2679 | prev = msg->flags; | ||
| 2618 | } | 2680 | } |
| 2619 | 2681 | ||
| 2620 | dumper->next_seq = next_seq; | 2682 | dumper->next_seq = next_seq; |
| @@ -2629,6 +2691,24 @@ out: | |||
| 2629 | EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); | 2691 | EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); |
| 2630 | 2692 | ||
| 2631 | /** | 2693 | /** |
| 2694 | * kmsg_dump_rewind_nolock - reset the interator (unlocked version) | ||
| 2695 | * @dumper: registered kmsg dumper | ||
| 2696 | * | ||
| 2697 | * Reset the dumper's iterator so that kmsg_dump_get_line() and | ||
| 2698 | * kmsg_dump_get_buffer() can be called again and used multiple | ||
| 2699 | * times within the same dumper.dump() callback. | ||
| 2700 | * | ||
| 2701 | * The function is similar to kmsg_dump_rewind(), but grabs no locks. | ||
| 2702 | */ | ||
| 2703 | void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) | ||
| 2704 | { | ||
| 2705 | dumper->cur_seq = clear_seq; | ||
| 2706 | dumper->cur_idx = clear_idx; | ||
| 2707 | dumper->next_seq = log_next_seq; | ||
| 2708 | dumper->next_idx = log_next_idx; | ||
| 2709 | } | ||
| 2710 | |||
| 2711 | /** | ||
| 2632 | * kmsg_dump_rewind - reset the interator | 2712 | * kmsg_dump_rewind - reset the interator |
| 2633 | * @dumper: registered kmsg dumper | 2713 | * @dumper: registered kmsg dumper |
| 2634 | * | 2714 | * |
| @@ -2641,10 +2721,7 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) | |||
| 2641 | unsigned long flags; | 2721 | unsigned long flags; |
| 2642 | 2722 | ||
| 2643 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2723 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
| 2644 | dumper->cur_seq = clear_seq; | 2724 | kmsg_dump_rewind_nolock(dumper); |
| 2645 | dumper->cur_idx = clear_idx; | ||
| 2646 | dumper->next_seq = log_next_seq; | ||
| 2647 | dumper->next_idx = log_next_idx; | ||
| 2648 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2725 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
| 2649 | } | 2726 | } |
| 2650 | EXPORT_SYMBOL_GPL(kmsg_dump_rewind); | 2727 | EXPORT_SYMBOL_GPL(kmsg_dump_rewind); |
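For reference, a hedged sketch of how an in-kernel dumper might consume the locked line API shown above. The dump callback signature and the kmsg_dump_register()/kmsg_dump_unregister() helpers are assumed from this kernel era, so treat this as a sketch rather than a drop-in module:

```c
#include <linux/kmsg_dump.h>
#include <linux/module.h>

static char line[256];

/* Copy the log out one formatted line at a time when a dump fires. */
static void example_dump(struct kmsg_dumper *dumper,
			 enum kmsg_dump_reason reason)
{
	size_t len;

	kmsg_dump_rewind(dumper);
	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) {
		/* hand 'line'/'len' to the backing store here */
	}
}

static struct kmsg_dumper example_dumper = {
	.dump = example_dump,
};

static int __init example_dumper_init(void)
{
	return kmsg_dump_register(&example_dumper);
}

static void __exit example_dumper_exit(void)
{
	kmsg_dump_unregister(&example_dumper);
}

module_init(example_dumper_init);
module_exit(example_dumper_exit);
MODULE_LICENSE("GPL");
```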
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 95cba41ce1e9..4e6a61b15e86 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -54,6 +54,50 @@ | |||
| 54 | #ifdef CONFIG_PREEMPT_RCU | 54 | #ifdef CONFIG_PREEMPT_RCU |
| 55 | 55 | ||
| 56 | /* | 56 | /* |
| 57 | * Preemptible RCU implementation for rcu_read_lock(). | ||
| 58 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | ||
| 59 | * if we block. | ||
| 60 | */ | ||
| 61 | void __rcu_read_lock(void) | ||
| 62 | { | ||
| 63 | current->rcu_read_lock_nesting++; | ||
| 64 | barrier(); /* critical section after entry code. */ | ||
| 65 | } | ||
| 66 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
| 67 | |||
| 68 | /* | ||
| 69 | * Preemptible RCU implementation for rcu_read_unlock(). | ||
| 70 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | ||
| 71 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | ||
| 72 | * invoke rcu_read_unlock_special() to clean up after a context switch | ||
| 73 | * in an RCU read-side critical section and other special cases. | ||
| 74 | */ | ||
| 75 | void __rcu_read_unlock(void) | ||
| 76 | { | ||
| 77 | struct task_struct *t = current; | ||
| 78 | |||
| 79 | if (t->rcu_read_lock_nesting != 1) { | ||
| 80 | --t->rcu_read_lock_nesting; | ||
| 81 | } else { | ||
| 82 | barrier(); /* critical section before exit code. */ | ||
| 83 | t->rcu_read_lock_nesting = INT_MIN; | ||
| 84 | barrier(); /* assign before ->rcu_read_unlock_special load */ | ||
| 85 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
| 86 | rcu_read_unlock_special(t); | ||
| 87 | barrier(); /* ->rcu_read_unlock_special load before assign */ | ||
| 88 | t->rcu_read_lock_nesting = 0; | ||
| 89 | } | ||
| 90 | #ifdef CONFIG_PROVE_LOCKING | ||
| 91 | { | ||
| 92 | int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
| 93 | |||
| 94 | WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); | ||
| 95 | } | ||
| 96 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
| 97 | } | ||
| 98 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
| 99 | |||
| 100 | /* | ||
| 57 | * Check for a task exiting while in a preemptible-RCU read-side | 101 | * Check for a task exiting while in a preemptible-RCU read-side |
| 58 | * critical section, clean up if so. No need to issue warnings, | 102 | * critical section, clean up if so. No need to issue warnings, |
| 59 | * as debug_check_no_locks_held() already does this if lockdep | 103 | * as debug_check_no_locks_held() already does this if lockdep |
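The __rcu_read_lock()/__rcu_read_unlock() bodies above are what a preemptible-RCU reader ultimately executes. An illustrative reader/updater pair, not part of this patch, showing where those calls sit in ordinary kernel code:

```c
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct cfg {
	int threshold;
};

static struct cfg __rcu *active_cfg;

/* Reader: the whole access sits inside rcu_read_lock()/rcu_read_unlock(),
 * which for preemptible RCU resolve to the functions shown above. */
static int read_threshold(void)
{
	struct cfg *c;
	int val = 0;

	rcu_read_lock();
	c = rcu_dereference(active_cfg);
	if (c)
		val = c->threshold;
	rcu_read_unlock();
	return val;
}

/* Updater: publish a new version, wait for readers, free the old one. */
static int update_threshold(int val)
{
	struct cfg *newc, *old;

	newc = kmalloc(sizeof(*newc), GFP_KERNEL);
	if (!newc)
		return -ENOMEM;
	newc->threshold = val;

	old = rcu_dereference_protected(active_cfg, 1);
	rcu_assign_pointer(active_cfg, newc);
	synchronize_rcu();
	kfree(old);
	return 0;
}
```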
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 37a5444204d2..547b1fe5b052 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
| @@ -172,7 +172,7 @@ void rcu_irq_enter(void) | |||
| 172 | local_irq_restore(flags); | 172 | local_irq_restore(flags); |
| 173 | } | 173 | } |
| 174 | 174 | ||
| 175 | #ifdef CONFIG_PROVE_RCU | 175 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 176 | 176 | ||
| 177 | /* | 177 | /* |
| 178 | * Test whether RCU thinks that the current CPU is idle. | 178 | * Test whether RCU thinks that the current CPU is idle. |
| @@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void) | |||
| 183 | } | 183 | } |
| 184 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 184 | EXPORT_SYMBOL(rcu_is_cpu_idle); |
| 185 | 185 | ||
| 186 | #endif /* #ifdef CONFIG_PROVE_RCU */ | 186 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
| 187 | 187 | ||
| 188 | /* | 188 | /* |
| 189 | * Test whether the current CPU was interrupted from idle. Nested | 189 | * Test whether the current CPU was interrupted from idle. Nested |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index fc31a2d65100..918fd1e8509c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
| @@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | |||
| 132 | RCU_TRACE(.rcb.name = "rcu_preempt") | 132 | RCU_TRACE(.rcb.name = "rcu_preempt") |
| 133 | }; | 133 | }; |
| 134 | 134 | ||
| 135 | static void rcu_read_unlock_special(struct task_struct *t); | ||
| 136 | static int rcu_preempted_readers_exp(void); | 135 | static int rcu_preempted_readers_exp(void); |
| 137 | static void rcu_report_exp_done(void); | 136 | static void rcu_report_exp_done(void); |
| 138 | 137 | ||
| @@ -351,8 +350,9 @@ static int rcu_initiate_boost(void) | |||
| 351 | rcu_preempt_ctrlblk.boost_tasks = | 350 | rcu_preempt_ctrlblk.boost_tasks = |
| 352 | rcu_preempt_ctrlblk.gp_tasks; | 351 | rcu_preempt_ctrlblk.gp_tasks; |
| 353 | invoke_rcu_callbacks(); | 352 | invoke_rcu_callbacks(); |
| 354 | } else | 353 | } else { |
| 355 | RCU_TRACE(rcu_initiate_boost_trace()); | 354 | RCU_TRACE(rcu_initiate_boost_trace()); |
| 355 | } | ||
| 356 | return 1; | 356 | return 1; |
| 357 | } | 357 | } |
| 358 | 358 | ||
| @@ -527,23 +527,11 @@ void rcu_preempt_note_context_switch(void) | |||
| 527 | } | 527 | } |
| 528 | 528 | ||
| 529 | /* | 529 | /* |
| 530 | * Tiny-preemptible RCU implementation for rcu_read_lock(). | ||
| 531 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | ||
| 532 | * if we block. | ||
| 533 | */ | ||
| 534 | void __rcu_read_lock(void) | ||
| 535 | { | ||
| 536 | current->rcu_read_lock_nesting++; | ||
| 537 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ | ||
| 538 | } | ||
| 539 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
| 540 | |||
| 541 | /* | ||
| 542 | * Handle special cases during rcu_read_unlock(), such as needing to | 530 | * Handle special cases during rcu_read_unlock(), such as needing to |
| 543 | * notify RCU core processing or task having blocked during the RCU | 531 | * notify RCU core processing or task having blocked during the RCU |
| 544 | * read-side critical section. | 532 | * read-side critical section. |
| 545 | */ | 533 | */ |
| 546 | static noinline void rcu_read_unlock_special(struct task_struct *t) | 534 | void rcu_read_unlock_special(struct task_struct *t) |
| 547 | { | 535 | { |
| 548 | int empty; | 536 | int empty; |
| 549 | int empty_exp; | 537 | int empty_exp; |
| @@ -627,38 +615,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
| 627 | } | 615 | } |
| 628 | 616 | ||
| 629 | /* | 617 | /* |
| 630 | * Tiny-preemptible RCU implementation for rcu_read_unlock(). | ||
| 631 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | ||
| 632 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | ||
| 633 | * invoke rcu_read_unlock_special() to clean up after a context switch | ||
| 634 | * in an RCU read-side critical section and other special cases. | ||
| 635 | */ | ||
| 636 | void __rcu_read_unlock(void) | ||
| 637 | { | ||
| 638 | struct task_struct *t = current; | ||
| 639 | |||
| 640 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ | ||
| 641 | if (t->rcu_read_lock_nesting != 1) | ||
| 642 | --t->rcu_read_lock_nesting; | ||
| 643 | else { | ||
| 644 | t->rcu_read_lock_nesting = INT_MIN; | ||
| 645 | barrier(); /* assign before ->rcu_read_unlock_special load */ | ||
| 646 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
| 647 | rcu_read_unlock_special(t); | ||
| 648 | barrier(); /* ->rcu_read_unlock_special load before assign */ | ||
| 649 | t->rcu_read_lock_nesting = 0; | ||
| 650 | } | ||
| 651 | #ifdef CONFIG_PROVE_LOCKING | ||
| 652 | { | ||
| 653 | int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
| 654 | |||
| 655 | WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); | ||
| 656 | } | ||
| 657 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
| 658 | } | ||
| 659 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
| 660 | |||
| 661 | /* | ||
| 662 | * Check for a quiescent state from the current CPU. When a task blocks, | 618 | * Check for a quiescent state from the current CPU. When a task blocks, |
| 663 | * the task is recorded in the rcu_preempt_ctrlblk structure, which is | 619 | * the task is recorded in the rcu_preempt_ctrlblk structure, which is |
| 664 | * checked elsewhere. This is called from the scheduling-clock interrupt. | 620 | * checked elsewhere. This is called from the scheduling-clock interrupt. |
| @@ -823,9 +779,9 @@ void synchronize_rcu_expedited(void) | |||
| 823 | rpcp->exp_tasks = NULL; | 779 | rpcp->exp_tasks = NULL; |
| 824 | 780 | ||
| 825 | /* Wait for tail of ->blkd_tasks list to drain. */ | 781 | /* Wait for tail of ->blkd_tasks list to drain. */ |
| 826 | if (!rcu_preempted_readers_exp()) | 782 | if (!rcu_preempted_readers_exp()) { |
| 827 | local_irq_restore(flags); | 783 | local_irq_restore(flags); |
| 828 | else { | 784 | } else { |
| 829 | rcu_initiate_boost(); | 785 | rcu_initiate_boost(); |
| 830 | local_irq_restore(flags); | 786 | local_irq_restore(flags); |
| 831 | wait_event(sync_rcu_preempt_exp_wq, | 787 | wait_event(sync_rcu_preempt_exp_wq, |
| @@ -846,8 +802,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
| 846 | */ | 802 | */ |
| 847 | int rcu_preempt_needs_cpu(void) | 803 | int rcu_preempt_needs_cpu(void) |
| 848 | { | 804 | { |
| 849 | if (!rcu_preempt_running_reader()) | ||
| 850 | rcu_preempt_cpu_qs(); | ||
| 851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | 805 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; |
| 852 | } | 806 | } |
| 853 | 807 | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e66b34ab7555..25b15033c61f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -49,8 +49,7 @@ | |||
| 49 | #include <asm/byteorder.h> | 49 | #include <asm/byteorder.h> |
| 50 | 50 | ||
| 51 | MODULE_LICENSE("GPL"); | 51 | MODULE_LICENSE("GPL"); |
| 52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " | 52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); |
| 53 | "Josh Triplett <josh@freedesktop.org>"); | ||
| 54 | 53 | ||
| 55 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | 54 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ |
| 56 | static int nfakewriters = 4; /* # fake writer threads */ | 55 | static int nfakewriters = 4; /* # fake writer threads */ |
| @@ -206,6 +205,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */ | |||
| 206 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 205 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
| 207 | /* and boost task create/destroy. */ | 206 | /* and boost task create/destroy. */ |
| 208 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ | 207 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ |
| 208 | static bool barrier_phase; /* Test phase. */ | ||
| 209 | static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ | 209 | static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ |
| 210 | static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ | 210 | static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ |
| 211 | static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); | 211 | static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); |
| @@ -407,8 +407,9 @@ rcu_torture_cb(struct rcu_head *p) | |||
| 407 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | 407 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { |
| 408 | rp->rtort_mbtest = 0; | 408 | rp->rtort_mbtest = 0; |
| 409 | rcu_torture_free(rp); | 409 | rcu_torture_free(rp); |
| 410 | } else | 410 | } else { |
| 411 | cur_ops->deferred_free(rp); | 411 | cur_ops->deferred_free(rp); |
| 412 | } | ||
| 412 | } | 413 | } |
| 413 | 414 | ||
| 414 | static int rcu_no_completed(void) | 415 | static int rcu_no_completed(void) |
| @@ -635,6 +636,17 @@ static void srcu_torture_synchronize(void) | |||
| 635 | synchronize_srcu(&srcu_ctl); | 636 | synchronize_srcu(&srcu_ctl); |
| 636 | } | 637 | } |
| 637 | 638 | ||
| 639 | static void srcu_torture_call(struct rcu_head *head, | ||
| 640 | void (*func)(struct rcu_head *head)) | ||
| 641 | { | ||
| 642 | call_srcu(&srcu_ctl, head, func); | ||
| 643 | } | ||
| 644 | |||
| 645 | static void srcu_torture_barrier(void) | ||
| 646 | { | ||
| 647 | srcu_barrier(&srcu_ctl); | ||
| 648 | } | ||
| 649 | |||
| 638 | static int srcu_torture_stats(char *page) | 650 | static int srcu_torture_stats(char *page) |
| 639 | { | 651 | { |
| 640 | int cnt = 0; | 652 | int cnt = 0; |
| @@ -661,8 +673,8 @@ static struct rcu_torture_ops srcu_ops = { | |||
| 661 | .completed = srcu_torture_completed, | 673 | .completed = srcu_torture_completed, |
| 662 | .deferred_free = srcu_torture_deferred_free, | 674 | .deferred_free = srcu_torture_deferred_free, |
| 663 | .sync = srcu_torture_synchronize, | 675 | .sync = srcu_torture_synchronize, |
| 664 | .call = NULL, | 676 | .call = srcu_torture_call, |
| 665 | .cb_barrier = NULL, | 677 | .cb_barrier = srcu_torture_barrier, |
| 666 | .stats = srcu_torture_stats, | 678 | .stats = srcu_torture_stats, |
| 667 | .name = "srcu" | 679 | .name = "srcu" |
| 668 | }; | 680 | }; |
| @@ -1013,7 +1025,11 @@ rcu_torture_fakewriter(void *arg) | |||
| 1013 | do { | 1025 | do { |
| 1014 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); | 1026 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); |
| 1015 | udelay(rcu_random(&rand) & 0x3ff); | 1027 | udelay(rcu_random(&rand) & 0x3ff); |
| 1016 | cur_ops->sync(); | 1028 | if (cur_ops->cb_barrier != NULL && |
| 1029 | rcu_random(&rand) % (nfakewriters * 8) == 0) | ||
| 1030 | cur_ops->cb_barrier(); | ||
| 1031 | else | ||
| 1032 | cur_ops->sync(); | ||
| 1017 | rcu_stutter_wait("rcu_torture_fakewriter"); | 1033 | rcu_stutter_wait("rcu_torture_fakewriter"); |
| 1018 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 1034 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
| 1019 | 1035 | ||
| @@ -1183,27 +1199,27 @@ rcu_torture_printk(char *page) | |||
| 1183 | } | 1199 | } |
| 1184 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | 1200 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
| 1185 | cnt += sprintf(&page[cnt], | 1201 | cnt += sprintf(&page[cnt], |
| 1186 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1202 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", |
| 1187 | "rtmbe: %d rtbke: %ld rtbre: %ld " | ||
| 1188 | "rtbf: %ld rtb: %ld nt: %ld " | ||
| 1189 | "onoff: %ld/%ld:%ld/%ld " | ||
| 1190 | "barrier: %ld/%ld:%ld", | ||
| 1191 | rcu_torture_current, | 1203 | rcu_torture_current, |
| 1192 | rcu_torture_current_version, | 1204 | rcu_torture_current_version, |
| 1193 | list_empty(&rcu_torture_freelist), | 1205 | list_empty(&rcu_torture_freelist), |
| 1194 | atomic_read(&n_rcu_torture_alloc), | 1206 | atomic_read(&n_rcu_torture_alloc), |
| 1195 | atomic_read(&n_rcu_torture_alloc_fail), | 1207 | atomic_read(&n_rcu_torture_alloc_fail), |
| 1196 | atomic_read(&n_rcu_torture_free), | 1208 | atomic_read(&n_rcu_torture_free)); |
| 1209 | cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", | ||
| 1197 | atomic_read(&n_rcu_torture_mberror), | 1210 | atomic_read(&n_rcu_torture_mberror), |
| 1198 | n_rcu_torture_boost_ktrerror, | 1211 | n_rcu_torture_boost_ktrerror, |
| 1199 | n_rcu_torture_boost_rterror, | 1212 | n_rcu_torture_boost_rterror); |
| 1213 | cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", | ||
| 1200 | n_rcu_torture_boost_failure, | 1214 | n_rcu_torture_boost_failure, |
| 1201 | n_rcu_torture_boosts, | 1215 | n_rcu_torture_boosts, |
| 1202 | n_rcu_torture_timers, | 1216 | n_rcu_torture_timers); |
| 1217 | cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", | ||
| 1203 | n_online_successes, | 1218 | n_online_successes, |
| 1204 | n_online_attempts, | 1219 | n_online_attempts, |
| 1205 | n_offline_successes, | 1220 | n_offline_successes, |
| 1206 | n_offline_attempts, | 1221 | n_offline_attempts); |
| 1222 | cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", | ||
| 1207 | n_barrier_successes, | 1223 | n_barrier_successes, |
| 1208 | n_barrier_attempts, | 1224 | n_barrier_attempts, |
| 1209 | n_rcu_torture_barrier_error); | 1225 | n_rcu_torture_barrier_error); |
| @@ -1445,8 +1461,7 @@ rcu_torture_shutdown(void *arg) | |||
| 1445 | delta = shutdown_time - jiffies_snap; | 1461 | delta = shutdown_time - jiffies_snap; |
| 1446 | if (verbose) | 1462 | if (verbose) |
| 1447 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1463 | printk(KERN_ALERT "%s" TORTURE_FLAG |
| 1448 | "rcu_torture_shutdown task: %lu " | 1464 | "rcu_torture_shutdown task: %lu jiffies remaining\n", |
| 1449 | "jiffies remaining\n", | ||
| 1450 | torture_type, delta); | 1465 | torture_type, delta); |
| 1451 | schedule_timeout_interruptible(delta); | 1466 | schedule_timeout_interruptible(delta); |
| 1452 | jiffies_snap = ACCESS_ONCE(jiffies); | 1467 | jiffies_snap = ACCESS_ONCE(jiffies); |
| @@ -1498,8 +1513,7 @@ rcu_torture_onoff(void *arg) | |||
| 1498 | if (cpu_down(cpu) == 0) { | 1513 | if (cpu_down(cpu) == 0) { |
| 1499 | if (verbose) | 1514 | if (verbose) |
| 1500 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1515 | printk(KERN_ALERT "%s" TORTURE_FLAG |
| 1501 | "rcu_torture_onoff task: " | 1516 | "rcu_torture_onoff task: offlined %d\n", |
| 1502 | "offlined %d\n", | ||
| 1503 | torture_type, cpu); | 1517 | torture_type, cpu); |
| 1504 | n_offline_successes++; | 1518 | n_offline_successes++; |
| 1505 | } | 1519 | } |
| @@ -1512,8 +1526,7 @@ rcu_torture_onoff(void *arg) | |||
| 1512 | if (cpu_up(cpu) == 0) { | 1526 | if (cpu_up(cpu) == 0) { |
| 1513 | if (verbose) | 1527 | if (verbose) |
| 1514 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1528 | printk(KERN_ALERT "%s" TORTURE_FLAG |
| 1515 | "rcu_torture_onoff task: " | 1529 | "rcu_torture_onoff task: onlined %d\n", |
| 1516 | "onlined %d\n", | ||
| 1517 | torture_type, cpu); | 1530 | torture_type, cpu); |
| 1518 | n_online_successes++; | 1531 | n_online_successes++; |
| 1519 | } | 1532 | } |
| @@ -1631,6 +1644,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu) | |||
| 1631 | static int rcu_torture_barrier_cbs(void *arg) | 1644 | static int rcu_torture_barrier_cbs(void *arg) |
| 1632 | { | 1645 | { |
| 1633 | long myid = (long)arg; | 1646 | long myid = (long)arg; |
| 1647 | bool lastphase = 0; | ||
| 1634 | struct rcu_head rcu; | 1648 | struct rcu_head rcu; |
| 1635 | 1649 | ||
| 1636 | init_rcu_head_on_stack(&rcu); | 1650 | init_rcu_head_on_stack(&rcu); |
| @@ -1638,9 +1652,11 @@ static int rcu_torture_barrier_cbs(void *arg) | |||
| 1638 | set_user_nice(current, 19); | 1652 | set_user_nice(current, 19); |
| 1639 | do { | 1653 | do { |
| 1640 | wait_event(barrier_cbs_wq[myid], | 1654 | wait_event(barrier_cbs_wq[myid], |
| 1641 | atomic_read(&barrier_cbs_count) == n_barrier_cbs || | 1655 | barrier_phase != lastphase || |
| 1642 | kthread_should_stop() || | 1656 | kthread_should_stop() || |
| 1643 | fullstop != FULLSTOP_DONTSTOP); | 1657 | fullstop != FULLSTOP_DONTSTOP); |
| 1658 | lastphase = barrier_phase; | ||
| 1659 | smp_mb(); /* ensure barrier_phase load before ->call(). */ | ||
| 1644 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | 1660 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) |
| 1645 | break; | 1661 | break; |
| 1646 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); | 1662 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); |
| @@ -1665,7 +1681,8 @@ static int rcu_torture_barrier(void *arg) | |||
| 1665 | do { | 1681 | do { |
| 1666 | atomic_set(&barrier_cbs_invoked, 0); | 1682 | atomic_set(&barrier_cbs_invoked, 0); |
| 1667 | atomic_set(&barrier_cbs_count, n_barrier_cbs); | 1683 | atomic_set(&barrier_cbs_count, n_barrier_cbs); |
| 1668 | /* wake_up() path contains the required barriers. */ | 1684 | smp_mb(); /* Ensure barrier_phase after prior assignments. */ |
| 1685 | barrier_phase = !barrier_phase; | ||
| 1669 | for (i = 0; i < n_barrier_cbs; i++) | 1686 | for (i = 0; i < n_barrier_cbs; i++) |
| 1670 | wake_up(&barrier_cbs_wq[i]); | 1687 | wake_up(&barrier_cbs_wq[i]); |
| 1671 | wait_event(barrier_wq, | 1688 | wait_event(barrier_wq, |
| @@ -1684,7 +1701,7 @@ static int rcu_torture_barrier(void *arg) | |||
| 1684 | schedule_timeout_interruptible(HZ / 10); | 1701 | schedule_timeout_interruptible(HZ / 10); |
| 1685 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 1702 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
| 1686 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); | 1703 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); |
| 1687 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | 1704 | rcutorture_shutdown_absorb("rcu_torture_barrier"); |
| 1688 | while (!kthread_should_stop()) | 1705 | while (!kthread_should_stop()) |
| 1689 | schedule_timeout_interruptible(1); | 1706 | schedule_timeout_interruptible(1); |
| 1690 | return 0; | 1707 | return 0; |
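The two hunks above replace the per-callback counter wakeup with a phase flag: rcu_torture_barrier() flips barrier_phase (ordered after the counter resets by smp_mb()) and wakes every helper, and each rcu_torture_barrier_cbs() thread waits until the phase differs from the value it last serviced, so a wakeup issued before the helper goes to sleep cannot be lost. Below is a hedged userspace analog of that flip-and-wait handshake; a pthread mutex and condition variable stand in for wait_event()/wake_up() and the explicit memory barriers, and all names and counts are illustrative only.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace analog (not kernel code) of the barrier_phase handshake:
 * the master flips a phase flag and wakes every worker; each worker
 * waits until the phase differs from the value it last serviced, so a
 * wakeup that arrives before the worker sleeps is never lost. */
#define NWORKERS	2
#define NROUNDS		3

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool phase;
static int acks;		/* analog of barrier_cbs_invoked */

static void *worker(void *arg)
{
	bool lastphase = false;	/* analog of the new lastphase local */

	for (int round = 0; round < NROUNDS; round++) {
		pthread_mutex_lock(&lock);
		while (phase == lastphase)		/* wait_event() analog */
			pthread_cond_wait(&cond, &lock);
		lastphase = phase;			/* remember serviced phase */
		printf("worker %ld saw round %d\n", (long)arg, round);
		acks++;					/* "callback posted" */
		pthread_cond_broadcast(&cond);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t tids[NWORKERS];

	for (long i = 0; i < NWORKERS; i++)
		pthread_create(&tids[i], NULL, worker, (void *)i);

	for (int round = 0; round < NROUNDS; round++) {
		pthread_mutex_lock(&lock);
		acks = 0;
		phase = !phase;			/* barrier_phase = !barrier_phase */
		pthread_cond_broadcast(&cond);	/* wake_up() on every queue */
		while (acks < NWORKERS)		/* wait for all helpers to act */
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);
	}
	for (int i = 0; i < NWORKERS; i++)
		pthread_join(tids[i], NULL);
	return 0;
}
```

Because each worker checks the phase value itself rather than counting wakeups, a slow worker that was not yet asleep when the master flipped the flag still makes progress the next time it examines the predicate.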
| @@ -1908,8 +1925,8 @@ rcu_torture_init(void) | |||
| 1908 | static struct rcu_torture_ops *torture_ops[] = | 1925 | static struct rcu_torture_ops *torture_ops[] = |
| 1909 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1926 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
| 1910 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1927 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
| 1911 | &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, | 1928 | &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, |
| 1912 | &srcu_raw_sync_ops, &srcu_expedited_ops, | 1929 | &srcu_raw_ops, &srcu_raw_sync_ops, |
| 1913 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1930 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
| 1914 | 1931 | ||
| 1915 | mutex_lock(&fullstop_mutex); | 1932 | mutex_lock(&fullstop_mutex); |
| @@ -1931,8 +1948,7 @@ rcu_torture_init(void) | |||
| 1931 | return -EINVAL; | 1948 | return -EINVAL; |
| 1932 | } | 1949 | } |
| 1933 | if (cur_ops->fqs == NULL && fqs_duration != 0) { | 1950 | if (cur_ops->fqs == NULL && fqs_duration != 0) { |
| 1934 | printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " | 1951 | printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); |
| 1935 | "fqs_duration, fqs disabled.\n"); | ||
| 1936 | fqs_duration = 0; | 1952 | fqs_duration = 0; |
| 1937 | } | 1953 | } |
| 1938 | if (cur_ops->init) | 1954 | if (cur_ops->init) |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 38ecdda3f55f..f280e542e3e9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
| @@ -60,36 +60,44 @@ | |||
| 60 | 60 | ||
| 61 | /* Data structures. */ | 61 | /* Data structures. */ |
| 62 | 62 | ||
| 63 | static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | 63 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
| 64 | 64 | ||
| 65 | #define RCU_STATE_INITIALIZER(structname) { \ | 65 | #define RCU_STATE_INITIALIZER(sname, cr) { \ |
| 66 | .level = { &structname##_state.node[0] }, \ | 66 | .level = { &sname##_state.node[0] }, \ |
| 67 | .levelcnt = { \ | 67 | .call = cr, \ |
| 68 | NUM_RCU_LVL_0, /* root of hierarchy. */ \ | ||
| 69 | NUM_RCU_LVL_1, \ | ||
| 70 | NUM_RCU_LVL_2, \ | ||
| 71 | NUM_RCU_LVL_3, \ | ||
| 72 | NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ | ||
| 73 | }, \ | ||
| 74 | .fqs_state = RCU_GP_IDLE, \ | 68 | .fqs_state = RCU_GP_IDLE, \ |
| 75 | .gpnum = -300, \ | 69 | .gpnum = -300, \ |
| 76 | .completed = -300, \ | 70 | .completed = -300, \ |
| 77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ | 71 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ |
| 78 | .orphan_nxttail = &structname##_state.orphan_nxtlist, \ | 72 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ |
| 79 | .orphan_donetail = &structname##_state.orphan_donelist, \ | 73 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
| 80 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ | 74 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
| 81 | .n_force_qs = 0, \ | 75 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ |
| 82 | .n_force_qs_ngp = 0, \ | 76 | .name = #sname, \ |
| 83 | .name = #structname, \ | ||
| 84 | } | 77 | } |
| 85 | 78 | ||
| 86 | struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); | 79 | struct rcu_state rcu_sched_state = |
| 80 | RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); | ||
| 87 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | 81 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); |
| 88 | 82 | ||
| 89 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); | 83 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); |
| 90 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 84 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
| 91 | 85 | ||
| 92 | static struct rcu_state *rcu_state; | 86 | static struct rcu_state *rcu_state; |
| 87 | LIST_HEAD(rcu_struct_flavors); | ||
| 88 | |||
| 89 | /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ | ||
| 90 | static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; | ||
| 91 | module_param(rcu_fanout_leaf, int, 0); | ||
| 92 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | ||
| 93 | static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ | ||
| 94 | NUM_RCU_LVL_0, | ||
| 95 | NUM_RCU_LVL_1, | ||
| 96 | NUM_RCU_LVL_2, | ||
| 97 | NUM_RCU_LVL_3, | ||
| 98 | NUM_RCU_LVL_4, | ||
| 99 | }; | ||
| 100 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ | ||
| 93 | 101 | ||
| 94 | /* | 102 | /* |
| 95 | * The rcu_scheduler_active variable transitions from zero to one just | 103 | * The rcu_scheduler_active variable transitions from zero to one just |
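The reworked RCU_STATE_INITIALIZER() leans on two preprocessor idioms: #sname stringizes the flavor name for the .name field, and sname##_state pastes it into the identifier of the flavor's own global, so a single macro can initialize rcu_sched_state and rcu_bh_state with their own names and call_rcu functions. A minimal self-contained sketch of the same idiom follows; the struct and its fields are invented for illustration and are not the kernel's rcu_state.

```c
#include <stdio.h>

/* Illustration of the stringize (#) and token-paste (##) idiom used by
 * RCU_STATE_INITIALIZER(sname, cr): the macro derives both a string form
 * of the flavor name and a reference back to the flavor's own global. */
struct flavor_state {
	const char *name;		/* filled from #sname */
	struct flavor_state *self;	/* filled from &sname##_state */
	void (*call)(void);		/* per-flavor callback, like .call = cr */
};

static void call_a(void) { puts("flavor a callback"); }
static void call_b(void) { puts("flavor b callback"); }

#define FLAVOR_INITIALIZER(sname, cr) {	\
	.name = #sname,			\
	.self = &sname##_state,		\
	.call = cr,			\
}

struct flavor_state a_state = FLAVOR_INITIALIZER(a, call_a);
struct flavor_state b_state = FLAVOR_INITIALIZER(b, call_b);

int main(void)
{
	printf("%s %d\n", a_state.name, a_state.self == &a_state);	/* "a 1" */
	b_state.call();		/* "flavor b callback" */
	return 0;
}
```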
| @@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
| 147 | unsigned long rcutorture_testseq; | 155 | unsigned long rcutorture_testseq; |
| 148 | unsigned long rcutorture_vernum; | 156 | unsigned long rcutorture_vernum; |
| 149 | 157 | ||
| 150 | /* State information for rcu_barrier() and friends. */ | ||
| 151 | |||
| 152 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | ||
| 153 | static atomic_t rcu_barrier_cpu_count; | ||
| 154 | static DEFINE_MUTEX(rcu_barrier_mutex); | ||
| 155 | static struct completion rcu_barrier_completion; | ||
| 156 | |||
| 157 | /* | 158 | /* |
| 158 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 159 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
| 159 | * permit this function to be invoked without holding the root rcu_node | 160 | * permit this function to be invoked without holding the root rcu_node |
| @@ -201,6 +202,7 @@ void rcu_note_context_switch(int cpu) | |||
| 201 | { | 202 | { |
| 202 | trace_rcu_utilization("Start context switch"); | 203 | trace_rcu_utilization("Start context switch"); |
| 203 | rcu_sched_qs(cpu); | 204 | rcu_sched_qs(cpu); |
| 205 | rcu_preempt_note_context_switch(cpu); | ||
| 204 | trace_rcu_utilization("End context switch"); | 206 | trace_rcu_utilization("End context switch"); |
| 205 | } | 207 | } |
| 206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 208 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
| @@ -357,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | |||
| 357 | struct task_struct *idle = idle_task(smp_processor_id()); | 359 | struct task_struct *idle = idle_task(smp_processor_id()); |
| 358 | 360 | ||
| 359 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); | 361 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); |
| 360 | ftrace_dump(DUMP_ALL); | 362 | ftrace_dump(DUMP_ORIG); |
| 361 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 363 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
| 362 | current->pid, current->comm, | 364 | current->pid, current->comm, |
| 363 | idle->pid, idle->comm); /* must be idle task! */ | 365 | idle->pid, idle->comm); /* must be idle task! */ |
| @@ -467,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | |||
| 467 | 469 | ||
| 468 | trace_rcu_dyntick("Error on exit: not idle task", | 470 | trace_rcu_dyntick("Error on exit: not idle task", |
| 469 | oldval, rdtp->dynticks_nesting); | 471 | oldval, rdtp->dynticks_nesting); |
| 470 | ftrace_dump(DUMP_ALL); | 472 | ftrace_dump(DUMP_ORIG); |
| 471 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 473 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
| 472 | current->pid, current->comm, | 474 | current->pid, current->comm, |
| 473 | idle->pid, idle->comm); /* must be idle task! */ | 475 | idle->pid, idle->comm); /* must be idle task! */ |
| @@ -584,8 +586,6 @@ void rcu_nmi_exit(void) | |||
| 584 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 586 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
| 585 | } | 587 | } |
| 586 | 588 | ||
| 587 | #ifdef CONFIG_PROVE_RCU | ||
| 588 | |||
| 589 | /** | 589 | /** |
| 590 | * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle | 590 | * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle |
| 591 | * | 591 | * |
| @@ -603,7 +603,7 @@ int rcu_is_cpu_idle(void) | |||
| 603 | } | 603 | } |
| 604 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 604 | EXPORT_SYMBOL(rcu_is_cpu_idle); |
| 605 | 605 | ||
| 606 | #ifdef CONFIG_HOTPLUG_CPU | 606 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) |
| 607 | 607 | ||
| 608 | /* | 608 | /* |
| 609 | * Is the current CPU online? Disable preemption to avoid false positives | 609 | * Is the current CPU online? Disable preemption to avoid false positives |
| @@ -644,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void) | |||
| 644 | } | 644 | } |
| 645 | EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); | 645 | EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); |
| 646 | 646 | ||
| 647 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 647 | #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ |
| 648 | |||
| 649 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
| 650 | 648 | ||
| 651 | /** | 649 | /** |
| 652 | * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle | 650 | * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle |
| @@ -732,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 732 | int cpu; | 730 | int cpu; |
| 733 | long delta; | 731 | long delta; |
| 734 | unsigned long flags; | 732 | unsigned long flags; |
| 735 | int ndetected; | 733 | int ndetected = 0; |
| 736 | struct rcu_node *rnp = rcu_get_root(rsp); | 734 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 737 | 735 | ||
| 738 | /* Only let one CPU complain about others per time interval. */ | 736 | /* Only let one CPU complain about others per time interval. */ |
| @@ -773,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 773 | */ | 771 | */ |
| 774 | rnp = rcu_get_root(rsp); | 772 | rnp = rcu_get_root(rsp); |
| 775 | raw_spin_lock_irqsave(&rnp->lock, flags); | 773 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 776 | ndetected = rcu_print_task_stall(rnp); | 774 | ndetected += rcu_print_task_stall(rnp); |
| 777 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 775 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 778 | 776 | ||
| 779 | print_cpu_stall_info_end(); | 777 | print_cpu_stall_info_end(); |
| @@ -859,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | |||
| 859 | */ | 857 | */ |
| 860 | void rcu_cpu_stall_reset(void) | 858 | void rcu_cpu_stall_reset(void) |
| 861 | { | 859 | { |
| 862 | rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; | 860 | struct rcu_state *rsp; |
| 863 | rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; | 861 | |
| 864 | rcu_preempt_stall_reset(); | 862 | for_each_rcu_flavor(rsp) |
| 863 | rsp->jiffies_stall = jiffies + ULONG_MAX / 2; | ||
| 865 | } | 864 | } |
| 866 | 865 | ||
| 867 | static struct notifier_block rcu_panic_block = { | 866 | static struct notifier_block rcu_panic_block = { |
| @@ -893,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct | |||
| 893 | if (rnp->qsmask & rdp->grpmask) { | 892 | if (rnp->qsmask & rdp->grpmask) { |
| 894 | rdp->qs_pending = 1; | 893 | rdp->qs_pending = 1; |
| 895 | rdp->passed_quiesce = 0; | 894 | rdp->passed_quiesce = 0; |
| 896 | } else | 895 | } else { |
| 897 | rdp->qs_pending = 0; | 896 | rdp->qs_pending = 0; |
| 897 | } | ||
| 898 | zero_cpu_stall_ticks(rdp); | 898 | zero_cpu_stall_ticks(rdp); |
| 899 | } | 899 | } |
| 900 | } | 900 | } |
| @@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 936 | } | 936 | } |
| 937 | 937 | ||
| 938 | /* | 938 | /* |
| 939 | * Initialize the specified rcu_data structure's callback list to empty. | ||
| 940 | */ | ||
| 941 | static void init_callback_list(struct rcu_data *rdp) | ||
| 942 | { | ||
| 943 | int i; | ||
| 944 | |||
| 945 | rdp->nxtlist = NULL; | ||
| 946 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
| 947 | rdp->nxttail[i] = &rdp->nxtlist; | ||
| 948 | } | ||
| 949 | |||
| 950 | /* | ||
| 939 | * Advance this CPU's callbacks, but only if the current grace period | 951 | * Advance this CPU's callbacks, but only if the current grace period |
| 940 | * has ended. This may be called only from the CPU to whom the rdp | 952 | * has ended. This may be called only from the CPU to whom the rdp |
| 941 | * belongs. In addition, the corresponding leaf rcu_node structure's | 953 | * belongs. In addition, the corresponding leaf rcu_node structure's |
| @@ -1327,8 +1339,6 @@ static void | |||
| 1327 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | 1339 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, |
| 1328 | struct rcu_node *rnp, struct rcu_data *rdp) | 1340 | struct rcu_node *rnp, struct rcu_data *rdp) |
| 1329 | { | 1341 | { |
| 1330 | int i; | ||
| 1331 | |||
| 1332 | /* | 1342 | /* |
| 1333 | * Orphan the callbacks. First adjust the counts. This is safe | 1343 | * Orphan the callbacks. First adjust the counts. This is safe |
| 1334 | * because ->onofflock excludes _rcu_barrier()'s adoption of | 1344 | * because ->onofflock excludes _rcu_barrier()'s adoption of |
| @@ -1339,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
| 1339 | rsp->qlen += rdp->qlen; | 1349 | rsp->qlen += rdp->qlen; |
| 1340 | rdp->n_cbs_orphaned += rdp->qlen; | 1350 | rdp->n_cbs_orphaned += rdp->qlen; |
| 1341 | rdp->qlen_lazy = 0; | 1351 | rdp->qlen_lazy = 0; |
| 1342 | rdp->qlen = 0; | 1352 | ACCESS_ONCE(rdp->qlen) = 0; |
| 1343 | } | 1353 | } |
| 1344 | 1354 | ||
| 1345 | /* | 1355 | /* |
| @@ -1368,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
| 1368 | } | 1378 | } |
| 1369 | 1379 | ||
| 1370 | /* Finally, initialize the rcu_data structure's list to empty. */ | 1380 | /* Finally, initialize the rcu_data structure's list to empty. */ |
| 1371 | rdp->nxtlist = NULL; | 1381 | init_callback_list(rdp); |
| 1372 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
| 1373 | rdp->nxttail[i] = &rdp->nxtlist; | ||
| 1374 | } | 1382 | } |
| 1375 | 1383 | ||
| 1376 | /* | 1384 | /* |
| @@ -1504,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
| 1504 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1512 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1505 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1513 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
| 1506 | rcu_report_exp_rnp(rsp, rnp, true); | 1514 | rcu_report_exp_rnp(rsp, rnp, true); |
| 1515 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | ||
| 1516 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | ||
| 1517 | cpu, rdp->qlen, rdp->nxtlist); | ||
| 1507 | } | 1518 | } |
| 1508 | 1519 | ||
| 1509 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1520 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
| @@ -1591,7 +1602,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1591 | } | 1602 | } |
| 1592 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | 1603 | smp_mb(); /* List handling before counting for rcu_barrier(). */ |
| 1593 | rdp->qlen_lazy -= count_lazy; | 1604 | rdp->qlen_lazy -= count_lazy; |
| 1594 | rdp->qlen -= count; | 1605 | ACCESS_ONCE(rdp->qlen) -= count; |
| 1595 | rdp->n_cbs_invoked += count; | 1606 | rdp->n_cbs_invoked += count; |
| 1596 | 1607 | ||
| 1597 | /* Reinstate batch limit if we have worked down the excess. */ | 1608 | /* Reinstate batch limit if we have worked down the excess. */ |
| @@ -1604,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1604 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1615 | rdp->n_force_qs_snap = rsp->n_force_qs; |
| 1605 | } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) | 1616 | } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) |
| 1606 | rdp->qlen_last_fqs_check = rdp->qlen; | 1617 | rdp->qlen_last_fqs_check = rdp->qlen; |
| 1618 | WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); | ||
| 1607 | 1619 | ||
| 1608 | local_irq_restore(flags); | 1620 | local_irq_restore(flags); |
| 1609 | 1621 | ||
| @@ -1744,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
| 1744 | break; /* grace period idle or initializing, ignore. */ | 1756 | break; /* grace period idle or initializing, ignore. */ |
| 1745 | 1757 | ||
| 1746 | case RCU_SAVE_DYNTICK: | 1758 | case RCU_SAVE_DYNTICK: |
| 1747 | if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) | ||
| 1748 | break; /* So gcc recognizes the dead code. */ | ||
| 1749 | 1759 | ||
| 1750 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 1760 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
| 1751 | 1761 | ||
| @@ -1787,9 +1797,10 @@ unlock_fqs_ret: | |||
| 1787 | * whom the rdp belongs. | 1797 | * whom the rdp belongs. |
| 1788 | */ | 1798 | */ |
| 1789 | static void | 1799 | static void |
| 1790 | __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | 1800 | __rcu_process_callbacks(struct rcu_state *rsp) |
| 1791 | { | 1801 | { |
| 1792 | unsigned long flags; | 1802 | unsigned long flags; |
| 1803 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
| 1793 | 1804 | ||
| 1794 | WARN_ON_ONCE(rdp->beenonline == 0); | 1805 | WARN_ON_ONCE(rdp->beenonline == 0); |
| 1795 | 1806 | ||
| @@ -1825,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1825 | */ | 1836 | */ |
| 1826 | static void rcu_process_callbacks(struct softirq_action *unused) | 1837 | static void rcu_process_callbacks(struct softirq_action *unused) |
| 1827 | { | 1838 | { |
| 1839 | struct rcu_state *rsp; | ||
| 1840 | |||
| 1828 | trace_rcu_utilization("Start RCU core"); | 1841 | trace_rcu_utilization("Start RCU core"); |
| 1829 | __rcu_process_callbacks(&rcu_sched_state, | 1842 | for_each_rcu_flavor(rsp) |
| 1830 | &__get_cpu_var(rcu_sched_data)); | 1843 | __rcu_process_callbacks(rsp); |
| 1831 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | ||
| 1832 | rcu_preempt_process_callbacks(); | ||
| 1833 | trace_rcu_utilization("End RCU core"); | 1844 | trace_rcu_utilization("End RCU core"); |
| 1834 | } | 1845 | } |
| 1835 | 1846 | ||
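The pattern above recurs throughout this commit: hard-coded calls for rcu_sched, rcu_bh and rcu_preempt are replaced by a walk over every registered rcu_state via for_each_rcu_flavor(), which iterates the rcu_struct_flavors list that rcu_init_one() now populates. A simplified userspace sketch of the same registration-plus-iteration idea is below; a minimal singly linked list stands in for the kernel's list_head machinery, and the names are illustrative only.

```c
#include <stdio.h>

/* Simplified sketch of the for_each_rcu_flavor() refactoring: each flavor
 * registers itself on a global list, and code that used to call each
 * flavor by name now just iterates the list. */
struct flavor {
	const char *name;
	struct flavor *next;
};

static struct flavor *flavors;

static void register_flavor(struct flavor *f)
{
	f->next = flavors;		/* analog of list_add(&rsp->flavors, ...) */
	flavors = f;
}

#define for_each_flavor(f) for ((f) = flavors; (f); (f) = (f)->next)

static struct flavor sched_flavor = { .name = "rcu_sched" };
static struct flavor bh_flavor = { .name = "rcu_bh" };

int main(void)
{
	struct flavor *f;

	register_flavor(&sched_flavor);
	register_flavor(&bh_flavor);
	for_each_flavor(f)		/* replaces one explicit call per flavor */
		printf("process callbacks for %s\n", f->name);
	return 0;
}
```

Adding a new flavor then means adding one list entry rather than touching every caller, which is exactly what the rcu_process_callbacks(), rcu_pending() and CPU-hotplug hunks in this patch exploit.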
| @@ -1856,6 +1867,56 @@ static void invoke_rcu_core(void) | |||
| 1856 | raise_softirq(RCU_SOFTIRQ); | 1867 | raise_softirq(RCU_SOFTIRQ); |
| 1857 | } | 1868 | } |
| 1858 | 1869 | ||
| 1870 | /* | ||
| 1871 | * Handle any core-RCU processing required by a call_rcu() invocation. | ||
| 1872 | */ | ||
| 1873 | static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | ||
| 1874 | struct rcu_head *head, unsigned long flags) | ||
| 1875 | { | ||
| 1876 | /* | ||
| 1877 | * If called from an extended quiescent state, invoke the RCU | ||
| 1878 | * core in order to force a re-evaluation of RCU's idleness. | ||
| 1879 | */ | ||
| 1880 | if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) | ||
| 1881 | invoke_rcu_core(); | ||
| 1882 | |||
| 1883 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ | ||
| 1884 | if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) | ||
| 1885 | return; | ||
| 1886 | |||
| 1887 | /* | ||
| 1888 | * Force the grace period if too many callbacks or too long waiting. | ||
| 1889 | * Enforce hysteresis, and don't invoke force_quiescent_state() | ||
| 1890 | * if some other CPU has recently done so. Also, don't bother | ||
| 1891 | * invoking force_quiescent_state() if the newly enqueued callback | ||
| 1892 | * is the only one waiting for a grace period to complete. | ||
| 1893 | */ | ||
| 1894 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | ||
| 1895 | |||
| 1896 | /* Are we ignoring a completed grace period? */ | ||
| 1897 | rcu_process_gp_end(rsp, rdp); | ||
| 1898 | check_for_new_grace_period(rsp, rdp); | ||
| 1899 | |||
| 1900 | /* Start a new grace period if one not already started. */ | ||
| 1901 | if (!rcu_gp_in_progress(rsp)) { | ||
| 1902 | unsigned long nestflag; | ||
| 1903 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 1904 | |||
| 1905 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
| 1906 | rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ | ||
| 1907 | } else { | ||
| 1908 | /* Give the grace period a kick. */ | ||
| 1909 | rdp->blimit = LONG_MAX; | ||
| 1910 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | ||
| 1911 | *rdp->nxttail[RCU_DONE_TAIL] != head) | ||
| 1912 | force_quiescent_state(rsp, 0); | ||
| 1913 | rdp->n_force_qs_snap = rsp->n_force_qs; | ||
| 1914 | rdp->qlen_last_fqs_check = rdp->qlen; | ||
| 1915 | } | ||
| 1916 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | ||
| 1917 | force_quiescent_state(rsp, 1); | ||
| 1918 | } | ||
| 1919 | |||
| 1859 | static void | 1920 | static void |
| 1860 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | 1921 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), |
| 1861 | struct rcu_state *rsp, bool lazy) | 1922 | struct rcu_state *rsp, bool lazy) |
| @@ -1880,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1880 | rdp = this_cpu_ptr(rsp->rda); | 1941 | rdp = this_cpu_ptr(rsp->rda); |
| 1881 | 1942 | ||
| 1882 | /* Add the callback to our list. */ | 1943 | /* Add the callback to our list. */ |
| 1883 | rdp->qlen++; | 1944 | ACCESS_ONCE(rdp->qlen)++; |
| 1884 | if (lazy) | 1945 | if (lazy) |
| 1885 | rdp->qlen_lazy++; | 1946 | rdp->qlen_lazy++; |
| 1886 | else | 1947 | else |
| @@ -1895,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1895 | else | 1956 | else |
| 1896 | trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); | 1957 | trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); |
| 1897 | 1958 | ||
| 1898 | /* If interrupts were disabled, don't dive into RCU core. */ | 1959 | /* Go handle any RCU core processing required. */ |
| 1899 | if (irqs_disabled_flags(flags)) { | 1960 | __call_rcu_core(rsp, rdp, head, flags); |
| 1900 | local_irq_restore(flags); | ||
| 1901 | return; | ||
| 1902 | } | ||
| 1903 | |||
| 1904 | /* | ||
| 1905 | * Force the grace period if too many callbacks or too long waiting. | ||
| 1906 | * Enforce hysteresis, and don't invoke force_quiescent_state() | ||
| 1907 | * if some other CPU has recently done so. Also, don't bother | ||
| 1908 | * invoking force_quiescent_state() if the newly enqueued callback | ||
| 1909 | * is the only one waiting for a grace period to complete. | ||
| 1910 | */ | ||
| 1911 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | ||
| 1912 | |||
| 1913 | /* Are we ignoring a completed grace period? */ | ||
| 1914 | rcu_process_gp_end(rsp, rdp); | ||
| 1915 | check_for_new_grace_period(rsp, rdp); | ||
| 1916 | |||
| 1917 | /* Start a new grace period if one not already started. */ | ||
| 1918 | if (!rcu_gp_in_progress(rsp)) { | ||
| 1919 | unsigned long nestflag; | ||
| 1920 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 1921 | |||
| 1922 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | ||
| 1923 | rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ | ||
| 1924 | } else { | ||
| 1925 | /* Give the grace period a kick. */ | ||
| 1926 | rdp->blimit = LONG_MAX; | ||
| 1927 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | ||
| 1928 | *rdp->nxttail[RCU_DONE_TAIL] != head) | ||
| 1929 | force_quiescent_state(rsp, 0); | ||
| 1930 | rdp->n_force_qs_snap = rsp->n_force_qs; | ||
| 1931 | rdp->qlen_last_fqs_check = rdp->qlen; | ||
| 1932 | } | ||
| 1933 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | ||
| 1934 | force_quiescent_state(rsp, 1); | ||
| 1935 | local_irq_restore(flags); | 1961 | local_irq_restore(flags); |
| 1936 | } | 1962 | } |
| 1937 | 1963 | ||
| @@ -1961,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
| 1961 | * occasionally incorrectly indicate that there are multiple CPUs online | 1987 | * occasionally incorrectly indicate that there are multiple CPUs online |
| 1962 | * when there was in fact only one the whole time, as this just adds | 1988 | * when there was in fact only one the whole time, as this just adds |
| 1963 | * some overhead: RCU still operates correctly. | 1989 | * some overhead: RCU still operates correctly. |
| 1964 | * | ||
| 1965 | * Of course, sampling num_online_cpus() with preemption enabled can | ||
| 1966 | * give erroneous results if there are concurrent CPU-hotplug operations. | ||
| 1967 | * For example, given a demonic sequence of preemptions in num_online_cpus() | ||
| 1968 | * and CPU-hotplug operations, there could be two or more CPUs online at | ||
| 1969 | * all times, but num_online_cpus() might well return one (or even zero). | ||
| 1970 | * | ||
| 1971 | * However, all such demonic sequences require at least one CPU-offline | ||
| 1972 | * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer | ||
| 1973 | * is only a problem if there is an RCU read-side critical section executing | ||
| 1974 | * throughout. But RCU-sched and RCU-bh read-side critical sections | ||
| 1975 | * disable either preemption or bh, which prevents a CPU from going offline. | ||
| 1976 | * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return | ||
| 1977 | * that there is only one CPU when in fact there was more than one throughout | ||
| 1978 | * is when there were no RCU readers in the system. If there are no | ||
| 1979 | * RCU readers, the grace period by definition can be of zero length, | ||
| 1980 | * regardless of the number of online CPUs. | ||
| 1981 | */ | 1990 | */ |
| 1982 | static inline int rcu_blocking_is_gp(void) | 1991 | static inline int rcu_blocking_is_gp(void) |
| 1983 | { | 1992 | { |
| 1993 | int ret; | ||
| 1994 | |||
| 1984 | might_sleep(); /* Check for RCU read-side critical section. */ | 1995 | might_sleep(); /* Check for RCU read-side critical section. */ |
| 1985 | return num_online_cpus() <= 1; | 1996 | preempt_disable(); |
| 1997 | ret = num_online_cpus() <= 1; | ||
| 1998 | preempt_enable(); | ||
| 1999 | return ret; | ||
| 1986 | } | 2000 | } |
| 1987 | 2001 | ||
| 1988 | /** | 2002 | /** |
| @@ -2117,9 +2131,9 @@ void synchronize_sched_expedited(void) | |||
| 2117 | put_online_cpus(); | 2131 | put_online_cpus(); |
| 2118 | 2132 | ||
| 2119 | /* No joy, try again later. Or just synchronize_sched(). */ | 2133 | /* No joy, try again later. Or just synchronize_sched(). */ |
| 2120 | if (trycount++ < 10) | 2134 | if (trycount++ < 10) { |
| 2121 | udelay(trycount * num_online_cpus()); | 2135 | udelay(trycount * num_online_cpus()); |
| 2122 | else { | 2136 | } else { |
| 2123 | synchronize_sched(); | 2137 | synchronize_sched(); |
| 2124 | return; | 2138 | return; |
| 2125 | } | 2139 | } |
| @@ -2240,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2240 | */ | 2254 | */ |
| 2241 | static int rcu_pending(int cpu) | 2255 | static int rcu_pending(int cpu) |
| 2242 | { | 2256 | { |
| 2243 | return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || | 2257 | struct rcu_state *rsp; |
| 2244 | __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || | 2258 | |
| 2245 | rcu_preempt_pending(cpu); | 2259 | for_each_rcu_flavor(rsp) |
| 2260 | if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) | ||
| 2261 | return 1; | ||
| 2262 | return 0; | ||
| 2246 | } | 2263 | } |
| 2247 | 2264 | ||
| 2248 | /* | 2265 | /* |
| @@ -2252,20 +2269,41 @@ static int rcu_pending(int cpu) | |||
| 2252 | */ | 2269 | */ |
| 2253 | static int rcu_cpu_has_callbacks(int cpu) | 2270 | static int rcu_cpu_has_callbacks(int cpu) |
| 2254 | { | 2271 | { |
| 2272 | struct rcu_state *rsp; | ||
| 2273 | |||
| 2255 | /* RCU callbacks either ready or pending? */ | 2274 | /* RCU callbacks either ready or pending? */ |
| 2256 | return per_cpu(rcu_sched_data, cpu).nxtlist || | 2275 | for_each_rcu_flavor(rsp) |
| 2257 | per_cpu(rcu_bh_data, cpu).nxtlist || | 2276 | if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) |
| 2258 | rcu_preempt_cpu_has_callbacks(cpu); | 2277 | return 1; |
| 2278 | return 0; | ||
| 2279 | } | ||
| 2280 | |||
| 2281 | /* | ||
| 2282 | * Helper function for _rcu_barrier() tracing. If tracing is disabled, | ||
| 2283 | * the compiler is expected to optimize this away. | ||
| 2284 | */ | ||
| 2285 | static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, | ||
| 2286 | int cpu, unsigned long done) | ||
| 2287 | { | ||
| 2288 | trace_rcu_barrier(rsp->name, s, cpu, | ||
| 2289 | atomic_read(&rsp->barrier_cpu_count), done); | ||
| 2259 | } | 2290 | } |
| 2260 | 2291 | ||
| 2261 | /* | 2292 | /* |
| 2262 | * RCU callback function for _rcu_barrier(). If we are last, wake | 2293 | * RCU callback function for _rcu_barrier(). If we are last, wake |
| 2263 | * up the task executing _rcu_barrier(). | 2294 | * up the task executing _rcu_barrier(). |
| 2264 | */ | 2295 | */ |
| 2265 | static void rcu_barrier_callback(struct rcu_head *notused) | 2296 | static void rcu_barrier_callback(struct rcu_head *rhp) |
| 2266 | { | 2297 | { |
| 2267 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2298 | struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); |
| 2268 | complete(&rcu_barrier_completion); | 2299 | struct rcu_state *rsp = rdp->rsp; |
| 2300 | |||
| 2301 | if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { | ||
| 2302 | _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); | ||
| 2303 | complete(&rsp->barrier_completion); | ||
| 2304 | } else { | ||
| 2305 | _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); | ||
| 2306 | } | ||
| 2269 | } | 2307 | } |
| 2270 | 2308 | ||
| 2271 | /* | 2309 | /* |
| @@ -2273,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused) | |||
| 2273 | */ | 2311 | */ |
| 2274 | static void rcu_barrier_func(void *type) | 2312 | static void rcu_barrier_func(void *type) |
| 2275 | { | 2313 | { |
| 2276 | int cpu = smp_processor_id(); | 2314 | struct rcu_state *rsp = type; |
| 2277 | struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); | 2315 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); |
| 2278 | void (*call_rcu_func)(struct rcu_head *head, | ||
| 2279 | void (*func)(struct rcu_head *head)); | ||
| 2280 | 2316 | ||
| 2281 | atomic_inc(&rcu_barrier_cpu_count); | 2317 | _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); |
| 2282 | call_rcu_func = type; | 2318 | atomic_inc(&rsp->barrier_cpu_count); |
| 2283 | call_rcu_func(head, rcu_barrier_callback); | 2319 | rsp->call(&rdp->barrier_head, rcu_barrier_callback); |
| 2284 | } | 2320 | } |
| 2285 | 2321 | ||
| 2286 | /* | 2322 | /* |
| 2287 | * Orchestrate the specified type of RCU barrier, waiting for all | 2323 | * Orchestrate the specified type of RCU barrier, waiting for all |
| 2288 | * RCU callbacks of the specified type to complete. | 2324 | * RCU callbacks of the specified type to complete. |
| 2289 | */ | 2325 | */ |
| 2290 | static void _rcu_barrier(struct rcu_state *rsp, | 2326 | static void _rcu_barrier(struct rcu_state *rsp) |
| 2291 | void (*call_rcu_func)(struct rcu_head *head, | ||
| 2292 | void (*func)(struct rcu_head *head))) | ||
| 2293 | { | 2327 | { |
| 2294 | int cpu; | 2328 | int cpu; |
| 2295 | unsigned long flags; | 2329 | unsigned long flags; |
| 2296 | struct rcu_data *rdp; | 2330 | struct rcu_data *rdp; |
| 2297 | struct rcu_head rh; | 2331 | struct rcu_data rd; |
| 2332 | unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); | ||
| 2333 | unsigned long snap_done; | ||
| 2298 | 2334 | ||
| 2299 | init_rcu_head_on_stack(&rh); | 2335 | init_rcu_head_on_stack(&rd.barrier_head); |
| 2336 | _rcu_barrier_trace(rsp, "Begin", -1, snap); | ||
| 2300 | 2337 | ||
| 2301 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 2338 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
| 2302 | mutex_lock(&rcu_barrier_mutex); | 2339 | mutex_lock(&rsp->barrier_mutex); |
| 2340 | |||
| 2341 | /* | ||
| 2342 | * Ensure that all prior references, including to ->n_barrier_done, | ||
| 2343 | * are ordered before the _rcu_barrier() machinery. | ||
| 2344 | */ | ||
| 2345 | smp_mb(); /* See above block comment. */ | ||
| 2346 | |||
| 2347 | /* | ||
| 2348 | * Recheck ->n_barrier_done to see if others did our work for us. | ||
| 2349 | * This means checking ->n_barrier_done for an even-to-odd-to-even | ||
| 2350 | * transition. The "if" expression below therefore rounds the old | ||
| 2351 | * value up to the next even number and adds two before comparing. | ||
| 2352 | */ | ||
| 2353 | snap_done = ACCESS_ONCE(rsp->n_barrier_done); | ||
| 2354 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); | ||
| 2355 | if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { | ||
| 2356 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); | ||
| 2357 | smp_mb(); /* caller's subsequent code after above check. */ | ||
| 2358 | mutex_unlock(&rsp->barrier_mutex); | ||
| 2359 | return; | ||
| 2360 | } | ||
| 2303 | 2361 | ||
| 2304 | smp_mb(); /* Prevent any prior operations from leaking in. */ | 2362 | /* |
| 2363 | * Increment ->n_barrier_done to avoid duplicate work. Use | ||
| 2364 | * ACCESS_ONCE() to prevent the compiler from speculating | ||
| 2365 | * the increment to precede the early-exit check. | ||
| 2366 | */ | ||
| 2367 | ACCESS_ONCE(rsp->n_barrier_done)++; | ||
| 2368 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); | ||
| 2369 | _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); | ||
| 2370 | smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ | ||
| 2305 | 2371 | ||
| 2306 | /* | 2372 | /* |
| 2307 | * Initialize the count to one rather than to zero in order to | 2373 | * Initialize the count to one rather than to zero in order to |
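The early-exit logic above relies on ->n_barrier_done being odd while a barrier is in flight and even when idle: a caller may skip its own barrier only if the counter has completed at least one full even-to-odd-to-even cycle since its snapshot, meaning some later barrier fully covered its callbacks. The expression ((snap + 1) & ~0x1) + 2 rounds the snapshot up to the next even value and then requires one whole cycle beyond it. A small sketch of just that arithmetic follows; the kernel compares with ULONG_CMP_GE() to tolerate counter wrap, which this illustration ignores.

```c
#include <stdio.h>

/* Illustration of the even/odd ->n_barrier_done check: the counter is
 * incremented once when a barrier starts (making it odd) and once when it
 * finishes (making it even again).  A later observer may skip its own
 * barrier only if the counter has advanced past a complete cycle that
 * began after its snapshot. */
static int barrier_already_done(unsigned long snap, unsigned long now)
{
	/* Round snap up to even, then require one full start+finish beyond it. */
	return now >= ((snap + 1) & ~0x1UL) + 2;
}

int main(void)
{
	/* Snapshot taken at 4 (idle): safe once the counter reaches 6. */
	printf("%d\n", barrier_already_done(4, 5));	/* 0: still in flight */
	printf("%d\n", barrier_already_done(4, 6));	/* 1: a full barrier completed */
	/* Snapshot taken at 5 (mid-barrier): that barrier may have missed us,
	 * so the next one must also finish, i.e. the counter must reach 8. */
	printf("%d\n", barrier_already_done(5, 7));	/* 0 */
	printf("%d\n", barrier_already_done(5, 8));	/* 1 */
	return 0;
}
```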
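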
| @@ -2320,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
| 2320 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening | 2386 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening |
| 2321 | * us -- but before CPU 1's orphaned callbacks are invoked!!! | 2387 | * us -- but before CPU 1's orphaned callbacks are invoked!!! |
| 2322 | */ | 2388 | */ |
| 2323 | init_completion(&rcu_barrier_completion); | 2389 | init_completion(&rsp->barrier_completion); |
| 2324 | atomic_set(&rcu_barrier_cpu_count, 1); | 2390 | atomic_set(&rsp->barrier_cpu_count, 1); |
| 2325 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 2391 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
| 2326 | rsp->rcu_barrier_in_progress = current; | 2392 | rsp->rcu_barrier_in_progress = current; |
| 2327 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 2393 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
| @@ -2337,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
| 2337 | preempt_disable(); | 2403 | preempt_disable(); |
| 2338 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2404 | rdp = per_cpu_ptr(rsp->rda, cpu); |
| 2339 | if (cpu_is_offline(cpu)) { | 2405 | if (cpu_is_offline(cpu)) { |
| 2406 | _rcu_barrier_trace(rsp, "Offline", cpu, | ||
| 2407 | rsp->n_barrier_done); | ||
| 2340 | preempt_enable(); | 2408 | preempt_enable(); |
| 2341 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) | 2409 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) |
| 2342 | schedule_timeout_interruptible(1); | 2410 | schedule_timeout_interruptible(1); |
| 2343 | } else if (ACCESS_ONCE(rdp->qlen)) { | 2411 | } else if (ACCESS_ONCE(rdp->qlen)) { |
| 2344 | smp_call_function_single(cpu, rcu_barrier_func, | 2412 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
| 2345 | (void *)call_rcu_func, 1); | 2413 | rsp->n_barrier_done); |
| 2414 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); | ||
| 2346 | preempt_enable(); | 2415 | preempt_enable(); |
| 2347 | } else { | 2416 | } else { |
| 2417 | _rcu_barrier_trace(rsp, "OnlineNQ", cpu, | ||
| 2418 | rsp->n_barrier_done); | ||
| 2348 | preempt_enable(); | 2419 | preempt_enable(); |
| 2349 | } | 2420 | } |
| 2350 | } | 2421 | } |
| @@ -2361,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
| 2361 | rcu_adopt_orphan_cbs(rsp); | 2432 | rcu_adopt_orphan_cbs(rsp); |
| 2362 | rsp->rcu_barrier_in_progress = NULL; | 2433 | rsp->rcu_barrier_in_progress = NULL; |
| 2363 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 2434 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
| 2364 | atomic_inc(&rcu_barrier_cpu_count); | 2435 | atomic_inc(&rsp->barrier_cpu_count); |
| 2365 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ | 2436 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ |
| 2366 | call_rcu_func(&rh, rcu_barrier_callback); | 2437 | rd.rsp = rsp; |
| 2438 | rsp->call(&rd.barrier_head, rcu_barrier_callback); | ||
| 2367 | 2439 | ||
| 2368 | /* | 2440 | /* |
| 2369 | * Now that we have an rcu_barrier_callback() callback on each | 2441 | * Now that we have an rcu_barrier_callback() callback on each |
| 2370 | * CPU, and thus each counted, remove the initial count. | 2442 | * CPU, and thus each counted, remove the initial count. |
| 2371 | */ | 2443 | */ |
| 2372 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2444 | if (atomic_dec_and_test(&rsp->barrier_cpu_count)) |
| 2373 | complete(&rcu_barrier_completion); | 2445 | complete(&rsp->barrier_completion); |
| 2446 | |||
| 2447 | /* Increment ->n_barrier_done to prevent duplicate work. */ | ||
| 2448 | smp_mb(); /* Keep increment after above mechanism. */ | ||
| 2449 | ACCESS_ONCE(rsp->n_barrier_done)++; | ||
| 2450 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); | ||
| 2451 | _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); | ||
| 2452 | smp_mb(); /* Keep increment before caller's subsequent code. */ | ||
| 2374 | 2453 | ||
| 2375 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ | 2454 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ |
| 2376 | wait_for_completion(&rcu_barrier_completion); | 2455 | wait_for_completion(&rsp->barrier_completion); |
| 2377 | 2456 | ||
| 2378 | /* Other rcu_barrier() invocations can now safely proceed. */ | 2457 | /* Other rcu_barrier() invocations can now safely proceed. */ |
| 2379 | mutex_unlock(&rcu_barrier_mutex); | 2458 | mutex_unlock(&rsp->barrier_mutex); |
| 2380 | 2459 | ||
| 2381 | destroy_rcu_head_on_stack(&rh); | 2460 | destroy_rcu_head_on_stack(&rd.barrier_head); |
| 2382 | } | 2461 | } |
| 2383 | 2462 | ||
| 2384 | /** | 2463 | /** |
| @@ -2386,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
| 2386 | */ | 2465 | */ |
| 2387 | void rcu_barrier_bh(void) | 2466 | void rcu_barrier_bh(void) |
| 2388 | { | 2467 | { |
| 2389 | _rcu_barrier(&rcu_bh_state, call_rcu_bh); | 2468 | _rcu_barrier(&rcu_bh_state); |
| 2390 | } | 2469 | } |
| 2391 | EXPORT_SYMBOL_GPL(rcu_barrier_bh); | 2470 | EXPORT_SYMBOL_GPL(rcu_barrier_bh); |
| 2392 | 2471 | ||
| @@ -2395,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); | |||
| 2395 | */ | 2474 | */ |
| 2396 | void rcu_barrier_sched(void) | 2475 | void rcu_barrier_sched(void) |
| 2397 | { | 2476 | { |
| 2398 | _rcu_barrier(&rcu_sched_state, call_rcu_sched); | 2477 | _rcu_barrier(&rcu_sched_state); |
| 2399 | } | 2478 | } |
| 2400 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | 2479 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); |
| 2401 | 2480 | ||
| @@ -2406,18 +2485,15 @@ static void __init | |||
| 2406 | rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | 2485 | rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) |
| 2407 | { | 2486 | { |
| 2408 | unsigned long flags; | 2487 | unsigned long flags; |
| 2409 | int i; | ||
| 2410 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 2488 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 2411 | struct rcu_node *rnp = rcu_get_root(rsp); | 2489 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 2412 | 2490 | ||
| 2413 | /* Set up local state, ensuring consistent view of global state. */ | 2491 | /* Set up local state, ensuring consistent view of global state. */ |
| 2414 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2492 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 2415 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); | 2493 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); |
| 2416 | rdp->nxtlist = NULL; | 2494 | init_callback_list(rdp); |
| 2417 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
| 2418 | rdp->nxttail[i] = &rdp->nxtlist; | ||
| 2419 | rdp->qlen_lazy = 0; | 2495 | rdp->qlen_lazy = 0; |
| 2420 | rdp->qlen = 0; | 2496 | ACCESS_ONCE(rdp->qlen) = 0; |
| 2421 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 2497 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
| 2422 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); | 2498 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); |
| 2423 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | 2499 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); |
| @@ -2491,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
| 2491 | 2567 | ||
| 2492 | static void __cpuinit rcu_prepare_cpu(int cpu) | 2568 | static void __cpuinit rcu_prepare_cpu(int cpu) |
| 2493 | { | 2569 | { |
| 2494 | rcu_init_percpu_data(cpu, &rcu_sched_state, 0); | 2570 | struct rcu_state *rsp; |
| 2495 | rcu_init_percpu_data(cpu, &rcu_bh_state, 0); | 2571 | |
| 2496 | rcu_preempt_init_percpu_data(cpu); | 2572 | for_each_rcu_flavor(rsp) |
| 2573 | rcu_init_percpu_data(cpu, rsp, | ||
| 2574 | strcmp(rsp->name, "rcu_preempt") == 0); | ||
| 2497 | } | 2575 | } |
| 2498 | 2576 | ||
| 2499 | /* | 2577 | /* |
| @@ -2505,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
| 2505 | long cpu = (long)hcpu; | 2583 | long cpu = (long)hcpu; |
| 2506 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 2584 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); |
| 2507 | struct rcu_node *rnp = rdp->mynode; | 2585 | struct rcu_node *rnp = rdp->mynode; |
| 2586 | struct rcu_state *rsp; | ||
| 2508 | 2587 | ||
| 2509 | trace_rcu_utilization("Start CPU hotplug"); | 2588 | trace_rcu_utilization("Start CPU hotplug"); |
| 2510 | switch (action) { | 2589 | switch (action) { |
| @@ -2529,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
| 2529 | * touch any data without introducing corruption. We send the | 2608 | * touch any data without introducing corruption. We send the |
| 2530 | * dying CPU's callbacks to an arbitrarily chosen online CPU. | 2609 | * dying CPU's callbacks to an arbitrarily chosen online CPU. |
| 2531 | */ | 2610 | */ |
| 2532 | rcu_cleanup_dying_cpu(&rcu_bh_state); | 2611 | for_each_rcu_flavor(rsp) |
| 2533 | rcu_cleanup_dying_cpu(&rcu_sched_state); | 2612 | rcu_cleanup_dying_cpu(rsp); |
| 2534 | rcu_preempt_cleanup_dying_cpu(); | ||
| 2535 | rcu_cleanup_after_idle(cpu); | 2613 | rcu_cleanup_after_idle(cpu); |
| 2536 | break; | 2614 | break; |
| 2537 | case CPU_DEAD: | 2615 | case CPU_DEAD: |
| 2538 | case CPU_DEAD_FROZEN: | 2616 | case CPU_DEAD_FROZEN: |
| 2539 | case CPU_UP_CANCELED: | 2617 | case CPU_UP_CANCELED: |
| 2540 | case CPU_UP_CANCELED_FROZEN: | 2618 | case CPU_UP_CANCELED_FROZEN: |
| 2541 | rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); | 2619 | for_each_rcu_flavor(rsp) |
| 2542 | rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); | 2620 | rcu_cleanup_dead_cpu(cpu, rsp); |
| 2543 | rcu_preempt_cleanup_dead_cpu(cpu); | ||
| 2544 | break; | 2621 | break; |
| 2545 | default: | 2622 | default: |
| 2546 | break; | 2623 | break; |
| @@ -2573,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
| 2573 | { | 2650 | { |
| 2574 | int i; | 2651 | int i; |
| 2575 | 2652 | ||
| 2576 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) | 2653 | for (i = rcu_num_lvls - 1; i > 0; i--) |
| 2577 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 2654 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
| 2578 | rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; | 2655 | rsp->levelspread[0] = rcu_fanout_leaf; |
| 2579 | } | 2656 | } |
| 2580 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 2657 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
| 2581 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 2658 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
| @@ -2585,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
| 2585 | int i; | 2662 | int i; |
| 2586 | 2663 | ||
| 2587 | cprv = NR_CPUS; | 2664 | cprv = NR_CPUS; |
| 2588 | for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { | 2665 | for (i = rcu_num_lvls - 1; i >= 0; i--) { |
| 2589 | ccur = rsp->levelcnt[i]; | 2666 | ccur = rsp->levelcnt[i]; |
| 2590 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | 2667 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; |
| 2591 | cprv = ccur; | 2668 | cprv = ccur; |
| @@ -2612,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 2612 | 2689 | ||
| 2613 | /* Initialize the level-tracking arrays. */ | 2690 | /* Initialize the level-tracking arrays. */ |
| 2614 | 2691 | ||
| 2615 | for (i = 1; i < NUM_RCU_LVLS; i++) | 2692 | for (i = 0; i < rcu_num_lvls; i++) |
| 2693 | rsp->levelcnt[i] = num_rcu_lvl[i]; | ||
| 2694 | for (i = 1; i < rcu_num_lvls; i++) | ||
| 2616 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; | 2695 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; |
| 2617 | rcu_init_levelspread(rsp); | 2696 | rcu_init_levelspread(rsp); |
| 2618 | 2697 | ||
| 2619 | /* Initialize the elements themselves, starting from the leaves. */ | 2698 | /* Initialize the elements themselves, starting from the leaves. */ |
| 2620 | 2699 | ||
| 2621 | for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { | 2700 | for (i = rcu_num_lvls - 1; i >= 0; i--) { |
| 2622 | cpustride *= rsp->levelspread[i]; | 2701 | cpustride *= rsp->levelspread[i]; |
| 2623 | rnp = rsp->level[i]; | 2702 | rnp = rsp->level[i]; |
| 2624 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { | 2703 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { |
| @@ -2648,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 2648 | } | 2727 | } |
| 2649 | 2728 | ||
| 2650 | rsp->rda = rda; | 2729 | rsp->rda = rda; |
| 2651 | rnp = rsp->level[NUM_RCU_LVLS - 1]; | 2730 | rnp = rsp->level[rcu_num_lvls - 1]; |
| 2652 | for_each_possible_cpu(i) { | 2731 | for_each_possible_cpu(i) { |
| 2653 | while (i > rnp->grphi) | 2732 | while (i > rnp->grphi) |
| 2654 | rnp++; | 2733 | rnp++; |
| 2655 | per_cpu_ptr(rsp->rda, i)->mynode = rnp; | 2734 | per_cpu_ptr(rsp->rda, i)->mynode = rnp; |
| 2656 | rcu_boot_init_percpu_data(i, rsp); | 2735 | rcu_boot_init_percpu_data(i, rsp); |
| 2657 | } | 2736 | } |
| 2737 | list_add(&rsp->flavors, &rcu_struct_flavors); | ||
| 2738 | } | ||
| 2739 | |||
| 2740 | /* | ||
| 2741 | * Compute the rcu_node tree geometry from kernel parameters. This cannot | ||
| 2742 | * replace the definitions in rcutree.h because those are needed to size | ||
| 2743 | * the ->node array in the rcu_state structure. | ||
| 2744 | */ | ||
| 2745 | static void __init rcu_init_geometry(void) | ||
| 2746 | { | ||
| 2747 | int i; | ||
| 2748 | int j; | ||
| 2749 | int n = nr_cpu_ids; | ||
| 2750 | int rcu_capacity[MAX_RCU_LVLS + 1]; | ||
| 2751 | |||
| 2752 | /* If the compile-time values are accurate, just leave. */ | ||
| 2753 | if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) | ||
| 2754 | return; | ||
| 2755 | |||
| 2756 | /* | ||
| 2757 | * Compute the number of nodes that can be handled by an rcu_node tree | ||
| 2758 | * with the given number of levels. Setting rcu_capacity[0] makes | ||
| 2759 | * some of the arithmetic easier. | ||
| 2760 | */ | ||
| 2761 | rcu_capacity[0] = 1; | ||
| 2762 | rcu_capacity[1] = rcu_fanout_leaf; | ||
| 2763 | for (i = 2; i <= MAX_RCU_LVLS; i++) | ||
| 2764 | rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; | ||
| 2765 | |||
| 2766 | /* | ||
| 2767 | * The boot-time rcu_fanout_leaf parameter is only permitted | ||
| 2768 | * to increase the leaf-level fanout, not decrease it. Of course, | ||
| 2769 | * the leaf-level fanout cannot exceed the number of bits in | ||
| 2770 | * the rcu_node masks. Finally, the tree must be able to accommodate | ||
| 2771 | * the configured number of CPUs. Complain and fall back to the | ||
| 2772 | * compile-time values if these limits are exceeded. | ||
| 2773 | */ | ||
| 2774 | if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || | ||
| 2775 | rcu_fanout_leaf > sizeof(unsigned long) * 8 || | ||
| 2776 | n > rcu_capacity[MAX_RCU_LVLS]) { | ||
| 2777 | WARN_ON(1); | ||
| 2778 | return; | ||
| 2779 | } | ||
| 2780 | |||
| 2781 | /* Calculate the number of rcu_nodes at each level of the tree. */ | ||
| 2782 | for (i = 1; i <= MAX_RCU_LVLS; i++) | ||
| 2783 | if (n <= rcu_capacity[i]) { | ||
| 2784 | for (j = 0; j <= i; j++) | ||
| 2785 | num_rcu_lvl[j] = | ||
| 2786 | DIV_ROUND_UP(n, rcu_capacity[i - j]); | ||
| 2787 | rcu_num_lvls = i; | ||
| 2788 | for (j = i + 1; j <= MAX_RCU_LVLS; j++) | ||
| 2789 | num_rcu_lvl[j] = 0; | ||
| 2790 | break; | ||
| 2791 | } | ||
| 2792 | |||
| 2793 | /* Calculate the total number of rcu_node structures. */ | ||
| 2794 | rcu_num_nodes = 0; | ||
| 2795 | for (i = 0; i <= MAX_RCU_LVLS; i++) | ||
| 2796 | rcu_num_nodes += num_rcu_lvl[i]; | ||
| 2797 | rcu_num_nodes -= n; | ||
| 2658 | } | 2798 | } |
| 2659 | 2799 | ||
| 2660 | void __init rcu_init(void) | 2800 | void __init rcu_init(void) |
| @@ -2662,6 +2802,7 @@ void __init rcu_init(void) | |||
| 2662 | int cpu; | 2802 | int cpu; |
| 2663 | 2803 | ||
| 2664 | rcu_bootup_announce(); | 2804 | rcu_bootup_announce(); |
| 2805 | rcu_init_geometry(); | ||
| 2665 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | 2806 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
| 2666 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 2807 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
| 2667 | __rcu_init_preempt(); | 2808 | __rcu_init_preempt(); |
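rcu_init_geometry() above converts the boot-time rcu_fanout_leaf parameter into per-level rcu_node counts: a one-level tree has capacity equal to the leaf fanout, each additional level multiplies capacity by CONFIG_RCU_FANOUT, and once a deep-enough level is found the per-level counts are DIV_ROUND_UP() of the CPU count by the capacity beneath that level, with the CPUs themselves subtracted from the total. A standalone sketch of that computation with assumed example constants (fanout 16, and leaf fanouts chosen purely for illustration):

```c
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define MAX_LVLS		4	/* mirrors MAX_RCU_LVLS */
#define FANOUT			16	/* stand-in for CONFIG_RCU_FANOUT */

/* Hedged re-creation of the rcu_init_geometry() sizing logic: given nr_cpus
 * and a (possibly boot-adjusted) leaf fanout, compute how many rcu_node
 * structures each level of the tree needs. */
static void compute_geometry(int nr_cpus, int fanout_leaf)
{
	unsigned long capacity[MAX_LVLS + 1];
	int num_lvl[MAX_LVLS + 1] = { 0 };
	int i, j, levels = 0, nodes = 0;

	capacity[0] = 1;
	capacity[1] = fanout_leaf;
	for (i = 2; i <= MAX_LVLS; i++)
		capacity[i] = capacity[i - 1] * FANOUT;

	for (i = 1; i <= MAX_LVLS; i++)
		if ((unsigned long)nr_cpus <= capacity[i]) {
			for (j = 0; j <= i; j++)
				num_lvl[j] = DIV_ROUND_UP(nr_cpus, capacity[i - j]);
			levels = i;
			break;
		}

	for (i = 0; i <= MAX_LVLS; i++)
		nodes += num_lvl[i];
	nodes -= nr_cpus;	/* the CPUs themselves are not rcu_node structures */

	printf("%d CPUs, leaf fanout %d: %d level(s), %d rcu_node(s)\n",
	       nr_cpus, fanout_leaf, levels, nodes);
}

int main(void)
{
	compute_geometry(64, 32);	/* 2 levels: 1 root + 2 leaves = 3 nodes */
	compute_geometry(4096, 64);	/* 3 levels; larger leaf fanout, fewer nodes */
	return 0;
}
```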
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index ea056495783e..4d29169f2124 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
| @@ -42,28 +42,28 @@ | |||
| 42 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | 42 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) |
| 43 | 43 | ||
| 44 | #if NR_CPUS <= RCU_FANOUT_1 | 44 | #if NR_CPUS <= RCU_FANOUT_1 |
| 45 | # define NUM_RCU_LVLS 1 | 45 | # define RCU_NUM_LVLS 1 |
| 46 | # define NUM_RCU_LVL_0 1 | 46 | # define NUM_RCU_LVL_0 1 |
| 47 | # define NUM_RCU_LVL_1 (NR_CPUS) | 47 | # define NUM_RCU_LVL_1 (NR_CPUS) |
| 48 | # define NUM_RCU_LVL_2 0 | 48 | # define NUM_RCU_LVL_2 0 |
| 49 | # define NUM_RCU_LVL_3 0 | 49 | # define NUM_RCU_LVL_3 0 |
| 50 | # define NUM_RCU_LVL_4 0 | 50 | # define NUM_RCU_LVL_4 0 |
| 51 | #elif NR_CPUS <= RCU_FANOUT_2 | 51 | #elif NR_CPUS <= RCU_FANOUT_2 |
| 52 | # define NUM_RCU_LVLS 2 | 52 | # define RCU_NUM_LVLS 2 |
| 53 | # define NUM_RCU_LVL_0 1 | 53 | # define NUM_RCU_LVL_0 1 |
| 54 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | 54 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
| 55 | # define NUM_RCU_LVL_2 (NR_CPUS) | 55 | # define NUM_RCU_LVL_2 (NR_CPUS) |
| 56 | # define NUM_RCU_LVL_3 0 | 56 | # define NUM_RCU_LVL_3 0 |
| 57 | # define NUM_RCU_LVL_4 0 | 57 | # define NUM_RCU_LVL_4 0 |
| 58 | #elif NR_CPUS <= RCU_FANOUT_3 | 58 | #elif NR_CPUS <= RCU_FANOUT_3 |
| 59 | # define NUM_RCU_LVLS 3 | 59 | # define RCU_NUM_LVLS 3 |
| 60 | # define NUM_RCU_LVL_0 1 | 60 | # define NUM_RCU_LVL_0 1 |
| 61 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | 61 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
| 62 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | 62 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) |
| 63 | # define NUM_RCU_LVL_3 (NR_CPUS) | 63 | # define NUM_RCU_LVL_3 (NR_CPUS) |
| 64 | # define NUM_RCU_LVL_4 0 | 64 | # define NUM_RCU_LVL_4 0 |
| 65 | #elif NR_CPUS <= RCU_FANOUT_4 | 65 | #elif NR_CPUS <= RCU_FANOUT_4 |
| 66 | # define NUM_RCU_LVLS 4 | 66 | # define RCU_NUM_LVLS 4 |
| 67 | # define NUM_RCU_LVL_0 1 | 67 | # define NUM_RCU_LVL_0 1 |
| 68 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) | 68 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) |
| 69 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | 69 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) |
| @@ -76,6 +76,9 @@ | |||
| 76 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) | 76 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) |
| 77 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) | 77 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) |
| 78 | 78 | ||
| 79 | extern int rcu_num_lvls; | ||
| 80 | extern int rcu_num_nodes; | ||
| 81 | |||
| 79 | /* | 82 | /* |
| 80 | * Dynticks per-CPU state. | 83 | * Dynticks per-CPU state. |
| 81 | */ | 84 | */ |
| @@ -97,6 +100,7 @@ struct rcu_dynticks { | |||
| 97 | /* # times non-lazy CBs posted to CPU. */ | 100 | /* # times non-lazy CBs posted to CPU. */ |
| 98 | unsigned long nonlazy_posted_snap; | 101 | unsigned long nonlazy_posted_snap; |
| 99 | /* idle-period nonlazy_posted snapshot. */ | 102 | /* idle-period nonlazy_posted snapshot. */ |
| 103 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ | ||
| 100 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 104 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
| 101 | }; | 105 | }; |
| 102 | 106 | ||
| @@ -206,7 +210,7 @@ struct rcu_node { | |||
| 206 | */ | 210 | */ |
| 207 | #define rcu_for_each_node_breadth_first(rsp, rnp) \ | 211 | #define rcu_for_each_node_breadth_first(rsp, rnp) \ |
| 208 | for ((rnp) = &(rsp)->node[0]; \ | 212 | for ((rnp) = &(rsp)->node[0]; \ |
| 209 | (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) | 213 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) |
| 210 | 214 | ||
| 211 | /* | 215 | /* |
| 212 | * Do a breadth-first scan of the non-leaf rcu_node structures for the | 216 | * Do a breadth-first scan of the non-leaf rcu_node structures for the |
| @@ -215,7 +219,7 @@ struct rcu_node { | |||
| 215 | */ | 219 | */ |
| 216 | #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ | 220 | #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ |
| 217 | for ((rnp) = &(rsp)->node[0]; \ | 221 | for ((rnp) = &(rsp)->node[0]; \ |
| 218 | (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) | 222 | (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) |
| 219 | 223 | ||
| 220 | /* | 224 | /* |
| 221 | * Scan the leaves of the rcu_node hierarchy for the specified rcu_state | 225 | * Scan the leaves of the rcu_node hierarchy for the specified rcu_state |
| @@ -224,8 +228,8 @@ struct rcu_node { | |||
| 224 | * It is still a leaf node, even if it is also the root node. | 228 | * It is still a leaf node, even if it is also the root node. |
| 225 | */ | 229 | */ |
| 226 | #define rcu_for_each_leaf_node(rsp, rnp) \ | 230 | #define rcu_for_each_leaf_node(rsp, rnp) \ |
| 227 | for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ | 231 | for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ |
| 228 | (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) | 232 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) |
| 229 | 233 | ||
| 230 | /* Index values for nxttail array in struct rcu_data. */ | 234 | /* Index values for nxttail array in struct rcu_data. */ |
| 231 | #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ | 235 | #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ |
| @@ -311,6 +315,9 @@ struct rcu_data { | |||
| 311 | unsigned long n_rp_need_fqs; | 315 | unsigned long n_rp_need_fqs; |
| 312 | unsigned long n_rp_need_nothing; | 316 | unsigned long n_rp_need_nothing; |
| 313 | 317 | ||
| 318 | /* 6) _rcu_barrier() callback. */ | ||
| 319 | struct rcu_head barrier_head; | ||
| 320 | |||
| 314 | int cpu; | 321 | int cpu; |
| 315 | struct rcu_state *rsp; | 322 | struct rcu_state *rsp; |
| 316 | }; | 323 | }; |
| @@ -357,10 +364,12 @@ do { \ | |||
| 357 | */ | 364 | */ |
| 358 | struct rcu_state { | 365 | struct rcu_state { |
| 359 | struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ | 366 | struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ |
| 360 | struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ | 367 | struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ |
| 361 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ | 368 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ |
| 362 | u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ | 369 | u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ |
| 363 | struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ | 370 | struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ |
| 371 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | ||
| 372 | void (*func)(struct rcu_head *head)); | ||
| 364 | 373 | ||
| 365 | /* The following fields are guarded by the root rcu_node's lock. */ | 374 | /* The following fields are guarded by the root rcu_node's lock. */ |
| 366 | 375 | ||
| @@ -392,6 +401,11 @@ struct rcu_state { | |||
| 392 | struct task_struct *rcu_barrier_in_progress; | 401 | struct task_struct *rcu_barrier_in_progress; |
| 393 | /* Task doing rcu_barrier(), */ | 402 | /* Task doing rcu_barrier(), */ |
| 394 | /* or NULL if no barrier. */ | 403 | /* or NULL if no barrier. */ |
| 404 | struct mutex barrier_mutex; /* Guards barrier fields. */ | ||
| 405 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | ||
| 406 | struct completion barrier_completion; /* Wake at barrier end. */ | ||
| 407 | unsigned long n_barrier_done; /* ++ at start and end of */ | ||
| 408 | /* _rcu_barrier(). */ | ||
| 395 | raw_spinlock_t fqslock; /* Only one task forcing */ | 409 | raw_spinlock_t fqslock; /* Only one task forcing */ |
| 396 | /* quiescent states. */ | 410 | /* quiescent states. */ |
| 397 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 411 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
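The new barrier fields above carry per-flavor _rcu_barrier() state that previously lived in file-scope globals. A rough sketch of how they cooperate follows; it is deliberately simplified and elides the per-CPU posting step (the real code gets the barrier_head callback onto each CPU's own list), so treat it as the counting scheme only.

```c
/* Each posted barrier_head callback drops the count; the last one
 * wakes the initiator. */
static void rcu_barrier_callback(struct rcu_head *rhp)
{
	struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
	struct rcu_state *rsp = rdp->rsp;

	if (atomic_dec_and_test(&rsp->barrier_cpu_count))
		complete(&rsp->barrier_completion);
}

/* Initiator skeleton: the count starts at 1 (a self-reference) so the
 * completion cannot fire before every callback has been posted. */
static void rcu_barrier_sketch(struct rcu_state *rsp)
{
	mutex_lock(&rsp->barrier_mutex);	/* one barrier at a time */
	rsp->n_barrier_done++;			/* odd while in flight */
	init_completion(&rsp->barrier_completion);
	atomic_set(&rsp->barrier_cpu_count, 1);
	/* ... post rdp->barrier_head via rsp->call() on each CPU that
	 *     has callbacks queued, incrementing barrier_cpu_count ... */
	if (atomic_dec_and_test(&rsp->barrier_cpu_count))
		complete(&rsp->barrier_completion);	/* drop self-ref */
	wait_for_completion(&rsp->barrier_completion);
	rsp->n_barrier_done++;			/* even when idle */
	mutex_unlock(&rsp->barrier_mutex);
}
```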
| @@ -409,8 +423,13 @@ struct rcu_state { | |||
| 409 | unsigned long gp_max; /* Maximum GP duration in */ | 423 | unsigned long gp_max; /* Maximum GP duration in */ |
| 410 | /* jiffies. */ | 424 | /* jiffies. */ |
| 411 | char *name; /* Name of structure. */ | 425 | char *name; /* Name of structure. */ |
| 426 | struct list_head flavors; /* List of RCU flavors. */ | ||
| 412 | }; | 427 | }; |
| 413 | 428 | ||
| 429 | extern struct list_head rcu_struct_flavors; | ||
| 430 | #define for_each_rcu_flavor(rsp) \ | ||
| 431 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) | ||
| 432 | |||
| 414 | /* Return values for rcu_preempt_offline_tasks(). */ | 433 | /* Return values for rcu_preempt_offline_tasks(). */ |
| 415 | 434 | ||
| 416 | #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ | 435 | #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ |
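The flavors list and the for_each_rcu_flavor() iterator added above let common code loop over whichever rcu_state structures the kernel was built with, instead of open-coding rcu_sched, rcu_bh and, conditionally, rcu_preempt. A small usage sketch under those assumptions; the helper is illustrative, and the trace-file conversions later in this patch use the same form.

```c
/* Total callbacks queued across all flavors and CPUs (illustrative). */
static long total_queued_callbacks(void)
{
	struct rcu_state *rsp;
	long total = 0;
	int cpu;

	for_each_rcu_flavor(rsp)
		for_each_possible_cpu(cpu)
			total += per_cpu_ptr(rsp->rda, cpu)->qlen;
	return total;
}
```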
| @@ -444,6 +463,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
| 444 | /* Forward declarations for rcutree_plugin.h */ | 463 | /* Forward declarations for rcutree_plugin.h */ |
| 445 | static void rcu_bootup_announce(void); | 464 | static void rcu_bootup_announce(void); |
| 446 | long rcu_batches_completed(void); | 465 | long rcu_batches_completed(void); |
| 466 | static void rcu_preempt_note_context_switch(int cpu); | ||
| 447 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 467 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
| 448 | #ifdef CONFIG_HOTPLUG_CPU | 468 | #ifdef CONFIG_HOTPLUG_CPU |
| 449 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 469 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
| @@ -452,25 +472,18 @@ static void rcu_stop_cpu_kthread(int cpu); | |||
| 452 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 472 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 453 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 473 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
| 454 | static int rcu_print_task_stall(struct rcu_node *rnp); | 474 | static int rcu_print_task_stall(struct rcu_node *rnp); |
| 455 | static void rcu_preempt_stall_reset(void); | ||
| 456 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 475 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
| 457 | #ifdef CONFIG_HOTPLUG_CPU | 476 | #ifdef CONFIG_HOTPLUG_CPU |
| 458 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | 477 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, |
| 459 | struct rcu_node *rnp, | 478 | struct rcu_node *rnp, |
| 460 | struct rcu_data *rdp); | 479 | struct rcu_data *rdp); |
| 461 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 480 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 462 | static void rcu_preempt_cleanup_dead_cpu(int cpu); | ||
| 463 | static void rcu_preempt_check_callbacks(int cpu); | 481 | static void rcu_preempt_check_callbacks(int cpu); |
| 464 | static void rcu_preempt_process_callbacks(void); | ||
| 465 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 482 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
| 466 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) | 483 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) |
| 467 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | 484 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
| 468 | bool wake); | 485 | bool wake); |
| 469 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ | 486 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ |
| 470 | static int rcu_preempt_pending(int cpu); | ||
| 471 | static int rcu_preempt_cpu_has_callbacks(int cpu); | ||
| 472 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | ||
| 473 | static void rcu_preempt_cleanup_dying_cpu(void); | ||
| 474 | static void __init __rcu_init_preempt(void); | 487 | static void __init __rcu_init_preempt(void); |
| 475 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 488 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
| 476 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 489 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5271a020887e..7f3244c0df01 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
| @@ -68,17 +68,21 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 68 | printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); | 68 | printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); |
| 69 | #endif | 69 | #endif |
| 70 | #if NUM_RCU_LVL_4 != 0 | 70 | #if NUM_RCU_LVL_4 != 0 |
| 71 | printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); | 71 | printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); |
| 72 | #endif | 72 | #endif |
| 73 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) | ||
| 74 | printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | ||
| 75 | if (nr_cpu_ids != NR_CPUS) | ||
| 76 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | ||
| 73 | } | 77 | } |
| 74 | 78 | ||
| 75 | #ifdef CONFIG_TREE_PREEMPT_RCU | 79 | #ifdef CONFIG_TREE_PREEMPT_RCU |
| 76 | 80 | ||
| 77 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); | 81 | struct rcu_state rcu_preempt_state = |
| 82 | RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); | ||
| 78 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | 83 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); |
| 79 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 84 | static struct rcu_state *rcu_state = &rcu_preempt_state; |
| 80 | 85 | ||
| 81 | static void rcu_read_unlock_special(struct task_struct *t); | ||
| 82 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 86 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
| 83 | 87 | ||
| 84 | /* | 88 | /* |
| @@ -153,7 +157,7 @@ static void rcu_preempt_qs(int cpu) | |||
| 153 | * | 157 | * |
| 154 | * Caller must disable preemption. | 158 | * Caller must disable preemption. |
| 155 | */ | 159 | */ |
| 156 | void rcu_preempt_note_context_switch(void) | 160 | static void rcu_preempt_note_context_switch(int cpu) |
| 157 | { | 161 | { |
| 158 | struct task_struct *t = current; | 162 | struct task_struct *t = current; |
| 159 | unsigned long flags; | 163 | unsigned long flags; |
| @@ -164,7 +168,7 @@ void rcu_preempt_note_context_switch(void) | |||
| 164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 168 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
| 165 | 169 | ||
| 166 | /* Possibly blocking in an RCU read-side critical section. */ | 170 | /* Possibly blocking in an RCU read-side critical section. */ |
| 167 | rdp = __this_cpu_ptr(rcu_preempt_state.rda); | 171 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); |
| 168 | rnp = rdp->mynode; | 172 | rnp = rdp->mynode; |
| 169 | raw_spin_lock_irqsave(&rnp->lock, flags); | 173 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 174 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
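Because rcu_preempt_note_context_switch() now receives the CPU number from its caller instead of assuming it runs on that CPU, the lookup switches from the this-CPU accessor to per_cpu_ptr(). A minimal illustration of the difference, using a hypothetical per-CPU variable rather than anything from the patch:

```c
DEFINE_PER_CPU(struct rcu_data, example_rcu_data);	/* hypothetical */

/* Only valid for the CPU we are currently running on: */
struct rcu_data *self = &__get_cpu_var(example_rcu_data);

/* Valid for any CPU named explicitly, as in the hunk above: */
struct rcu_data *other = &per_cpu(example_rcu_data, cpu);
```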
| @@ -228,23 +232,11 @@ void rcu_preempt_note_context_switch(void) | |||
| 228 | * means that we continue to block the current grace period. | 232 | * means that we continue to block the current grace period. |
| 229 | */ | 233 | */ |
| 230 | local_irq_save(flags); | 234 | local_irq_save(flags); |
| 231 | rcu_preempt_qs(smp_processor_id()); | 235 | rcu_preempt_qs(cpu); |
| 232 | local_irq_restore(flags); | 236 | local_irq_restore(flags); |
| 233 | } | 237 | } |
| 234 | 238 | ||
| 235 | /* | 239 | /* |
| 236 | * Tree-preemptible RCU implementation for rcu_read_lock(). | ||
| 237 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | ||
| 238 | * if we block. | ||
| 239 | */ | ||
| 240 | void __rcu_read_lock(void) | ||
| 241 | { | ||
| 242 | current->rcu_read_lock_nesting++; | ||
| 243 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ | ||
| 244 | } | ||
| 245 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
| 246 | |||
| 247 | /* | ||
| 248 | * Check for preempted RCU readers blocking the current grace period | 240 | * Check for preempted RCU readers blocking the current grace period |
| 249 | * for the specified rcu_node structure. If the caller needs a reliable | 241 | * for the specified rcu_node structure. If the caller needs a reliable |
| 250 | * answer, it must hold the rcu_node's ->lock. | 242 | * answer, it must hold the rcu_node's ->lock. |
| @@ -310,7 +302,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, | |||
| 310 | * notify RCU core processing or task having blocked during the RCU | 302 | * notify RCU core processing or task having blocked during the RCU |
| 311 | * read-side critical section. | 303 | * read-side critical section. |
| 312 | */ | 304 | */ |
| 313 | static noinline void rcu_read_unlock_special(struct task_struct *t) | 305 | void rcu_read_unlock_special(struct task_struct *t) |
| 314 | { | 306 | { |
| 315 | int empty; | 307 | int empty; |
| 316 | int empty_exp; | 308 | int empty_exp; |
| @@ -398,8 +390,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
| 398 | rnp->grphi, | 390 | rnp->grphi, |
| 399 | !!rnp->gp_tasks); | 391 | !!rnp->gp_tasks); |
| 400 | rcu_report_unblock_qs_rnp(rnp, flags); | 392 | rcu_report_unblock_qs_rnp(rnp, flags); |
| 401 | } else | 393 | } else { |
| 402 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 394 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 395 | } | ||
| 403 | 396 | ||
| 404 | #ifdef CONFIG_RCU_BOOST | 397 | #ifdef CONFIG_RCU_BOOST |
| 405 | /* Unboost if we were boosted. */ | 398 | /* Unboost if we were boosted. */ |
| @@ -418,38 +411,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
| 418 | } | 411 | } |
| 419 | } | 412 | } |
| 420 | 413 | ||
| 421 | /* | ||
| 422 | * Tree-preemptible RCU implementation for rcu_read_unlock(). | ||
| 423 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | ||
| 424 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | ||
| 425 | * invoke rcu_read_unlock_special() to clean up after a context switch | ||
| 426 | * in an RCU read-side critical section and other special cases. | ||
| 427 | */ | ||
| 428 | void __rcu_read_unlock(void) | ||
| 429 | { | ||
| 430 | struct task_struct *t = current; | ||
| 431 | |||
| 432 | if (t->rcu_read_lock_nesting != 1) | ||
| 433 | --t->rcu_read_lock_nesting; | ||
| 434 | else { | ||
| 435 | barrier(); /* critical section before exit code. */ | ||
| 436 | t->rcu_read_lock_nesting = INT_MIN; | ||
| 437 | barrier(); /* assign before ->rcu_read_unlock_special load */ | ||
| 438 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
| 439 | rcu_read_unlock_special(t); | ||
| 440 | barrier(); /* ->rcu_read_unlock_special load before assign */ | ||
| 441 | t->rcu_read_lock_nesting = 0; | ||
| 442 | } | ||
| 443 | #ifdef CONFIG_PROVE_LOCKING | ||
| 444 | { | ||
| 445 | int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
| 446 | |||
| 447 | WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); | ||
| 448 | } | ||
| 449 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
| 450 | } | ||
| 451 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
| 452 | |||
| 453 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | 414 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE |
| 454 | 415 | ||
| 455 | /* | 416 | /* |
| @@ -540,16 +501,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) | |||
| 540 | } | 501 | } |
| 541 | 502 | ||
| 542 | /* | 503 | /* |
| 543 | * Suppress preemptible RCU's CPU stall warnings by pushing the | ||
| 544 | * time of the next stall-warning message comfortably far into the | ||
| 545 | * future. | ||
| 546 | */ | ||
| 547 | static void rcu_preempt_stall_reset(void) | ||
| 548 | { | ||
| 549 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
| 550 | } | ||
| 551 | |||
| 552 | /* | ||
| 553 | * Check that the list of blocked tasks for the newly completed grace | 504 | * Check that the list of blocked tasks for the newly completed grace |
| 554 | * period is in fact empty. It is a serious bug to complete a grace | 505 | * period is in fact empty. It is a serious bug to complete a grace |
| 555 | * period that still has RCU readers blocked! This function must be | 506 | * period that still has RCU readers blocked! This function must be |
| @@ -650,14 +601,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 650 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 601 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 651 | 602 | ||
| 652 | /* | 603 | /* |
| 653 | * Do CPU-offline processing for preemptible RCU. | ||
| 654 | */ | ||
| 655 | static void rcu_preempt_cleanup_dead_cpu(int cpu) | ||
| 656 | { | ||
| 657 | rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state); | ||
| 658 | } | ||
| 659 | |||
| 660 | /* | ||
| 661 | * Check for a quiescent state from the current CPU. When a task blocks, | 604 | * Check for a quiescent state from the current CPU. When a task blocks, |
| 662 | * the task is recorded in the corresponding CPU's rcu_node structure, | 605 | * the task is recorded in the corresponding CPU's rcu_node structure, |
| 663 | * which is checked elsewhere. | 606 | * which is checked elsewhere. |
| @@ -677,15 +620,6 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
| 677 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | 620 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; |
| 678 | } | 621 | } |
| 679 | 622 | ||
| 680 | /* | ||
| 681 | * Process callbacks for preemptible RCU. | ||
| 682 | */ | ||
| 683 | static void rcu_preempt_process_callbacks(void) | ||
| 684 | { | ||
| 685 | __rcu_process_callbacks(&rcu_preempt_state, | ||
| 686 | &__get_cpu_var(rcu_preempt_data)); | ||
| 687 | } | ||
| 688 | |||
| 689 | #ifdef CONFIG_RCU_BOOST | 623 | #ifdef CONFIG_RCU_BOOST |
| 690 | 624 | ||
| 691 | static void rcu_preempt_do_callbacks(void) | 625 | static void rcu_preempt_do_callbacks(void) |
| @@ -824,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 824 | int must_wait = 0; | 758 | int must_wait = 0; |
| 825 | 759 | ||
| 826 | raw_spin_lock_irqsave(&rnp->lock, flags); | 760 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 827 | if (list_empty(&rnp->blkd_tasks)) | 761 | if (list_empty(&rnp->blkd_tasks)) { |
| 828 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 762 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 829 | else { | 763 | } else { |
| 830 | rnp->exp_tasks = rnp->blkd_tasks.next; | 764 | rnp->exp_tasks = rnp->blkd_tasks.next; |
| 831 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | 765 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ |
| 832 | must_wait = 1; | 766 | must_wait = 1; |
| @@ -870,9 +804,9 @@ void synchronize_rcu_expedited(void) | |||
| 870 | * expedited grace period for us, just leave. | 804 | * expedited grace period for us, just leave. |
| 871 | */ | 805 | */ |
| 872 | while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { | 806 | while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { |
| 873 | if (trycount++ < 10) | 807 | if (trycount++ < 10) { |
| 874 | udelay(trycount * num_online_cpus()); | 808 | udelay(trycount * num_online_cpus()); |
| 875 | else { | 809 | } else { |
| 876 | synchronize_rcu(); | 810 | synchronize_rcu(); |
| 877 | return; | 811 | return; |
| 878 | } | 812 | } |
| @@ -917,51 +851,16 @@ mb_ret: | |||
| 917 | } | 851 | } |
| 918 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 852 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
| 919 | 853 | ||
| 920 | /* | ||
| 921 | * Check to see if there is any immediate preemptible-RCU-related work | ||
| 922 | * to be done. | ||
| 923 | */ | ||
| 924 | static int rcu_preempt_pending(int cpu) | ||
| 925 | { | ||
| 926 | return __rcu_pending(&rcu_preempt_state, | ||
| 927 | &per_cpu(rcu_preempt_data, cpu)); | ||
| 928 | } | ||
| 929 | |||
| 930 | /* | ||
| 931 | * Does preemptible RCU have callbacks on this CPU? | ||
| 932 | */ | ||
| 933 | static int rcu_preempt_cpu_has_callbacks(int cpu) | ||
| 934 | { | ||
| 935 | return !!per_cpu(rcu_preempt_data, cpu).nxtlist; | ||
| 936 | } | ||
| 937 | |||
| 938 | /** | 854 | /** |
| 939 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. | 855 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. |
| 940 | */ | 856 | */ |
| 941 | void rcu_barrier(void) | 857 | void rcu_barrier(void) |
| 942 | { | 858 | { |
| 943 | _rcu_barrier(&rcu_preempt_state, call_rcu); | 859 | _rcu_barrier(&rcu_preempt_state); |
| 944 | } | 860 | } |
| 945 | EXPORT_SYMBOL_GPL(rcu_barrier); | 861 | EXPORT_SYMBOL_GPL(rcu_barrier); |
| 946 | 862 | ||
| 947 | /* | 863 | /* |
| 948 | * Initialize preemptible RCU's per-CPU data. | ||
| 949 | */ | ||
| 950 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | ||
| 951 | { | ||
| 952 | rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); | ||
| 953 | } | ||
| 954 | |||
| 955 | /* | ||
| 956 | * Move preemptible RCU's callbacks from dying CPU to other online CPU | ||
| 957 | * and record a quiescent state. | ||
| 958 | */ | ||
| 959 | static void rcu_preempt_cleanup_dying_cpu(void) | ||
| 960 | { | ||
| 961 | rcu_cleanup_dying_cpu(&rcu_preempt_state); | ||
| 962 | } | ||
| 963 | |||
| 964 | /* | ||
| 965 | * Initialize preemptible RCU's state structures. | 864 | * Initialize preemptible RCU's state structures. |
| 966 | */ | 865 | */ |
| 967 | static void __init __rcu_init_preempt(void) | 866 | static void __init __rcu_init_preempt(void) |
| @@ -1002,6 +901,14 @@ void rcu_force_quiescent_state(void) | |||
| 1002 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 901 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
| 1003 | 902 | ||
| 1004 | /* | 903 | /* |
| 904 | * Because preemptible RCU does not exist, we never have to check for | ||
| 905 | * CPUs being in quiescent states. | ||
| 906 | */ | ||
| 907 | static void rcu_preempt_note_context_switch(int cpu) | ||
| 908 | { | ||
| 909 | } | ||
| 910 | |||
| 911 | /* | ||
| 1005 | * Because preemptible RCU does not exist, there are never any preempted | 912 | * Because preemptible RCU does not exist, there are never any preempted |
| 1006 | * RCU readers. | 913 | * RCU readers. |
| 1007 | */ | 914 | */ |
| @@ -1038,14 +945,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) | |||
| 1038 | } | 945 | } |
| 1039 | 946 | ||
| 1040 | /* | 947 | /* |
| 1041 | * Because preemptible RCU does not exist, there is no need to suppress | ||
| 1042 | * its CPU stall warnings. | ||
| 1043 | */ | ||
| 1044 | static void rcu_preempt_stall_reset(void) | ||
| 1045 | { | ||
| 1046 | } | ||
| 1047 | |||
| 1048 | /* | ||
| 1049 | * Because there is no preemptible RCU, there can be no readers blocked, | 948 | * Because there is no preemptible RCU, there can be no readers blocked, |
| 1050 | * so there is no need to check for blocked tasks. So check only for | 949 | * so there is no need to check for blocked tasks. So check only for |
| 1051 | * bogus qsmask values. | 950 | * bogus qsmask values. |
| @@ -1073,14 +972,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 1073 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 972 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 1074 | 973 | ||
| 1075 | /* | 974 | /* |
| 1076 | * Because preemptible RCU does not exist, it never needs CPU-offline | ||
| 1077 | * processing. | ||
| 1078 | */ | ||
| 1079 | static void rcu_preempt_cleanup_dead_cpu(int cpu) | ||
| 1080 | { | ||
| 1081 | } | ||
| 1082 | |||
| 1083 | /* | ||
| 1084 | * Because preemptible RCU does not exist, it never has any callbacks | 975 | * Because preemptible RCU does not exist, it never has any callbacks |
| 1085 | * to check. | 976 | * to check. |
| 1086 | */ | 977 | */ |
| @@ -1089,14 +980,6 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
| 1089 | } | 980 | } |
| 1090 | 981 | ||
| 1091 | /* | 982 | /* |
| 1092 | * Because preemptible RCU does not exist, it never has any callbacks | ||
| 1093 | * to process. | ||
| 1094 | */ | ||
| 1095 | static void rcu_preempt_process_callbacks(void) | ||
| 1096 | { | ||
| 1097 | } | ||
| 1098 | |||
| 1099 | /* | ||
| 1100 | * Queue an RCU callback for lazy invocation after a grace period. | 983 | * Queue an RCU callback for lazy invocation after a grace period. |
| 1101 | * This will likely be later named something like "call_rcu_lazy()", | 984 | * This will likely be later named something like "call_rcu_lazy()", |
| 1102 | * but this change will require some way of tagging the lazy RCU | 985 | * but this change will require some way of tagging the lazy RCU |
| @@ -1137,22 +1020,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1137 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 1020 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 1138 | 1021 | ||
| 1139 | /* | 1022 | /* |
| 1140 | * Because preemptible RCU does not exist, it never has any work to do. | ||
| 1141 | */ | ||
| 1142 | static int rcu_preempt_pending(int cpu) | ||
| 1143 | { | ||
| 1144 | return 0; | ||
| 1145 | } | ||
| 1146 | |||
| 1147 | /* | ||
| 1148 | * Because preemptible RCU does not exist, it never has callbacks | ||
| 1149 | */ | ||
| 1150 | static int rcu_preempt_cpu_has_callbacks(int cpu) | ||
| 1151 | { | ||
| 1152 | return 0; | ||
| 1153 | } | ||
| 1154 | |||
| 1155 | /* | ||
| 1156 | * Because preemptible RCU does not exist, rcu_barrier() is just | 1023 | * Because preemptible RCU does not exist, rcu_barrier() is just |
| 1157 | * another name for rcu_barrier_sched(). | 1024 | * another name for rcu_barrier_sched(). |
| 1158 | */ | 1025 | */ |
| @@ -1163,21 +1030,6 @@ void rcu_barrier(void) | |||
| 1163 | EXPORT_SYMBOL_GPL(rcu_barrier); | 1030 | EXPORT_SYMBOL_GPL(rcu_barrier); |
| 1164 | 1031 | ||
| 1165 | /* | 1032 | /* |
| 1166 | * Because preemptible RCU does not exist, there is no per-CPU | ||
| 1167 | * data to initialize. | ||
| 1168 | */ | ||
| 1169 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | ||
| 1170 | { | ||
| 1171 | } | ||
| 1172 | |||
| 1173 | /* | ||
| 1174 | * Because there is no preemptible RCU, there is no cleanup to do. | ||
| 1175 | */ | ||
| 1176 | static void rcu_preempt_cleanup_dying_cpu(void) | ||
| 1177 | { | ||
| 1178 | } | ||
| 1179 | |||
| 1180 | /* | ||
| 1181 | * Because preemptible RCU does not exist, it need not be initialized. | 1033 | * Because preemptible RCU does not exist, it need not be initialized. |
| 1182 | */ | 1034 | */ |
| 1183 | static void __init __rcu_init_preempt(void) | 1035 | static void __init __rcu_init_preempt(void) |
| @@ -1960,9 +1812,11 @@ static void rcu_idle_count_callbacks_posted(void) | |||
| 1960 | */ | 1812 | */ |
| 1961 | #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ | 1813 | #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ |
| 1962 | #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ | 1814 | #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ |
| 1963 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | 1815 | #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ |
| 1964 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1816 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
| 1965 | 1817 | ||
| 1818 | extern int tick_nohz_enabled; | ||
| 1819 | |||
| 1966 | /* | 1820 | /* |
| 1967 | * Does the specified flavor of RCU have non-lazy callbacks pending on | 1821 | * Does the specified flavor of RCU have non-lazy callbacks pending on |
| 1968 | * the specified CPU? Both RCU flavor and CPU are specified by the | 1822 | * the specified CPU? Both RCU flavor and CPU are specified by the |
| @@ -2039,10 +1893,13 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | |||
| 2039 | return 1; | 1893 | return 1; |
| 2040 | } | 1894 | } |
| 2041 | /* Set up for the possibility that RCU will post a timer. */ | 1895 | /* Set up for the possibility that RCU will post a timer. */ |
| 2042 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | 1896 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { |
| 2043 | *delta_jiffies = RCU_IDLE_GP_DELAY; | 1897 | *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, |
| 2044 | else | 1898 | RCU_IDLE_GP_DELAY) - jiffies; |
| 2045 | *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; | 1899 | } else { |
| 1900 | *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; | ||
| 1901 | *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; | ||
| 1902 | } | ||
| 2046 | return 0; | 1903 | return 0; |
| 2047 | } | 1904 | } |
| 2048 | 1905 | ||
| @@ -2101,6 +1958,7 @@ static void rcu_cleanup_after_idle(int cpu) | |||
| 2101 | 1958 | ||
| 2102 | del_timer(&rdtp->idle_gp_timer); | 1959 | del_timer(&rdtp->idle_gp_timer); |
| 2103 | trace_rcu_prep_idle("Cleanup after idle"); | 1960 | trace_rcu_prep_idle("Cleanup after idle"); |
| 1961 | rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); | ||
| 2104 | } | 1962 | } |
| 2105 | 1963 | ||
| 2106 | /* | 1964 | /* |
| @@ -2126,6 +1984,18 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 2126 | { | 1984 | { |
| 2127 | struct timer_list *tp; | 1985 | struct timer_list *tp; |
| 2128 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1986 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
| 1987 | int tne; | ||
| 1988 | |||
| 1989 | /* Handle nohz enablement switches conservatively. */ | ||
| 1990 | tne = ACCESS_ONCE(tick_nohz_enabled); | ||
| 1991 | if (tne != rdtp->tick_nohz_enabled_snap) { | ||
| 1992 | if (rcu_cpu_has_callbacks(cpu)) | ||
| 1993 | invoke_rcu_core(); /* force nohz to see update. */ | ||
| 1994 | rdtp->tick_nohz_enabled_snap = tne; | ||
| 1995 | return; | ||
| 1996 | } | ||
| 1997 | if (!tne) | ||
| 1998 | return; | ||
| 2129 | 1999 | ||
| 2130 | /* | 2000 | /* |
| 2131 | * If this is an idle re-entry, for example, due to use of | 2001 | * If this is an idle re-entry, for example, due to use of |
| @@ -2179,10 +2049,11 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 2179 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { | 2049 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { |
| 2180 | trace_rcu_prep_idle("Dyntick with callbacks"); | 2050 | trace_rcu_prep_idle("Dyntick with callbacks"); |
| 2181 | rdtp->idle_gp_timer_expires = | 2051 | rdtp->idle_gp_timer_expires = |
| 2182 | jiffies + RCU_IDLE_GP_DELAY; | 2052 | round_up(jiffies + RCU_IDLE_GP_DELAY, |
| 2053 | RCU_IDLE_GP_DELAY); | ||
| 2183 | } else { | 2054 | } else { |
| 2184 | rdtp->idle_gp_timer_expires = | 2055 | rdtp->idle_gp_timer_expires = |
| 2185 | jiffies + RCU_IDLE_LAZY_GP_DELAY; | 2056 | round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); |
| 2186 | trace_rcu_prep_idle("Dyntick with lazy callbacks"); | 2057 | trace_rcu_prep_idle("Dyntick with lazy callbacks"); |
| 2187 | } | 2058 | } |
| 2188 | tp = &rdtp->idle_gp_timer; | 2059 | tp = &rdtp->idle_gp_timer; |
| @@ -2223,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 2223 | if (rcu_cpu_has_callbacks(cpu)) { | 2094 | if (rcu_cpu_has_callbacks(cpu)) { |
| 2224 | trace_rcu_prep_idle("More callbacks"); | 2095 | trace_rcu_prep_idle("More callbacks"); |
| 2225 | invoke_rcu_core(); | 2096 | invoke_rcu_core(); |
| 2226 | } else | 2097 | } else { |
| 2227 | trace_rcu_prep_idle("Callbacks drained"); | 2098 | trace_rcu_prep_idle("Callbacks drained"); |
| 2099 | } | ||
| 2228 | } | 2100 | } |
| 2229 | 2101 | ||
| 2230 | /* | 2102 | /* |
| @@ -2261,6 +2133,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | |||
| 2261 | 2133 | ||
| 2262 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2134 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
| 2263 | { | 2135 | { |
| 2136 | *cp = '\0'; | ||
| 2264 | } | 2137 | } |
| 2265 | 2138 | ||
| 2266 | #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ | 2139 | #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d4bc16ddd1d4..abffb486e94e 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
| @@ -46,6 +46,31 @@ | |||
| 46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
| 47 | #include "rcutree.h" | 47 | #include "rcutree.h" |
| 48 | 48 | ||
| 49 | static int show_rcubarrier(struct seq_file *m, void *unused) | ||
| 50 | { | ||
| 51 | struct rcu_state *rsp; | ||
| 52 | |||
| 53 | for_each_rcu_flavor(rsp) | ||
| 54 | seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", | ||
| 55 | rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', | ||
| 56 | atomic_read(&rsp->barrier_cpu_count), | ||
| 57 | rsp->n_barrier_done); | ||
| 58 | return 0; | ||
| 59 | } | ||
| 60 | |||
| 61 | static int rcubarrier_open(struct inode *inode, struct file *file) | ||
| 62 | { | ||
| 63 | return single_open(file, show_rcubarrier, NULL); | ||
| 64 | } | ||
| 65 | |||
| 66 | static const struct file_operations rcubarrier_fops = { | ||
| 67 | .owner = THIS_MODULE, | ||
| 68 | .open = rcubarrier_open, | ||
| 69 | .read = seq_read, | ||
| 70 | .llseek = seq_lseek, | ||
| 71 | .release = single_release, | ||
| 72 | }; | ||
| 73 | |||
| 49 | #ifdef CONFIG_RCU_BOOST | 74 | #ifdef CONFIG_RCU_BOOST |
| 50 | 75 | ||
| 51 | static char convert_kthread_status(unsigned int kthread_status) | 76 | static char convert_kthread_status(unsigned int kthread_status) |
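The new show_rcubarrier() handler above prints one line per flavor: the in-progress flag, the outstanding-CPU count and the n_barrier_done counter. Assuming the debugfs directory registered later in this file, the file would read roughly as below on an idle TREE_PREEMPT_RCU kernel; the values and flavor ordering are hypothetical.

```c
/* $ cat /sys/kernel/debug/rcu/rcubarrier	(hypothetical output)
 *
 *	rcu_sched: . bcc: 0 nbd: 2
 *	rcu_bh: . bcc: 0 nbd: 0
 *	rcu_preempt: . bcc: 0 nbd: 4
 *
 * A 'B' instead of '.' would indicate a barrier currently in progress. */
```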
| @@ -95,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
| 95 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | 120 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); |
| 96 | } | 121 | } |
| 97 | 122 | ||
| 98 | #define PRINT_RCU_DATA(name, func, m) \ | ||
| 99 | do { \ | ||
| 100 | int _p_r_d_i; \ | ||
| 101 | \ | ||
| 102 | for_each_possible_cpu(_p_r_d_i) \ | ||
| 103 | func(m, &per_cpu(name, _p_r_d_i)); \ | ||
| 104 | } while (0) | ||
| 105 | |||
| 106 | static int show_rcudata(struct seq_file *m, void *unused) | 123 | static int show_rcudata(struct seq_file *m, void *unused) |
| 107 | { | 124 | { |
| 108 | #ifdef CONFIG_TREE_PREEMPT_RCU | 125 | int cpu; |
| 109 | seq_puts(m, "rcu_preempt:\n"); | 126 | struct rcu_state *rsp; |
| 110 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); | 127 | |
| 111 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 128 | for_each_rcu_flavor(rsp) { |
| 112 | seq_puts(m, "rcu_sched:\n"); | 129 | seq_printf(m, "%s:\n", rsp->name); |
| 113 | PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); | 130 | for_each_possible_cpu(cpu) |
| 114 | seq_puts(m, "rcu_bh:\n"); | 131 | print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); |
| 115 | PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); | 132 | } |
| 116 | return 0; | 133 | return 0; |
| 117 | } | 134 | } |
| 118 | 135 | ||
| @@ -166,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
| 166 | 183 | ||
| 167 | static int show_rcudata_csv(struct seq_file *m, void *unused) | 184 | static int show_rcudata_csv(struct seq_file *m, void *unused) |
| 168 | { | 185 | { |
| 186 | int cpu; | ||
| 187 | struct rcu_state *rsp; | ||
| 188 | |||
| 169 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); | 189 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); |
| 170 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); | 190 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); |
| 171 | seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); | 191 | seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); |
| @@ -173,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) | |||
| 173 | seq_puts(m, "\"kt\",\"ktl\""); | 193 | seq_puts(m, "\"kt\",\"ktl\""); |
| 174 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 194 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 175 | seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); | 195 | seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); |
| 176 | #ifdef CONFIG_TREE_PREEMPT_RCU | 196 | for_each_rcu_flavor(rsp) { |
| 177 | seq_puts(m, "\"rcu_preempt:\"\n"); | 197 | seq_printf(m, "\"%s:\"\n", rsp->name); |
| 178 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); | 198 | for_each_possible_cpu(cpu) |
| 179 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 199 | print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); |
| 180 | seq_puts(m, "\"rcu_sched:\"\n"); | 200 | } |
| 181 | PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); | ||
| 182 | seq_puts(m, "\"rcu_bh:\"\n"); | ||
| 183 | PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); | ||
| 184 | return 0; | 201 | return 0; |
| 185 | } | 202 | } |
| 186 | 203 | ||
| @@ -201,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = { | |||
| 201 | 218 | ||
| 202 | static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) | 219 | static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) |
| 203 | { | 220 | { |
| 204 | seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " | 221 | seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", |
| 205 | "j=%04x bt=%04x\n", | ||
| 206 | rnp->grplo, rnp->grphi, | 222 | rnp->grplo, rnp->grphi, |
| 207 | "T."[list_empty(&rnp->blkd_tasks)], | 223 | "T."[list_empty(&rnp->blkd_tasks)], |
| 208 | "N."[!rnp->gp_tasks], | 224 | "N."[!rnp->gp_tasks], |
| @@ -210,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) | |||
| 210 | "B."[!rnp->boost_tasks], | 226 | "B."[!rnp->boost_tasks], |
| 211 | convert_kthread_status(rnp->boost_kthread_status), | 227 | convert_kthread_status(rnp->boost_kthread_status), |
| 212 | rnp->n_tasks_boosted, rnp->n_exp_boosts, | 228 | rnp->n_tasks_boosted, rnp->n_exp_boosts, |
| 213 | rnp->n_normal_boosts, | 229 | rnp->n_normal_boosts); |
| 230 | seq_printf(m, "j=%04x bt=%04x\n", | ||
| 214 | (int)(jiffies & 0xffff), | 231 | (int)(jiffies & 0xffff), |
| 215 | (int)(rnp->boost_time & 0xffff)); | 232 | (int)(rnp->boost_time & 0xffff)); |
| 216 | seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", | 233 | seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", |
| 217 | " balk", | ||
| 218 | rnp->n_balk_blkd_tasks, | 234 | rnp->n_balk_blkd_tasks, |
| 219 | rnp->n_balk_exp_gp_tasks, | 235 | rnp->n_balk_exp_gp_tasks, |
| 220 | rnp->n_balk_boost_tasks, | 236 | rnp->n_balk_boost_tasks, |
| @@ -270,15 +286,15 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
| 270 | struct rcu_node *rnp; | 286 | struct rcu_node *rnp; |
| 271 | 287 | ||
| 272 | gpnum = rsp->gpnum; | 288 | gpnum = rsp->gpnum; |
| 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 289 | seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", |
| 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | 290 | rsp->name, rsp->completed, gpnum, rsp->fqs_state, |
| 275 | rsp->completed, gpnum, rsp->fqs_state, | ||
| 276 | (long)(rsp->jiffies_force_qs - jiffies), | 291 | (long)(rsp->jiffies_force_qs - jiffies), |
| 277 | (int)(jiffies & 0xffff), | 292 | (int)(jiffies & 0xffff)); |
| 293 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | ||
| 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 294 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
| 279 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 295 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
| 280 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); | 296 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); |
| 281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 297 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { |
| 282 | if (rnp->level != level) { | 298 | if (rnp->level != level) { |
| 283 | seq_puts(m, "\n"); | 299 | seq_puts(m, "\n"); |
| 284 | level = rnp->level; | 300 | level = rnp->level; |
| @@ -295,14 +311,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
| 295 | 311 | ||
| 296 | static int show_rcuhier(struct seq_file *m, void *unused) | 312 | static int show_rcuhier(struct seq_file *m, void *unused) |
| 297 | { | 313 | { |
| 298 | #ifdef CONFIG_TREE_PREEMPT_RCU | 314 | struct rcu_state *rsp; |
| 299 | seq_puts(m, "rcu_preempt:\n"); | 315 | |
| 300 | print_one_rcu_state(m, &rcu_preempt_state); | 316 | for_each_rcu_flavor(rsp) |
| 301 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 317 | print_one_rcu_state(m, rsp); |
| 302 | seq_puts(m, "rcu_sched:\n"); | ||
| 303 | print_one_rcu_state(m, &rcu_sched_state); | ||
| 304 | seq_puts(m, "rcu_bh:\n"); | ||
| 305 | print_one_rcu_state(m, &rcu_bh_state); | ||
| 306 | return 0; | 318 | return 0; |
| 307 | } | 319 | } |
| 308 | 320 | ||
| @@ -343,11 +355,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | |||
| 343 | 355 | ||
| 344 | static int show_rcugp(struct seq_file *m, void *unused) | 356 | static int show_rcugp(struct seq_file *m, void *unused) |
| 345 | { | 357 | { |
| 346 | #ifdef CONFIG_TREE_PREEMPT_RCU | 358 | struct rcu_state *rsp; |
| 347 | show_one_rcugp(m, &rcu_preempt_state); | 359 | |
| 348 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 360 | for_each_rcu_flavor(rsp) |
| 349 | show_one_rcugp(m, &rcu_sched_state); | 361 | show_one_rcugp(m, rsp); |
| 350 | show_one_rcugp(m, &rcu_bh_state); | ||
| 351 | return 0; | 362 | return 0; |
| 352 | } | 363 | } |
| 353 | 364 | ||
| @@ -366,44 +377,36 @@ static const struct file_operations rcugp_fops = { | |||
| 366 | 377 | ||
| 367 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | 378 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) |
| 368 | { | 379 | { |
| 369 | seq_printf(m, "%3d%cnp=%ld " | 380 | seq_printf(m, "%3d%cnp=%ld ", |
| 370 | "qsp=%ld rpq=%ld cbr=%ld cng=%ld " | ||
| 371 | "gpc=%ld gps=%ld nf=%ld nn=%ld\n", | ||
| 372 | rdp->cpu, | 381 | rdp->cpu, |
| 373 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 382 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
| 374 | rdp->n_rcu_pending, | 383 | rdp->n_rcu_pending); |
| 384 | seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", | ||
| 375 | rdp->n_rp_qs_pending, | 385 | rdp->n_rp_qs_pending, |
| 376 | rdp->n_rp_report_qs, | 386 | rdp->n_rp_report_qs, |
| 377 | rdp->n_rp_cb_ready, | 387 | rdp->n_rp_cb_ready, |
| 378 | rdp->n_rp_cpu_needs_gp, | 388 | rdp->n_rp_cpu_needs_gp); |
| 389 | seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", | ||
| 379 | rdp->n_rp_gp_completed, | 390 | rdp->n_rp_gp_completed, |
| 380 | rdp->n_rp_gp_started, | 391 | rdp->n_rp_gp_started, |
| 381 | rdp->n_rp_need_fqs, | 392 | rdp->n_rp_need_fqs, |
| 382 | rdp->n_rp_need_nothing); | 393 | rdp->n_rp_need_nothing); |
| 383 | } | 394 | } |
| 384 | 395 | ||
| 385 | static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) | 396 | static int show_rcu_pending(struct seq_file *m, void *unused) |
| 386 | { | 397 | { |
| 387 | int cpu; | 398 | int cpu; |
| 388 | struct rcu_data *rdp; | 399 | struct rcu_data *rdp; |
| 389 | 400 | struct rcu_state *rsp; | |
| 390 | for_each_possible_cpu(cpu) { | 401 | |
| 391 | rdp = per_cpu_ptr(rsp->rda, cpu); | 402 | for_each_rcu_flavor(rsp) { |
| 392 | if (rdp->beenonline) | 403 | seq_printf(m, "%s:\n", rsp->name); |
| 393 | print_one_rcu_pending(m, rdp); | 404 | for_each_possible_cpu(cpu) { |
| 405 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 406 | if (rdp->beenonline) | ||
| 407 | print_one_rcu_pending(m, rdp); | ||
| 408 | } | ||
| 394 | } | 409 | } |
| 395 | } | ||
| 396 | |||
| 397 | static int show_rcu_pending(struct seq_file *m, void *unused) | ||
| 398 | { | ||
| 399 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
| 400 | seq_puts(m, "rcu_preempt:\n"); | ||
| 401 | print_rcu_pendings(m, &rcu_preempt_state); | ||
| 402 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
| 403 | seq_puts(m, "rcu_sched:\n"); | ||
| 404 | print_rcu_pendings(m, &rcu_sched_state); | ||
| 405 | seq_puts(m, "rcu_bh:\n"); | ||
| 406 | print_rcu_pendings(m, &rcu_bh_state); | ||
| 407 | return 0; | 410 | return 0; |
| 408 | } | 411 | } |
| 409 | 412 | ||
| @@ -453,6 +456,11 @@ static int __init rcutree_trace_init(void) | |||
| 453 | if (!rcudir) | 456 | if (!rcudir) |
| 454 | goto free_out; | 457 | goto free_out; |
| 455 | 458 | ||
| 459 | retval = debugfs_create_file("rcubarrier", 0444, rcudir, | ||
| 460 | NULL, &rcubarrier_fops); | ||
| 461 | if (!retval) | ||
| 462 | goto free_out; | ||
| 463 | |||
| 456 | retval = debugfs_create_file("rcudata", 0444, rcudir, | 464 | retval = debugfs_create_file("rcudata", 0444, rcudir, |
| 457 | NULL, &rcudata_fops); | 465 | NULL, &rcudata_fops); |
| 458 | if (!retval) | 466 | if (!retval) |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..468bdd44c1ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2081 | #endif | 2081 | #endif |
| 2082 | 2082 | ||
| 2083 | /* Here we just switch the register state and the stack. */ | 2083 | /* Here we just switch the register state and the stack. */ |
| 2084 | rcu_switch_from(prev); | ||
| 2085 | switch_to(prev, next, prev); | 2084 | switch_to(prev, next, prev); |
| 2086 | 2085 | ||
| 2087 | barrier(); | 2086 | barrier(); |
| @@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void) | |||
| 2161 | } | 2160 | } |
| 2162 | 2161 | ||
| 2163 | 2162 | ||
| 2163 | /* | ||
| 2164 | * Global load-average calculations | ||
| 2165 | * | ||
| 2166 | * We take a distributed and async approach to calculating the global load-avg | ||
| 2167 | * in order to minimize overhead. | ||
| 2168 | * | ||
| 2169 | * The global load average is an exponentially decaying average of nr_running + | ||
| 2170 | * nr_uninterruptible. | ||
| 2171 | * | ||
| 2172 | * Once every LOAD_FREQ: | ||
| 2173 | * | ||
| 2174 | * nr_active = 0; | ||
| 2175 | * for_each_possible_cpu(cpu) | ||
| 2176 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
| 2177 | * | ||
| 2178 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
| 2179 | * | ||
| 2180 | * Due to a number of reasons the above turns in the mess below: | ||
| 2181 | * | ||
| 2182 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
| 2183 | * serious number of cpus, therefore we need to take a distributed approach | ||
| 2184 | * to calculating nr_active. | ||
| 2185 | * | ||
| 2186 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
| 2187 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
| 2188 | * | ||
| 2189 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
| 2190 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
| 2191 | * to obtain the same result. See calc_load_fold_active(). | ||
| 2192 | * | ||
| 2193 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
| 2194 | * across the machine, we assume 10 ticks is sufficient time for every | ||
| 2195 | * cpu to have completed this task. | ||
| 2196 | * | ||
| 2197 | * This places an upper-bound on the IRQ-off latency of the machine. Then | ||
| 2198 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
| 2199 | * | ||
| 2200 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
| 2201 | * this would add another cross-cpu cacheline miss and atomic operation | ||
| 2202 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
| 2203 | * when it went into uninterruptible state and decrement on whatever cpu | ||
| 2204 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
| 2205 | * all cpus yields the correct result. | ||
| 2206 | * | ||
| 2207 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | ||
| 2208 | */ | ||
| 2209 | |||
| 2164 | /* Variables and functions for calc_load */ | 2210 | /* Variables and functions for calc_load */ |
| 2165 | static atomic_long_t calc_load_tasks; | 2211 | static atomic_long_t calc_load_tasks; |
| 2166 | static unsigned long calc_load_update; | 2212 | static unsigned long calc_load_update; |
| 2167 | unsigned long avenrun[3]; | 2213 | unsigned long avenrun[3]; |
| 2168 | EXPORT_SYMBOL(avenrun); | 2214 | EXPORT_SYMBOL(avenrun); /* should be removed */ |
| 2215 | |||
| 2216 | /** | ||
| 2217 | * get_avenrun - get the load average array | ||
| 2218 | * @loads: pointer to dest load array | ||
| 2219 | * @offset: offset to add | ||
| 2220 | * @shift: shift count to shift the result left | ||
| 2221 | * | ||
| 2222 | * These values are estimates at best, so no need for locking. | ||
| 2223 | */ | ||
| 2224 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
| 2225 | { | ||
| 2226 | loads[0] = (avenrun[0] + offset) << shift; | ||
| 2227 | loads[1] = (avenrun[1] + offset) << shift; | ||
| 2228 | loads[2] = (avenrun[2] + offset) << shift; | ||
| 2229 | } | ||
| 2169 | 2230 | ||
| 2170 | static long calc_load_fold_active(struct rq *this_rq) | 2231 | static long calc_load_fold_active(struct rq *this_rq) |
| 2171 | { | 2232 | { |
| @@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
| 2182 | return delta; | 2243 | return delta; |
| 2183 | } | 2244 | } |
| 2184 | 2245 | ||
| 2246 | /* | ||
| 2247 | * a1 = a0 * e + a * (1 - e) | ||
| 2248 | */ | ||
| 2185 | static unsigned long | 2249 | static unsigned long |
| 2186 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | 2250 | calc_load(unsigned long load, unsigned long exp, unsigned long active) |
| 2187 | { | 2251 | { |
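The one-line comment added above is the whole algorithm: each LOAD_FREQ window blends the previous average with the instantaneous nr_active using fixed-point weights. A worked instance with the kernel's usual constants (FIXED_1 == 2048, EXP_1 == 1884 for the 1-minute average); the helper below only restates the shape of calc_load() for illustration.

```c
/* Previous 1-minute load 0.50 (a0 == 1024) and 2 runnable tasks
 * (active == 2 * FIXED_1 == 4096):
 *
 *	a1 = (1024*1884 + 4096*(2048 - 1884)) >> 11
 *	   = (1929216 + 671744) >> 11  ~= 1270	==> about 0.62
 */
static unsigned long
calc_load_example(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;			/* FSHIFT == 11 */
}
```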
| @@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
| 2193 | 2257 | ||
| 2194 | #ifdef CONFIG_NO_HZ | 2258 | #ifdef CONFIG_NO_HZ |
| 2195 | /* | 2259 | /* |
| 2196 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 2260 | * Handle NO_HZ for the global load-average. |
| 2261 | * | ||
| 2262 | * Since the above described distributed algorithm to compute the global | ||
| 2263 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
| 2264 | * NO_HZ. | ||
| 2265 | * | ||
| 2266 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
| 2267 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
| 2268 | * when we read the global state. | ||
| 2269 | * | ||
| 2270 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
| 2271 | * | ||
| 2272 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
| 2273 | * contribution, causing under-accounting. | ||
| 2274 | * | ||
| 2275 | * We avoid this by keeping two idle-delta counters and flipping them | ||
| 2276 | * when the window starts, thus separating old and new NO_HZ load. | ||
| 2277 | * | ||
| 2278 | * The only trick is the slight shift in index flip for read vs write. | ||
| 2279 | * | ||
| 2280 | * 0s 5s 10s 15s | ||
| 2281 | * +10 +10 +10 +10 | ||
| 2282 | * |-|-----------|-|-----------|-|-----------|-| | ||
| 2283 | * r:0 0 1 1 0 0 1 1 0 | ||
| 2284 | * w:0 1 1 0 0 1 1 0 0 | ||
| 2285 | * | ||
| 2286 | * This ensures we'll fold the old idle contribution in this window while | ||
| 2287 | * accumulating the new one. | ||
| 2288 | * | ||
| 2289 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
| 2290 | * contribution, since we effectively move our sample point to a known | ||
| 2291 | * busy state. | ||
| 2292 | * | ||
| 2293 | * This is solved by pushing the window forward, and thus skipping the | ||
| 2294 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
| 2295 | * was in effect at the time the window opened). This also solves the issue | ||
| 2296 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
| 2297 | * LOAD_FREQ intervals. | ||
| 2197 | * | 2298 | * |
| 2198 | * When making the ILB scale, we should try to pull this in as well. | 2299 | * When making the ILB scale, we should try to pull this in as well. |
| 2199 | */ | 2300 | */ |
| 2200 | static atomic_long_t calc_load_tasks_idle; | 2301 | static atomic_long_t calc_load_idle[2]; |
| 2302 | static int calc_load_idx; | ||
| 2201 | 2303 | ||
| 2202 | void calc_load_account_idle(struct rq *this_rq) | 2304 | static inline int calc_load_write_idx(void) |
| 2203 | { | 2305 | { |
| 2306 | int idx = calc_load_idx; | ||
| 2307 | |||
| 2308 | /* | ||
| 2309 | * See calc_global_nohz(), if we observe the new index, we also | ||
| 2310 | * need to observe the new update time. | ||
| 2311 | */ | ||
| 2312 | smp_rmb(); | ||
| 2313 | |||
| 2314 | /* | ||
| 2315 | * If the folding window started, make sure we start writing in the | ||
| 2316 | * next idle-delta. | ||
| 2317 | */ | ||
| 2318 | if (!time_before(jiffies, calc_load_update)) | ||
| 2319 | idx++; | ||
| 2320 | |||
| 2321 | return idx & 1; | ||
| 2322 | } | ||
| 2323 | |||
| 2324 | static inline int calc_load_read_idx(void) | ||
| 2325 | { | ||
| 2326 | return calc_load_idx & 1; | ||
| 2327 | } | ||
| 2328 | |||
| 2329 | void calc_load_enter_idle(void) | ||
| 2330 | { | ||
| 2331 | struct rq *this_rq = this_rq(); | ||
| 2204 | long delta; | 2332 | long delta; |
| 2205 | 2333 | ||
| 2334 | /* | ||
| 2335 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
| 2336 | * into the pending idle delta. | ||
| 2337 | */ | ||
| 2206 | delta = calc_load_fold_active(this_rq); | 2338 | delta = calc_load_fold_active(this_rq); |
| 2207 | if (delta) | 2339 | if (delta) { |
| 2208 | atomic_long_add(delta, &calc_load_tasks_idle); | 2340 | int idx = calc_load_write_idx(); |
| 2341 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
| 2342 | } | ||
| 2209 | } | 2343 | } |
| 2210 | 2344 | ||
| 2211 | static long calc_load_fold_idle(void) | 2345 | void calc_load_exit_idle(void) |
| 2212 | { | 2346 | { |
| 2213 | long delta = 0; | 2347 | struct rq *this_rq = this_rq(); |
| 2348 | |||
| 2349 | /* | ||
| 2350 | * If we're still before the sample window, we're done. | ||
| 2351 | */ | ||
| 2352 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
| 2353 | return; | ||
| 2214 | 2354 | ||
| 2215 | /* | 2355 | /* |
| 2216 | * Its got a race, we don't care... | 2356 | * We woke inside or after the sample window, this means we're already |
| 2357 | * accounted through the nohz accounting, so skip the entire deal and | ||
| 2358 | * sync up for the next window. | ||
| 2217 | */ | 2359 | */ |
| 2218 | if (atomic_long_read(&calc_load_tasks_idle)) | 2360 | this_rq->calc_load_update = calc_load_update; |
| 2219 | delta = atomic_long_xchg(&calc_load_tasks_idle, 0); | 2361 | if (time_before(jiffies, this_rq->calc_load_update + 10)) |
| 2362 | this_rq->calc_load_update += LOAD_FREQ; | ||
| 2363 | } | ||
| 2364 | |||
| 2365 | static long calc_load_fold_idle(void) | ||
| 2366 | { | ||
| 2367 | int idx = calc_load_read_idx(); | ||
| 2368 | long delta = 0; | ||
| 2369 | |||
| 2370 | if (atomic_long_read(&calc_load_idle[idx])) | ||
| 2371 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
| 2220 | 2372 | ||
| 2221 | return delta; | 2373 | return delta; |
| 2222 | } | 2374 | } |
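The two idle-delta buckets above are flipped by calc_global_nohz() (next hunk) and written by CPUs entering NO_HZ; the smp_wmb()/smp_rmb() pair is what keeps a writer from flipping twice. Below is a condensed view of that pairing, offered as an assumption-level restatement of the code above rather than new logic.

```c
/*   flip side (calc_global_nohz)	writer side (calc_load_write_idx)
 *   -----------------------------	---------------------------------
 *   calc_load_update += n*LOAD_FREQ;	idx = calc_load_idx;
 *   smp_wmb();				smp_rmb();
 *   calc_load_idx++;			if (!time_before(jiffies,
 *						  calc_load_update))
 *						idx++;
 *
 * A writer that observes the incremented index is therefore guaranteed
 * to also observe the advanced calc_load_update, so it will not bump
 * the index a second time and write into the bucket the reader is
 * about to fold with calc_load_fold_idle(). */
```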
| @@ -2302,66 +2454,39 @@ static void calc_global_nohz(void) | |||
| 2302 | { | 2454 | { |
| 2303 | long delta, active, n; | 2455 | long delta, active, n; |
| 2304 | 2456 | ||
| 2305 | /* | 2457 | if (!time_before(jiffies, calc_load_update + 10)) { |
| 2306 | * If we crossed a calc_load_update boundary, make sure to fold | 2458 | /* |
| 2307 | * any pending idle changes, the respective CPUs might have | 2459 | * Catch-up, fold however many we are behind still |
| 2308 | * missed the tick driven calc_load_account_active() update | 2460 | */ |
| 2309 | * due to NO_HZ. | 2461 | delta = jiffies - calc_load_update - 10; |
| 2310 | */ | 2462 | n = 1 + (delta / LOAD_FREQ); |
| 2311 | delta = calc_load_fold_idle(); | ||
| 2312 | if (delta) | ||
| 2313 | atomic_long_add(delta, &calc_load_tasks); | ||
| 2314 | |||
| 2315 | /* | ||
| 2316 | * It could be the one fold was all it took, we done! | ||
| 2317 | */ | ||
| 2318 | if (time_before(jiffies, calc_load_update + 10)) | ||
| 2319 | return; | ||
| 2320 | |||
| 2321 | /* | ||
| 2322 | * Catch-up, fold however many we are behind still | ||
| 2323 | */ | ||
| 2324 | delta = jiffies - calc_load_update - 10; | ||
| 2325 | n = 1 + (delta / LOAD_FREQ); | ||
| 2326 | 2463 | ||
| 2327 | active = atomic_long_read(&calc_load_tasks); | 2464 | active = atomic_long_read(&calc_load_tasks); |
| 2328 | active = active > 0 ? active * FIXED_1 : 0; | 2465 | active = active > 0 ? active * FIXED_1 : 0; |
| 2329 | 2466 | ||
| 2330 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | 2467 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); |
| 2331 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | 2468 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); |
| 2332 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | 2469 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); |
| 2333 | 2470 | ||
| 2334 | calc_load_update += n * LOAD_FREQ; | 2471 | calc_load_update += n * LOAD_FREQ; |
| 2335 | } | 2472 | } |
| 2336 | #else | ||
| 2337 | void calc_load_account_idle(struct rq *this_rq) | ||
| 2338 | { | ||
| 2339 | } | ||
| 2340 | 2473 | ||
| 2341 | static inline long calc_load_fold_idle(void) | 2474 | /* |
| 2342 | { | 2475 | * Flip the idle index... |
| 2343 | return 0; | 2476 | * |
| 2477 | * Make sure we first write the new time then flip the index, so that | ||
| 2478 | * calc_load_write_idx() will see the new time when it reads the new | ||
| 2479 | * index, this avoids a double flip messing things up. | ||
| 2480 | */ | ||
| 2481 | smp_wmb(); | ||
| 2482 | calc_load_idx++; | ||
| 2344 | } | 2483 | } |
| 2484 | #else /* !CONFIG_NO_HZ */ | ||
| 2345 | 2485 | ||
| 2346 | static void calc_global_nohz(void) | 2486 | static inline long calc_load_fold_idle(void) { return 0; } |
| 2347 | { | 2487 | static inline void calc_global_nohz(void) { } |
| 2348 | } | ||
| 2349 | #endif | ||
| 2350 | 2488 | ||
| 2351 | /** | 2489 | #endif /* CONFIG_NO_HZ */ |
| 2352 | * get_avenrun - get the load average array | ||
| 2353 | * @loads: pointer to dest load array | ||
| 2354 | * @offset: offset to add | ||
| 2355 | * @shift: shift count to shift the result left | ||
| 2356 | * | ||
| 2357 | * These values are estimates at best, so no need for locking. | ||
| 2358 | */ | ||
| 2359 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
| 2360 | { | ||
| 2361 | loads[0] = (avenrun[0] + offset) << shift; | ||
| 2362 | loads[1] = (avenrun[1] + offset) << shift; | ||
| 2363 | loads[2] = (avenrun[2] + offset) << shift; | ||
| 2364 | } | ||
| 2365 | 2490 | ||
| 2366 | /* | 2491 | /* |
| 2367 | * calc_load - update the avenrun load estimates 10 ticks after the | 2492 | * calc_load - update the avenrun load estimates 10 ticks after the |
| @@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
| 2369 | */ | 2494 | */ |
| 2370 | void calc_global_load(unsigned long ticks) | 2495 | void calc_global_load(unsigned long ticks) |
| 2371 | { | 2496 | { |
| 2372 | long active; | 2497 | long active, delta; |
| 2373 | 2498 | ||
| 2374 | if (time_before(jiffies, calc_load_update + 10)) | 2499 | if (time_before(jiffies, calc_load_update + 10)) |
| 2375 | return; | 2500 | return; |
| 2376 | 2501 | ||
| 2502 | /* | ||
| 2503 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
| 2504 | */ | ||
| 2505 | delta = calc_load_fold_idle(); | ||
| 2506 | if (delta) | ||
| 2507 | atomic_long_add(delta, &calc_load_tasks); | ||
| 2508 | |||
| 2377 | active = atomic_long_read(&calc_load_tasks); | 2509 | active = atomic_long_read(&calc_load_tasks); |
| 2378 | active = active > 0 ? active * FIXED_1 : 0; | 2510 | active = active > 0 ? active * FIXED_1 : 0; |
| 2379 | 2511 | ||
| @@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks) | |||
| 2384 | calc_load_update += LOAD_FREQ; | 2516 | calc_load_update += LOAD_FREQ; |
| 2385 | 2517 | ||
| 2386 | /* | 2518 | /* |
| 2387 | * Account one period with whatever state we found before | 2519 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. |
| 2388 | * folding in the nohz state and ageing the entire idle period. | ||
| 2389 | * | ||
| 2390 | * This avoids loosing a sample when we go idle between | ||
| 2391 | * calc_load_account_active() (10 ticks ago) and now and thus | ||
| 2392 | * under-accounting. | ||
| 2393 | */ | 2520 | */ |
| 2394 | calc_global_nohz(); | 2521 | calc_global_nohz(); |
| 2395 | } | 2522 | } |
| @@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq) | |||
| 2406 | return; | 2533 | return; |
| 2407 | 2534 | ||
| 2408 | delta = calc_load_fold_active(this_rq); | 2535 | delta = calc_load_fold_active(this_rq); |
| 2409 | delta += calc_load_fold_idle(); | ||
| 2410 | if (delta) | 2536 | if (delta) |
| 2411 | atomic_long_add(delta, &calc_load_tasks); | 2537 | atomic_long_add(delta, &calc_load_tasks); |
| 2412 | 2538 | ||
| @@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq) | |||
| 2414 | } | 2540 | } |
| 2415 | 2541 | ||
| 2416 | /* | 2542 | /* |
| 2543 | * End of global load-average stuff | ||
| 2544 | */ | ||
| 2545 | |||
| 2546 | /* | ||
| 2417 | * The exact cpuload at various idx values, calculated at every tick would be | 2547 | * The exact cpuload at various idx values, calculated at every tick would be |
| 2418 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | 2548 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load |
| 2419 | * | 2549 | * |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d1..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
| @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
| 25 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 25 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
| 26 | { | 26 | { |
| 27 | schedstat_inc(rq, sched_goidle); | 27 | schedstat_inc(rq, sched_goidle); |
| 28 | calc_load_account_idle(rq); | ||
| 29 | return rq->idle; | 28 | return rq->idle; |
| 30 | } | 29 | } |
| 31 | 30 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d52cea7f33d..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void) | |||
| 942 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | 942 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; |
| 943 | } | 943 | } |
| 944 | 944 | ||
| 945 | void calc_load_account_idle(struct rq *this_rq); | ||
| 946 | |||
| 947 | #ifdef CONFIG_SCHED_HRTICK | 945 | #ifdef CONFIG_SCHED_HRTICK |
| 948 | 946 | ||
| 949 | /* | 947 | /* |
diff --git a/kernel/smp.c b/kernel/smp.c index d0ae5b24875e..29dd40a9f2f4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) | |||
| 581 | return 0; | 581 | return 0; |
| 582 | } | 582 | } |
| 583 | EXPORT_SYMBOL(smp_call_function); | 583 | EXPORT_SYMBOL(smp_call_function); |
| 584 | |||
| 585 | void ipi_call_lock(void) | ||
| 586 | { | ||
| 587 | raw_spin_lock(&call_function.lock); | ||
| 588 | } | ||
| 589 | |||
| 590 | void ipi_call_unlock(void) | ||
| 591 | { | ||
| 592 | raw_spin_unlock(&call_function.lock); | ||
| 593 | } | ||
| 594 | |||
| 595 | void ipi_call_lock_irq(void) | ||
| 596 | { | ||
| 597 | raw_spin_lock_irq(&call_function.lock); | ||
| 598 | } | ||
| 599 | |||
| 600 | void ipi_call_unlock_irq(void) | ||
| 601 | { | ||
| 602 | raw_spin_unlock_irq(&call_function.lock); | ||
| 603 | } | ||
| 604 | #endif /* USE_GENERIC_SMP_HELPERS */ | 584 | #endif /* USE_GENERIC_SMP_HELPERS */ |
| 605 | 585 | ||
| 606 | /* Setup configured maximum number of CPUs to activate */ | 586 | /* Setup configured maximum number of CPUs to activate */ |
diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 80c0acfb8472..6ef9433e1c70 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h | |||
| @@ -3,8 +3,6 @@ | |||
| 3 | 3 | ||
| 4 | struct task_struct; | 4 | struct task_struct; |
| 5 | 5 | ||
| 6 | int smpboot_prepare(unsigned int cpu); | ||
| 7 | |||
| 8 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | 6 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD |
| 9 | struct task_struct *idle_thread_get(unsigned int cpu); | 7 | struct task_struct *idle_thread_get(unsigned int cpu); |
| 10 | void idle_thread_set_boot_cpu(void); | 8 | void idle_thread_set_boot_cpu(void); |
diff --git a/kernel/sys.c b/kernel/sys.c index e0c8ffc50d7f..2d39a84cd857 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
| 1788 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1788 | #ifdef CONFIG_CHECKPOINT_RESTORE |
| 1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) |
| 1790 | { | 1790 | { |
| 1791 | struct vm_area_struct *vma; | ||
| 1792 | struct file *exe_file; | 1791 | struct file *exe_file; |
| 1793 | struct dentry *dentry; | 1792 | struct dentry *dentry; |
| 1794 | int err; | 1793 | int err; |
| @@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1816 | down_write(&mm->mmap_sem); | 1815 | down_write(&mm->mmap_sem); |
| 1817 | 1816 | ||
| 1818 | /* | 1817 | /* |
| 1819 | * Forbid mm->exe_file change if there are mapped other files. | 1818 | * Forbid mm->exe_file change if old file still mapped. |
| 1820 | */ | 1819 | */ |
| 1821 | err = -EBUSY; | 1820 | err = -EBUSY; |
| 1822 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 1821 | if (mm->exe_file) { |
| 1823 | if (vma->vm_file && !path_equal(&vma->vm_file->f_path, | 1822 | struct vm_area_struct *vma; |
| 1824 | &exe_file->f_path)) | 1823 | |
| 1825 | goto exit_unlock; | 1824 | for (vma = mm->mmap; vma; vma = vma->vm_next) |
| 1825 | if (vma->vm_file && | ||
| 1826 | path_equal(&vma->vm_file->f_path, | ||
| 1827 | &mm->exe_file->f_path)) | ||
| 1828 | goto exit_unlock; | ||
| 1826 | } | 1829 | } |
| 1827 | 1830 | ||
| 1828 | /* | 1831 | /* |
| @@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1835 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) | 1838 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) |
| 1836 | goto exit_unlock; | 1839 | goto exit_unlock; |
| 1837 | 1840 | ||
| 1841 | err = 0; | ||
| 1838 | set_mm_exe_file(mm, exe_file); | 1842 | set_mm_exe_file(mm, exe_file); |
| 1839 | exit_unlock: | 1843 | exit_unlock: |
| 1840 | up_write(&mm->mmap_sem); | 1844 | up_write(&mm->mmap_sem); |
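The prctl_set_mm_exe_file() hunks above invert the -EBUSY test: instead of refusing whenever any *other* file is mapped, the change refuses only while the *old* exe_file is still mapped somewhere in the address space. A small userspace model of the new rule follows; the fake_vma type, the string paths and the may_change_exe_file() helper are stand-ins invented for illustration, not kernel structures.

#include <string.h>
#include <stdio.h>

#define EBUSY 16

struct fake_vma {
	const char *file;            /* NULL for anonymous mappings */
	struct fake_vma *next;
};

/* New rule: changing exe_file is refused only while the *old* exe is
 * still mapped; unrelated file mappings no longer block the change. */
static int may_change_exe_file(const struct fake_vma *mmap, const char *old_exe)
{
	const struct fake_vma *vma;

	if (!old_exe)
		return 0;
	for (vma = mmap; vma; vma = vma->next)
		if (vma->file && strcmp(vma->file, old_exe) == 0)
			return -EBUSY;
	return 0;
}

int main(void)
{
	struct fake_vma lib = { "/lib/libc.so", NULL };
	struct fake_vma exe = { "/usr/bin/app", &lib };

	printf("%d\n", may_change_exe_file(&exe, "/usr/bin/app")); /* -16: old exe still mapped */
	printf("%d\n", may_change_exe_file(&lib, "/usr/bin/app")); /*   0: only other files mapped */
	return 0;
}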
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 70b33abcc7bb..b7fbadc5c973 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -409,7 +409,9 @@ int second_overflow(unsigned long secs) | |||
| 409 | time_state = TIME_DEL; | 409 | time_state = TIME_DEL; |
| 410 | break; | 410 | break; |
| 411 | case TIME_INS: | 411 | case TIME_INS: |
| 412 | if (secs % 86400 == 0) { | 412 | if (!(time_status & STA_INS)) |
| 413 | time_state = TIME_OK; | ||
| 414 | else if (secs % 86400 == 0) { | ||
| 413 | leap = -1; | 415 | leap = -1; |
| 414 | time_state = TIME_OOP; | 416 | time_state = TIME_OOP; |
| 415 | time_tai++; | 417 | time_tai++; |
| @@ -418,7 +420,9 @@ int second_overflow(unsigned long secs) | |||
| 418 | } | 420 | } |
| 419 | break; | 421 | break; |
| 420 | case TIME_DEL: | 422 | case TIME_DEL: |
| 421 | if ((secs + 1) % 86400 == 0) { | 423 | if (!(time_status & STA_DEL)) |
| 424 | time_state = TIME_OK; | ||
| 425 | else if ((secs + 1) % 86400 == 0) { | ||
| 422 | leap = 1; | 426 | leap = 1; |
| 423 | time_tai--; | 427 | time_tai--; |
| 424 | time_state = TIME_WAIT; | 428 | time_state = TIME_WAIT; |
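The second_overflow() hunks above make an armed leap second revocable: once in TIME_INS or TIME_DEL, clearing STA_INS or STA_DEL now drops the state machine back to TIME_OK instead of leaving the leap to fire at the next midnight UTC. Below is a reduced model of just that transition; the state and flag names follow the kernel, but the simplified signature, the driver in main() and the omitted TIME_OOP/TIME_WAIT handling are assumptions made for brevity.

#include <stdio.h>

enum { TIME_OK, TIME_INS, TIME_DEL, TIME_OOP, TIME_WAIT };
#define STA_INS 0x0010
#define STA_DEL 0x0020

static int leap_step(int state, int status, unsigned long secs, int *leap)
{
	*leap = 0;
	switch (state) {
	case TIME_OK:
		if (status & STA_INS)
			state = TIME_INS;
		else if (status & STA_DEL)
			state = TIME_DEL;
		break;
	case TIME_INS:
		if (!(status & STA_INS))
			state = TIME_OK;          /* leap was cancelled in time */
		else if (secs % 86400 == 0) {
			*leap = -1;               /* insert a second at midnight UTC */
			state = TIME_OOP;
		}
		break;
	case TIME_DEL:
		if (!(status & STA_DEL))
			state = TIME_OK;          /* leap was cancelled in time */
		else if ((secs + 1) % 86400 == 0) {
			*leap = 1;                /* delete a second at midnight UTC */
			state = TIME_WAIT;
		}
		break;
	}
	return state;
}

int main(void)
{
	int leap, state = TIME_OK;

	state = leap_step(state, STA_INS, 86000, &leap);  /* leap armed during the day */
	state = leap_step(state, 0,       86400, &leap);  /* flag cleared before midnight */
	printf("state=%d leap=%d\n", state, leap);        /* back to TIME_OK, no leap applied */
	return 0;
}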
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 869997833928..024540f97f74 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void) | |||
| 105 | /* | 105 | /* |
| 106 | * NO HZ enabled ? | 106 | * NO HZ enabled ? |
| 107 | */ | 107 | */ |
| 108 | static int tick_nohz_enabled __read_mostly = 1; | 108 | int tick_nohz_enabled __read_mostly = 1; |
| 109 | 109 | ||
| 110 | /* | 110 | /* |
| 111 | * Enable / Disable tickless mode | 111 | * Enable / Disable tickless mode |
| @@ -271,50 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) | |||
| 271 | } | 271 | } |
| 272 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | 272 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); |
| 273 | 273 | ||
| 274 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | 274 | static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, |
| 275 | ktime_t now, int cpu) | ||
| 275 | { | 276 | { |
| 276 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; | 277 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
| 278 | ktime_t last_update, expires, ret = { .tv64 = 0 }; | ||
| 277 | unsigned long rcu_delta_jiffies; | 279 | unsigned long rcu_delta_jiffies; |
| 278 | ktime_t last_update, expires, now; | ||
| 279 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 280 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
| 280 | u64 time_delta; | 281 | u64 time_delta; |
| 281 | int cpu; | ||
| 282 | |||
| 283 | cpu = smp_processor_id(); | ||
| 284 | ts = &per_cpu(tick_cpu_sched, cpu); | ||
| 285 | |||
| 286 | now = tick_nohz_start_idle(cpu, ts); | ||
| 287 | |||
| 288 | /* | ||
| 289 | * If this cpu is offline and it is the one which updates | ||
| 290 | * jiffies, then give up the assignment and let it be taken by | ||
| 291 | * the cpu which runs the tick timer next. If we don't drop | ||
| 292 | * this here the jiffies might be stale and do_timer() never | ||
| 293 | * invoked. | ||
| 294 | */ | ||
| 295 | if (unlikely(!cpu_online(cpu))) { | ||
| 296 | if (cpu == tick_do_timer_cpu) | ||
| 297 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; | ||
| 298 | } | ||
| 299 | |||
| 300 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | ||
| 301 | return; | ||
| 302 | 282 | ||
| 303 | if (need_resched()) | ||
| 304 | return; | ||
| 305 | |||
| 306 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { | ||
| 307 | static int ratelimit; | ||
| 308 | |||
| 309 | if (ratelimit < 10) { | ||
| 310 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | ||
| 311 | (unsigned int) local_softirq_pending()); | ||
| 312 | ratelimit++; | ||
| 313 | } | ||
| 314 | return; | ||
| 315 | } | ||
| 316 | |||
| 317 | ts->idle_calls++; | ||
| 318 | /* Read jiffies and the time when jiffies were updated last */ | 283 | /* Read jiffies and the time when jiffies were updated last */ |
| 319 | do { | 284 | do { |
| 320 | seq = read_seqbegin(&xtime_lock); | 285 | seq = read_seqbegin(&xtime_lock); |
| @@ -397,6 +362,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
| 397 | if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) | 362 | if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) |
| 398 | goto out; | 363 | goto out; |
| 399 | 364 | ||
| 365 | ret = expires; | ||
| 366 | |||
| 400 | /* | 367 | /* |
| 401 | * nohz_stop_sched_tick can be called several times before | 368 | * nohz_stop_sched_tick can be called several times before |
| 402 | * the nohz_restart_sched_tick is called. This happens when | 369 | * the nohz_restart_sched_tick is called. This happens when |
| @@ -406,17 +373,12 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
| 406 | */ | 373 | */ |
| 407 | if (!ts->tick_stopped) { | 374 | if (!ts->tick_stopped) { |
| 408 | select_nohz_load_balancer(1); | 375 | select_nohz_load_balancer(1); |
| 376 | calc_load_enter_idle(); | ||
| 409 | 377 | ||
| 410 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 378 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
| 411 | ts->tick_stopped = 1; | 379 | ts->tick_stopped = 1; |
| 412 | ts->idle_jiffies = last_jiffies; | ||
| 413 | } | 380 | } |
| 414 | 381 | ||
| 415 | ts->idle_sleeps++; | ||
| 416 | |||
| 417 | /* Mark expires */ | ||
| 418 | ts->idle_expires = expires; | ||
| 419 | |||
| 420 | /* | 382 | /* |
| 421 | * If the expiration time == KTIME_MAX, then | 383 | * If the expiration time == KTIME_MAX, then |
| 422 | * in this case we simply stop the tick timer. | 384 | * in this case we simply stop the tick timer. |
| @@ -447,6 +409,65 @@ out: | |||
| 447 | ts->next_jiffies = next_jiffies; | 409 | ts->next_jiffies = next_jiffies; |
| 448 | ts->last_jiffies = last_jiffies; | 410 | ts->last_jiffies = last_jiffies; |
| 449 | ts->sleep_length = ktime_sub(dev->next_event, now); | 411 | ts->sleep_length = ktime_sub(dev->next_event, now); |
| 412 | |||
| 413 | return ret; | ||
| 414 | } | ||
| 415 | |||
| 416 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | ||
| 417 | { | ||
| 418 | /* | ||
| 419 | * If this cpu is offline and it is the one which updates | ||
| 420 | * jiffies, then give up the assignment and let it be taken by | ||
| 421 | * the cpu which runs the tick timer next. If we don't drop | ||
| 422 | * this here the jiffies might be stale and do_timer() never | ||
| 423 | * invoked. | ||
| 424 | */ | ||
| 425 | if (unlikely(!cpu_online(cpu))) { | ||
| 426 | if (cpu == tick_do_timer_cpu) | ||
| 427 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; | ||
| 428 | } | ||
| 429 | |||
| 430 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | ||
| 431 | return false; | ||
| 432 | |||
| 433 | if (need_resched()) | ||
| 434 | return false; | ||
| 435 | |||
| 436 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { | ||
| 437 | static int ratelimit; | ||
| 438 | |||
| 439 | if (ratelimit < 10) { | ||
| 440 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | ||
| 441 | (unsigned int) local_softirq_pending()); | ||
| 442 | ratelimit++; | ||
| 443 | } | ||
| 444 | return false; | ||
| 445 | } | ||
| 446 | |||
| 447 | return true; | ||
| 448 | } | ||
| 449 | |||
| 450 | static void __tick_nohz_idle_enter(struct tick_sched *ts) | ||
| 451 | { | ||
| 452 | ktime_t now, expires; | ||
| 453 | int cpu = smp_processor_id(); | ||
| 454 | |||
| 455 | now = tick_nohz_start_idle(cpu, ts); | ||
| 456 | |||
| 457 | if (can_stop_idle_tick(cpu, ts)) { | ||
| 458 | int was_stopped = ts->tick_stopped; | ||
| 459 | |||
| 460 | ts->idle_calls++; | ||
| 461 | |||
| 462 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); | ||
| 463 | if (expires.tv64 > 0LL) { | ||
| 464 | ts->idle_sleeps++; | ||
| 465 | ts->idle_expires = expires; | ||
| 466 | } | ||
| 467 | |||
| 468 | if (!was_stopped && ts->tick_stopped) | ||
| 469 | ts->idle_jiffies = ts->last_jiffies; | ||
| 470 | } | ||
| 450 | } | 471 | } |
| 451 | 472 | ||
| 452 | /** | 473 | /** |
| @@ -484,7 +505,7 @@ void tick_nohz_idle_enter(void) | |||
| 484 | * update of the idle time accounting in tick_nohz_start_idle(). | 505 | * update of the idle time accounting in tick_nohz_start_idle(). |
| 485 | */ | 506 | */ |
| 486 | ts->inidle = 1; | 507 | ts->inidle = 1; |
| 487 | tick_nohz_stop_sched_tick(ts); | 508 | __tick_nohz_idle_enter(ts); |
| 488 | 509 | ||
| 489 | local_irq_enable(); | 510 | local_irq_enable(); |
| 490 | } | 511 | } |
| @@ -504,7 +525,7 @@ void tick_nohz_irq_exit(void) | |||
| 504 | if (!ts->inidle) | 525 | if (!ts->inidle) |
| 505 | return; | 526 | return; |
| 506 | 527 | ||
| 507 | tick_nohz_stop_sched_tick(ts); | 528 | __tick_nohz_idle_enter(ts); |
| 508 | } | 529 | } |
| 509 | 530 | ||
| 510 | /** | 531 | /** |
| @@ -522,7 +543,7 @@ ktime_t tick_nohz_get_sleep_length(void) | |||
| 522 | static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | 543 | static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) |
| 523 | { | 544 | { |
| 524 | hrtimer_cancel(&ts->sched_timer); | 545 | hrtimer_cancel(&ts->sched_timer); |
| 525 | hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); | 546 | hrtimer_set_expires(&ts->sched_timer, ts->last_tick); |
| 526 | 547 | ||
| 527 | while (1) { | 548 | while (1) { |
| 528 | /* Forward the time to expire in the future */ | 549 | /* Forward the time to expire in the future */ |
| @@ -545,6 +566,41 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
| 545 | } | 566 | } |
| 546 | } | 567 | } |
| 547 | 568 | ||
| 569 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | ||
| 570 | { | ||
| 571 | /* Update jiffies first */ | ||
| 572 | select_nohz_load_balancer(0); | ||
| 573 | tick_do_update_jiffies64(now); | ||
| 574 | update_cpu_load_nohz(); | ||
| 575 | |||
| 576 | touch_softlockup_watchdog(); | ||
| 577 | /* | ||
| 578 | * Cancel the scheduled timer and restore the tick | ||
| 579 | */ | ||
| 580 | ts->tick_stopped = 0; | ||
| 581 | ts->idle_exittime = now; | ||
| 582 | |||
| 583 | tick_nohz_restart(ts, now); | ||
| 584 | } | ||
| 585 | |||
| 586 | static void tick_nohz_account_idle_ticks(struct tick_sched *ts) | ||
| 587 | { | ||
| 588 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 589 | unsigned long ticks; | ||
| 590 | /* | ||
| 591 | * We stopped the tick in idle. Update process times would miss the | ||
| 592 | * time we slept as update_process_times does only a 1 tick | ||
| 593 | * accounting. Enforce that this is accounted to idle ! | ||
| 594 | */ | ||
| 595 | ticks = jiffies - ts->idle_jiffies; | ||
| 596 | /* | ||
| 597 | * We might be one off. Do not randomly account a huge number of ticks! | ||
| 598 | */ | ||
| 599 | if (ticks && ticks < LONG_MAX) | ||
| 600 | account_idle_ticks(ticks); | ||
| 601 | #endif | ||
| 602 | } | ||
| 603 | |||
| 548 | /** | 604 | /** |
| 549 | * tick_nohz_idle_exit - restart the idle tick from the idle task | 605 | * tick_nohz_idle_exit - restart the idle tick from the idle task |
| 550 | * | 606 | * |
| @@ -556,9 +612,6 @@ void tick_nohz_idle_exit(void) | |||
| 556 | { | 612 | { |
| 557 | int cpu = smp_processor_id(); | 613 | int cpu = smp_processor_id(); |
| 558 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 614 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
| 559 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 560 | unsigned long ticks; | ||
| 561 | #endif | ||
| 562 | ktime_t now; | 615 | ktime_t now; |
| 563 | 616 | ||
| 564 | local_irq_disable(); | 617 | local_irq_disable(); |
| @@ -573,39 +626,11 @@ void tick_nohz_idle_exit(void) | |||
| 573 | if (ts->idle_active) | 626 | if (ts->idle_active) |
| 574 | tick_nohz_stop_idle(cpu, now); | 627 | tick_nohz_stop_idle(cpu, now); |
| 575 | 628 | ||
| 576 | if (!ts->tick_stopped) { | 629 | if (ts->tick_stopped) { |
| 577 | local_irq_enable(); | 630 | tick_nohz_restart_sched_tick(ts, now); |
| 578 | return; | 631 | tick_nohz_account_idle_ticks(ts); |
| 579 | } | 632 | } |
| 580 | 633 | ||
| 581 | /* Update jiffies first */ | ||
| 582 | select_nohz_load_balancer(0); | ||
| 583 | tick_do_update_jiffies64(now); | ||
| 584 | update_cpu_load_nohz(); | ||
| 585 | |||
| 586 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 587 | /* | ||
| 588 | * We stopped the tick in idle. Update process times would miss the | ||
| 589 | * time we slept as update_process_times does only a 1 tick | ||
| 590 | * accounting. Enforce that this is accounted to idle ! | ||
| 591 | */ | ||
| 592 | ticks = jiffies - ts->idle_jiffies; | ||
| 593 | /* | ||
| 594 | * We might be one off. Do not randomly account a huge number of ticks! | ||
| 595 | */ | ||
| 596 | if (ticks && ticks < LONG_MAX) | ||
| 597 | account_idle_ticks(ticks); | ||
| 598 | #endif | ||
| 599 | |||
| 600 | touch_softlockup_watchdog(); | ||
| 601 | /* | ||
| 602 | * Cancel the scheduled timer and restore the tick | ||
| 603 | */ | ||
| 604 | ts->tick_stopped = 0; | ||
| 605 | ts->idle_exittime = now; | ||
| 606 | |||
| 607 | tick_nohz_restart(ts, now); | ||
| 608 | |||
| 609 | local_irq_enable(); | 634 | local_irq_enable(); |
| 610 | } | 635 | } |
| 611 | 636 | ||
| @@ -809,7 +834,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
| 809 | */ | 834 | */ |
| 810 | if (ts->tick_stopped) { | 835 | if (ts->tick_stopped) { |
| 811 | touch_softlockup_watchdog(); | 836 | touch_softlockup_watchdog(); |
| 812 | ts->idle_jiffies++; | 837 | if (idle_cpu(cpu)) |
| 838 | ts->idle_jiffies++; | ||
| 813 | } | 839 | } |
| 814 | update_process_times(user_mode(regs)); | 840 | update_process_times(user_mode(regs)); |
| 815 | profile_tick(CPU_PROFILING); | 841 | profile_tick(CPU_PROFILING); |
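The tick-sched rework above splits the old stop path into can_stop_idle_tick() plus a stop routine that returns the programmed expiry, and mirrors it on exit with tick_nohz_restart_sched_tick() and tick_nohz_account_idle_ticks(); calc_load_enter_idle() is hooked in exactly once, when the tick is first stopped. The bulk idle accounting done in that last helper is easy to model in isolation; the sketch below is a userspace stand-in (account_idle_ticks() stubbed, jiffy values made up), not the kernel function.

#include <stdio.h>
#include <limits.h>

static unsigned long idle_ticks_charged;

static void account_idle_ticks(unsigned long ticks)
{
	idle_ticks_charged += ticks;     /* stand-in for the kernel's accounting call */
}

/* update_process_times() only ever charges one tick, so the jiffies
 * slept through with the tick stopped are charged to idle in one batch. */
static void nohz_account_idle_ticks(unsigned long jiffies_now,
				    unsigned long idle_jiffies)
{
	unsigned long ticks = jiffies_now - idle_jiffies;

	/* we might be one off; never account a huge bogus number of ticks */
	if (ticks && ticks < LONG_MAX)
		account_idle_ticks(ticks);
}

int main(void)
{
	/* tick stopped at jiffy 1000, cpu woke and restarted it at jiffy 1047 */
	nohz_account_idle_ticks(1047, 1000);
	printf("idle ticks charged in bulk: %lu\n", idle_ticks_charged); /* 47 */
	return 0;
}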
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6f46a00a1e8a..f045cc50832d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -24,32 +24,32 @@ | |||
| 24 | /* Structure holding internal timekeeping values. */ | 24 | /* Structure holding internal timekeeping values. */ |
| 25 | struct timekeeper { | 25 | struct timekeeper { |
| 26 | /* Current clocksource used for timekeeping. */ | 26 | /* Current clocksource used for timekeeping. */ |
| 27 | struct clocksource *clock; | 27 | struct clocksource *clock; |
| 28 | /* NTP adjusted clock multiplier */ | 28 | /* NTP adjusted clock multiplier */ |
| 29 | u32 mult; | 29 | u32 mult; |
| 30 | /* The shift value of the current clocksource. */ | 30 | /* The shift value of the current clocksource. */ |
| 31 | int shift; | 31 | u32 shift; |
| 32 | |||
| 33 | /* Number of clock cycles in one NTP interval. */ | 32 | /* Number of clock cycles in one NTP interval. */ |
| 34 | cycle_t cycle_interval; | 33 | cycle_t cycle_interval; |
| 35 | /* Number of clock shifted nano seconds in one NTP interval. */ | 34 | /* Number of clock shifted nano seconds in one NTP interval. */ |
| 36 | u64 xtime_interval; | 35 | u64 xtime_interval; |
| 37 | /* shifted nano seconds left over when rounding cycle_interval */ | 36 | /* shifted nano seconds left over when rounding cycle_interval */ |
| 38 | s64 xtime_remainder; | 37 | s64 xtime_remainder; |
| 39 | /* Raw nano seconds accumulated per NTP interval. */ | 38 | /* Raw nano seconds accumulated per NTP interval. */ |
| 40 | u32 raw_interval; | 39 | u32 raw_interval; |
| 40 | |||
| 41 | /* Current CLOCK_REALTIME time in seconds */ | ||
| 42 | u64 xtime_sec; | ||
| 43 | /* Clock shifted nano seconds */ | ||
| 44 | u64 xtime_nsec; | ||
| 41 | 45 | ||
| 42 | /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ | ||
| 43 | u64 xtime_nsec; | ||
| 44 | /* Difference between accumulated time and NTP time in ntp | 46 | /* Difference between accumulated time and NTP time in ntp |
| 45 | * shifted nano seconds. */ | 47 | * shifted nano seconds. */ |
| 46 | s64 ntp_error; | 48 | s64 ntp_error; |
| 47 | /* Shift conversion between clock shifted nano seconds and | 49 | /* Shift conversion between clock shifted nano seconds and |
| 48 | * ntp shifted nano seconds. */ | 50 | * ntp shifted nano seconds. */ |
| 49 | int ntp_error_shift; | 51 | u32 ntp_error_shift; |
| 50 | 52 | ||
| 51 | /* The current time */ | ||
| 52 | struct timespec xtime; | ||
| 53 | /* | 53 | /* |
| 54 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected | 54 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected |
| 55 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged | 55 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged |
| @@ -64,14 +64,17 @@ struct timekeeper { | |||
| 64 | * - wall_to_monotonic is no longer the boot time, getboottime must be | 64 | * - wall_to_monotonic is no longer the boot time, getboottime must be |
| 65 | * used instead. | 65 | * used instead. |
| 66 | */ | 66 | */ |
| 67 | struct timespec wall_to_monotonic; | 67 | struct timespec wall_to_monotonic; |
| 68 | /* time spent in suspend */ | 68 | /* time spent in suspend */ |
| 69 | struct timespec total_sleep_time; | 69 | struct timespec total_sleep_time; |
| 70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ | 70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ |
| 71 | struct timespec raw_time; | 71 | struct timespec raw_time; |
| 72 | 72 | /* Offset clock monotonic -> clock realtime */ | |
| 73 | ktime_t offs_real; | ||
| 74 | /* Offset clock monotonic -> clock boottime */ | ||
| 75 | ktime_t offs_boot; | ||
| 73 | /* Seqlock for all timekeeper values */ | 76 | /* Seqlock for all timekeeper values */ |
| 74 | seqlock_t lock; | 77 | seqlock_t lock; |
| 75 | }; | 78 | }; |
| 76 | 79 | ||
| 77 | static struct timekeeper timekeeper; | 80 | static struct timekeeper timekeeper; |
| @@ -82,11 +85,37 @@ static struct timekeeper timekeeper; | |||
| 82 | */ | 85 | */ |
| 83 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | 86 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); |
| 84 | 87 | ||
| 85 | |||
| 86 | /* flag for if timekeeping is suspended */ | 88 | /* flag for if timekeeping is suspended */ |
| 87 | int __read_mostly timekeeping_suspended; | 89 | int __read_mostly timekeeping_suspended; |
| 88 | 90 | ||
| 91 | static inline void tk_normalize_xtime(struct timekeeper *tk) | ||
| 92 | { | ||
| 93 | while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { | ||
| 94 | tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; | ||
| 95 | tk->xtime_sec++; | ||
| 96 | } | ||
| 97 | } | ||
| 98 | |||
| 99 | static struct timespec tk_xtime(struct timekeeper *tk) | ||
| 100 | { | ||
| 101 | struct timespec ts; | ||
| 102 | |||
| 103 | ts.tv_sec = tk->xtime_sec; | ||
| 104 | ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); | ||
| 105 | return ts; | ||
| 106 | } | ||
| 107 | |||
| 108 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) | ||
| 109 | { | ||
| 110 | tk->xtime_sec = ts->tv_sec; | ||
| 111 | tk->xtime_nsec = ts->tv_nsec << tk->shift; | ||
| 112 | } | ||
| 89 | 113 | ||
| 114 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) | ||
| 115 | { | ||
| 116 | tk->xtime_sec += ts->tv_sec; | ||
| 117 | tk->xtime_nsec += ts->tv_nsec << tk->shift; | ||
| 118 | } | ||
| 90 | 119 | ||
| 91 | /** | 120 | /** |
| 92 | * timekeeper_setup_internals - Set up internals to use clocksource clock. | 121 | * timekeeper_setup_internals - Set up internals to use clocksource clock. |
| @@ -98,12 +127,14 @@ int __read_mostly timekeeping_suspended; | |||
| 98 | * | 127 | * |
| 99 | * Unless you're the timekeeping code, you should not be using this! | 128 | * Unless you're the timekeeping code, you should not be using this! |
| 100 | */ | 129 | */ |
| 101 | static void timekeeper_setup_internals(struct clocksource *clock) | 130 | static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) |
| 102 | { | 131 | { |
| 103 | cycle_t interval; | 132 | cycle_t interval; |
| 104 | u64 tmp, ntpinterval; | 133 | u64 tmp, ntpinterval; |
| 134 | struct clocksource *old_clock; | ||
| 105 | 135 | ||
| 106 | timekeeper.clock = clock; | 136 | old_clock = tk->clock; |
| 137 | tk->clock = clock; | ||
| 107 | clock->cycle_last = clock->read(clock); | 138 | clock->cycle_last = clock->read(clock); |
| 108 | 139 | ||
| 109 | /* Do the ns -> cycle conversion first, using original mult */ | 140 | /* Do the ns -> cycle conversion first, using original mult */ |
| @@ -116,71 +147,96 @@ static void timekeeper_setup_internals(struct clocksource *clock) | |||
| 116 | tmp = 1; | 147 | tmp = 1; |
| 117 | 148 | ||
| 118 | interval = (cycle_t) tmp; | 149 | interval = (cycle_t) tmp; |
| 119 | timekeeper.cycle_interval = interval; | 150 | tk->cycle_interval = interval; |
| 120 | 151 | ||
| 121 | /* Go back from cycles -> shifted ns */ | 152 | /* Go back from cycles -> shifted ns */ |
| 122 | timekeeper.xtime_interval = (u64) interval * clock->mult; | 153 | tk->xtime_interval = (u64) interval * clock->mult; |
| 123 | timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; | 154 | tk->xtime_remainder = ntpinterval - tk->xtime_interval; |
| 124 | timekeeper.raw_interval = | 155 | tk->raw_interval = |
| 125 | ((u64) interval * clock->mult) >> clock->shift; | 156 | ((u64) interval * clock->mult) >> clock->shift; |
| 126 | 157 | ||
| 127 | timekeeper.xtime_nsec = 0; | 158 | /* if changing clocks, convert xtime_nsec shift units */ |
| 128 | timekeeper.shift = clock->shift; | 159 | if (old_clock) { |
| 160 | int shift_change = clock->shift - old_clock->shift; | ||
| 161 | if (shift_change < 0) | ||
| 162 | tk->xtime_nsec >>= -shift_change; | ||
| 163 | else | ||
| 164 | tk->xtime_nsec <<= shift_change; | ||
| 165 | } | ||
| 166 | tk->shift = clock->shift; | ||
| 129 | 167 | ||
| 130 | timekeeper.ntp_error = 0; | 168 | tk->ntp_error = 0; |
| 131 | timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; | 169 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; |
| 132 | 170 | ||
| 133 | /* | 171 | /* |
| 134 | * The timekeeper keeps its own mult values for the currently | 172 | * The timekeeper keeps its own mult values for the currently |
| 135 | * active clocksource. These value will be adjusted via NTP | 173 | * active clocksource. These value will be adjusted via NTP |
| 136 | * to counteract clock drifting. | 174 | * to counteract clock drifting. |
| 137 | */ | 175 | */ |
| 138 | timekeeper.mult = clock->mult; | 176 | tk->mult = clock->mult; |
| 139 | } | 177 | } |
| 140 | 178 | ||
| 141 | /* Timekeeper helper functions. */ | 179 | /* Timekeeper helper functions. */ |
| 142 | static inline s64 timekeeping_get_ns(void) | 180 | static inline s64 timekeeping_get_ns(struct timekeeper *tk) |
| 143 | { | 181 | { |
| 144 | cycle_t cycle_now, cycle_delta; | 182 | cycle_t cycle_now, cycle_delta; |
| 145 | struct clocksource *clock; | 183 | struct clocksource *clock; |
| 184 | s64 nsec; | ||
| 146 | 185 | ||
| 147 | /* read clocksource: */ | 186 | /* read clocksource: */ |
| 148 | clock = timekeeper.clock; | 187 | clock = tk->clock; |
| 149 | cycle_now = clock->read(clock); | 188 | cycle_now = clock->read(clock); |
| 150 | 189 | ||
| 151 | /* calculate the delta since the last update_wall_time: */ | 190 | /* calculate the delta since the last update_wall_time: */ |
| 152 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 191 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; |
| 153 | 192 | ||
| 154 | /* return delta convert to nanoseconds using ntp adjusted mult. */ | 193 | nsec = cycle_delta * tk->mult + tk->xtime_nsec; |
| 155 | return clocksource_cyc2ns(cycle_delta, timekeeper.mult, | 194 | nsec >>= tk->shift; |
| 156 | timekeeper.shift); | 195 | |
| 196 | /* If arch requires, add in gettimeoffset() */ | ||
| 197 | return nsec + arch_gettimeoffset(); | ||
| 157 | } | 198 | } |
| 158 | 199 | ||
| 159 | static inline s64 timekeeping_get_ns_raw(void) | 200 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) |
| 160 | { | 201 | { |
| 161 | cycle_t cycle_now, cycle_delta; | 202 | cycle_t cycle_now, cycle_delta; |
| 162 | struct clocksource *clock; | 203 | struct clocksource *clock; |
| 204 | s64 nsec; | ||
| 163 | 205 | ||
| 164 | /* read clocksource: */ | 206 | /* read clocksource: */ |
| 165 | clock = timekeeper.clock; | 207 | clock = tk->clock; |
| 166 | cycle_now = clock->read(clock); | 208 | cycle_now = clock->read(clock); |
| 167 | 209 | ||
| 168 | /* calculate the delta since the last update_wall_time: */ | 210 | /* calculate the delta since the last update_wall_time: */ |
| 169 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 211 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; |
| 170 | 212 | ||
| 171 | /* return delta convert to nanoseconds. */ | 213 | /* convert delta to nanoseconds. */ |
| 172 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 214 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
| 215 | |||
| 216 | /* If arch requires, add in gettimeoffset() */ | ||
| 217 | return nsec + arch_gettimeoffset(); | ||
| 218 | } | ||
| 219 | |||
| 220 | static void update_rt_offset(struct timekeeper *tk) | ||
| 221 | { | ||
| 222 | struct timespec tmp, *wtm = &tk->wall_to_monotonic; | ||
| 223 | |||
| 224 | set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); | ||
| 225 | tk->offs_real = timespec_to_ktime(tmp); | ||
| 173 | } | 226 | } |
| 174 | 227 | ||
| 175 | /* must hold write on timekeeper.lock */ | 228 | /* must hold write on timekeeper.lock */ |
| 176 | static void timekeeping_update(bool clearntp) | 229 | static void timekeeping_update(struct timekeeper *tk, bool clearntp) |
| 177 | { | 230 | { |
| 231 | struct timespec xt; | ||
| 232 | |||
| 178 | if (clearntp) { | 233 | if (clearntp) { |
| 179 | timekeeper.ntp_error = 0; | 234 | tk->ntp_error = 0; |
| 180 | ntp_clear(); | 235 | ntp_clear(); |
| 181 | } | 236 | } |
| 182 | update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, | 237 | update_rt_offset(tk); |
| 183 | timekeeper.clock, timekeeper.mult); | 238 | xt = tk_xtime(tk); |
| 239 | update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); | ||
| 184 | } | 240 | } |
| 185 | 241 | ||
| 186 | 242 | ||
| @@ -191,27 +247,26 @@ static void timekeeping_update(bool clearntp) | |||
| 191 | * update_wall_time(). This is useful before significant clock changes, | 247 | * update_wall_time(). This is useful before significant clock changes, |
| 192 | * as it avoids having to deal with this time offset explicitly. | 248 | * as it avoids having to deal with this time offset explicitly. |
| 193 | */ | 249 | */ |
| 194 | static void timekeeping_forward_now(void) | 250 | static void timekeeping_forward_now(struct timekeeper *tk) |
| 195 | { | 251 | { |
| 196 | cycle_t cycle_now, cycle_delta; | 252 | cycle_t cycle_now, cycle_delta; |
| 197 | struct clocksource *clock; | 253 | struct clocksource *clock; |
| 198 | s64 nsec; | 254 | s64 nsec; |
| 199 | 255 | ||
| 200 | clock = timekeeper.clock; | 256 | clock = tk->clock; |
| 201 | cycle_now = clock->read(clock); | 257 | cycle_now = clock->read(clock); |
| 202 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 258 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; |
| 203 | clock->cycle_last = cycle_now; | 259 | clock->cycle_last = cycle_now; |
| 204 | 260 | ||
| 205 | nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, | 261 | tk->xtime_nsec += cycle_delta * tk->mult; |
| 206 | timekeeper.shift); | ||
| 207 | 262 | ||
| 208 | /* If arch requires, add in gettimeoffset() */ | 263 | /* If arch requires, add in gettimeoffset() */ |
| 209 | nsec += arch_gettimeoffset(); | 264 | tk->xtime_nsec += arch_gettimeoffset() << tk->shift; |
| 210 | 265 | ||
| 211 | timespec_add_ns(&timekeeper.xtime, nsec); | 266 | tk_normalize_xtime(tk); |
| 212 | 267 | ||
| 213 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 268 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
| 214 | timespec_add_ns(&timekeeper.raw_time, nsec); | 269 | timespec_add_ns(&tk->raw_time, nsec); |
| 215 | } | 270 | } |
| 216 | 271 | ||
| 217 | /** | 272 | /** |
| @@ -223,18 +278,15 @@ static void timekeeping_forward_now(void) | |||
| 223 | void getnstimeofday(struct timespec *ts) | 278 | void getnstimeofday(struct timespec *ts) |
| 224 | { | 279 | { |
| 225 | unsigned long seq; | 280 | unsigned long seq; |
| 226 | s64 nsecs; | 281 | s64 nsecs = 0; |
| 227 | 282 | ||
| 228 | WARN_ON(timekeeping_suspended); | 283 | WARN_ON(timekeeping_suspended); |
| 229 | 284 | ||
| 230 | do { | 285 | do { |
| 231 | seq = read_seqbegin(&timekeeper.lock); | 286 | seq = read_seqbegin(&timekeeper.lock); |
| 232 | 287 | ||
| 233 | *ts = timekeeper.xtime; | 288 | ts->tv_sec = timekeeper.xtime_sec; |
| 234 | nsecs = timekeeping_get_ns(); | 289 | ts->tv_nsec = timekeeping_get_ns(&timekeeper); |
| 235 | |||
| 236 | /* If arch requires, add in gettimeoffset() */ | ||
| 237 | nsecs += arch_gettimeoffset(); | ||
| 238 | 290 | ||
| 239 | } while (read_seqretry(&timekeeper.lock, seq)); | 291 | } while (read_seqretry(&timekeeper.lock, seq)); |
| 240 | 292 | ||
| @@ -251,13 +303,10 @@ ktime_t ktime_get(void) | |||
| 251 | 303 | ||
| 252 | do { | 304 | do { |
| 253 | seq = read_seqbegin(&timekeeper.lock); | 305 | seq = read_seqbegin(&timekeeper.lock); |
| 254 | secs = timekeeper.xtime.tv_sec + | 306 | secs = timekeeper.xtime_sec + |
| 255 | timekeeper.wall_to_monotonic.tv_sec; | 307 | timekeeper.wall_to_monotonic.tv_sec; |
| 256 | nsecs = timekeeper.xtime.tv_nsec + | 308 | nsecs = timekeeping_get_ns(&timekeeper) + |
| 257 | timekeeper.wall_to_monotonic.tv_nsec; | 309 | timekeeper.wall_to_monotonic.tv_nsec; |
| 258 | nsecs += timekeeping_get_ns(); | ||
| 259 | /* If arch requires, add in gettimeoffset() */ | ||
| 260 | nsecs += arch_gettimeoffset(); | ||
| 261 | 310 | ||
| 262 | } while (read_seqretry(&timekeeper.lock, seq)); | 311 | } while (read_seqretry(&timekeeper.lock, seq)); |
| 263 | /* | 312 | /* |
| @@ -280,22 +329,19 @@ void ktime_get_ts(struct timespec *ts) | |||
| 280 | { | 329 | { |
| 281 | struct timespec tomono; | 330 | struct timespec tomono; |
| 282 | unsigned int seq; | 331 | unsigned int seq; |
| 283 | s64 nsecs; | ||
| 284 | 332 | ||
| 285 | WARN_ON(timekeeping_suspended); | 333 | WARN_ON(timekeeping_suspended); |
| 286 | 334 | ||
| 287 | do { | 335 | do { |
| 288 | seq = read_seqbegin(&timekeeper.lock); | 336 | seq = read_seqbegin(&timekeeper.lock); |
| 289 | *ts = timekeeper.xtime; | 337 | ts->tv_sec = timekeeper.xtime_sec; |
| 338 | ts->tv_nsec = timekeeping_get_ns(&timekeeper); | ||
| 290 | tomono = timekeeper.wall_to_monotonic; | 339 | tomono = timekeeper.wall_to_monotonic; |
| 291 | nsecs = timekeeping_get_ns(); | ||
| 292 | /* If arch requires, add in gettimeoffset() */ | ||
| 293 | nsecs += arch_gettimeoffset(); | ||
| 294 | 340 | ||
| 295 | } while (read_seqretry(&timekeeper.lock, seq)); | 341 | } while (read_seqretry(&timekeeper.lock, seq)); |
| 296 | 342 | ||
| 297 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, | 343 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, |
| 298 | ts->tv_nsec + tomono.tv_nsec + nsecs); | 344 | ts->tv_nsec + tomono.tv_nsec); |
| 299 | } | 345 | } |
| 300 | EXPORT_SYMBOL_GPL(ktime_get_ts); | 346 | EXPORT_SYMBOL_GPL(ktime_get_ts); |
| 301 | 347 | ||
| @@ -318,20 +364,14 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
| 318 | WARN_ON_ONCE(timekeeping_suspended); | 364 | WARN_ON_ONCE(timekeeping_suspended); |
| 319 | 365 | ||
| 320 | do { | 366 | do { |
| 321 | u32 arch_offset; | ||
| 322 | |||
| 323 | seq = read_seqbegin(&timekeeper.lock); | 367 | seq = read_seqbegin(&timekeeper.lock); |
| 324 | 368 | ||
| 325 | *ts_raw = timekeeper.raw_time; | 369 | *ts_raw = timekeeper.raw_time; |
| 326 | *ts_real = timekeeper.xtime; | 370 | ts_real->tv_sec = timekeeper.xtime_sec; |
| 327 | 371 | ts_real->tv_nsec = 0; | |
| 328 | nsecs_raw = timekeeping_get_ns_raw(); | ||
| 329 | nsecs_real = timekeeping_get_ns(); | ||
| 330 | 372 | ||
| 331 | /* If arch requires, add in gettimeoffset() */ | 373 | nsecs_raw = timekeeping_get_ns_raw(&timekeeper); |
| 332 | arch_offset = arch_gettimeoffset(); | 374 | nsecs_real = timekeeping_get_ns(&timekeeper); |
| 333 | nsecs_raw += arch_offset; | ||
| 334 | nsecs_real += arch_offset; | ||
| 335 | 375 | ||
| 336 | } while (read_seqretry(&timekeeper.lock, seq)); | 376 | } while (read_seqretry(&timekeeper.lock, seq)); |
| 337 | 377 | ||
| @@ -366,7 +406,7 @@ EXPORT_SYMBOL(do_gettimeofday); | |||
| 366 | */ | 406 | */ |
| 367 | int do_settimeofday(const struct timespec *tv) | 407 | int do_settimeofday(const struct timespec *tv) |
| 368 | { | 408 | { |
| 369 | struct timespec ts_delta; | 409 | struct timespec ts_delta, xt; |
| 370 | unsigned long flags; | 410 | unsigned long flags; |
| 371 | 411 | ||
| 372 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | 412 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) |
| @@ -374,15 +414,18 @@ int do_settimeofday(const struct timespec *tv) | |||
| 374 | 414 | ||
| 375 | write_seqlock_irqsave(&timekeeper.lock, flags); | 415 | write_seqlock_irqsave(&timekeeper.lock, flags); |
| 376 | 416 | ||
| 377 | timekeeping_forward_now(); | 417 | timekeeping_forward_now(&timekeeper); |
| 418 | |||
| 419 | xt = tk_xtime(&timekeeper); | ||
| 420 | ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; | ||
| 421 | ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; | ||
| 378 | 422 | ||
| 379 | ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec; | ||
| 380 | ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec; | ||
| 381 | timekeeper.wall_to_monotonic = | 423 | timekeeper.wall_to_monotonic = |
| 382 | timespec_sub(timekeeper.wall_to_monotonic, ts_delta); | 424 | timespec_sub(timekeeper.wall_to_monotonic, ts_delta); |
| 383 | 425 | ||
| 384 | timekeeper.xtime = *tv; | 426 | tk_set_xtime(&timekeeper, tv); |
| 385 | timekeeping_update(true); | 427 | |
| 428 | timekeeping_update(&timekeeper, true); | ||
| 386 | 429 | ||
| 387 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 430 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
| 388 | 431 | ||
| @@ -409,13 +452,14 @@ int timekeeping_inject_offset(struct timespec *ts) | |||
| 409 | 452 | ||
| 410 | write_seqlock_irqsave(&timekeeper.lock, flags); | 453 | write_seqlock_irqsave(&timekeeper.lock, flags); |
| 411 | 454 | ||
| 412 | timekeeping_forward_now(); | 455 | timekeeping_forward_now(&timekeeper); |
| 413 | 456 | ||
| 414 | timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); | 457 | |
| 458 | tk_xtime_add(&timekeeper, ts); | ||
| 415 | timekeeper.wall_to_monotonic = | 459 | timekeeper.wall_to_monotonic = |
| 416 | timespec_sub(timekeeper.wall_to_monotonic, *ts); | 460 | timespec_sub(timekeeper.wall_to_monotonic, *ts); |
| 417 | 461 | ||
| 418 | timekeeping_update(true); | 462 | timekeeping_update(&timekeeper, true); |
| 419 | 463 | ||
| 420 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 464 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
| 421 | 465 | ||
| @@ -440,14 +484,14 @@ static int change_clocksource(void *data) | |||
| 440 | 484 | ||
| 441 | write_seqlock_irqsave(&timekeeper.lock, flags); | 485 | write_seqlock_irqsave(&timekeeper.lock, flags); |
| 442 | 486 | ||
| 443 | timekeeping_forward_now(); | 487 | timekeeping_forward_now(&timekeeper); |
| 444 | if (!new->enable || new->enable(new) == 0) { | 488 | if (!new->enable || new->enable(new) == 0) { |
| 445 | old = timekeeper.clock; | 489 | old = timekeeper.clock; |
| 446 | timekeeper_setup_internals(new); | 490 | tk_setup_internals(&timekeeper, new); |
| 447 | if (old->disable) | 491 | if (old->disable) |
| 448 | old->disable(old); | 492 | old->disable(old); |
| 449 | } | 493 | } |
| 450 | timekeeping_update(true); | 494 | timekeeping_update(&timekeeper, true); |
| 451 | 495 | ||
| 452 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 496 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
| 453 | 497 | ||
| @@ -497,7 +541,7 @@ void getrawmonotonic(struct timespec *ts) | |||
| 497 | 541 | ||
| 498 | do { | 542 | do { |
| 499 | seq = read_seqbegin(&timekeeper.lock); | 543 | seq = read_seqbegin(&timekeeper.lock); |
| 500 | nsecs = timekeeping_get_ns_raw(); | 544 | nsecs = timekeeping_get_ns_raw(&timekeeper); |
| 501 | *ts = timekeeper.raw_time; | 545 | *ts = timekeeper.raw_time; |
| 502 | 546 | ||
| 503 | } while (read_seqretry(&timekeeper.lock, seq)); | 547 | } while (read_seqretry(&timekeeper.lock, seq)); |
| @@ -532,6 +576,7 @@ u64 timekeeping_max_deferment(void) | |||
| 532 | { | 576 | { |
| 533 | unsigned long seq; | 577 | unsigned long seq; |
| 534 | u64 ret; | 578 | u64 ret; |
| 579 | |||
| 535 | do { | 580 | do { |
| 536 | seq = read_seqbegin(&timekeeper.lock); | 581 | seq = read_seqbegin(&timekeeper.lock); |
| 537 | 582 | ||
| @@ -592,18 +637,17 @@ void __init timekeeping_init(void) | |||
| 592 | clock = clocksource_default_clock(); | 637 | clock = clocksource_default_clock(); |
| 593 | if (clock->enable) | 638 | if (clock->enable) |
| 594 | clock->enable(clock); | 639 | clock->enable(clock); |
| 595 | timekeeper_setup_internals(clock); | 640 | tk_setup_internals(&timekeeper, clock); |
| 596 | 641 | ||
| 597 | timekeeper.xtime.tv_sec = now.tv_sec; | 642 | tk_set_xtime(&timekeeper, &now); |
| 598 | timekeeper.xtime.tv_nsec = now.tv_nsec; | ||
| 599 | timekeeper.raw_time.tv_sec = 0; | 643 | timekeeper.raw_time.tv_sec = 0; |
| 600 | timekeeper.raw_time.tv_nsec = 0; | 644 | timekeeper.raw_time.tv_nsec = 0; |
| 601 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) { | 645 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) |
| 602 | boot.tv_sec = timekeeper.xtime.tv_sec; | 646 | boot = tk_xtime(&timekeeper); |
| 603 | boot.tv_nsec = timekeeper.xtime.tv_nsec; | 647 | |
| 604 | } | ||
| 605 | set_normalized_timespec(&timekeeper.wall_to_monotonic, | 648 | set_normalized_timespec(&timekeeper.wall_to_monotonic, |
| 606 | -boot.tv_sec, -boot.tv_nsec); | 649 | -boot.tv_sec, -boot.tv_nsec); |
| 650 | update_rt_offset(&timekeeper); | ||
| 607 | timekeeper.total_sleep_time.tv_sec = 0; | 651 | timekeeper.total_sleep_time.tv_sec = 0; |
| 608 | timekeeper.total_sleep_time.tv_nsec = 0; | 652 | timekeeper.total_sleep_time.tv_nsec = 0; |
| 609 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 653 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
| @@ -612,6 +656,12 @@ void __init timekeeping_init(void) | |||
| 612 | /* time in seconds when suspend began */ | 656 | /* time in seconds when suspend began */ |
| 613 | static struct timespec timekeeping_suspend_time; | 657 | static struct timespec timekeeping_suspend_time; |
| 614 | 658 | ||
| 659 | static void update_sleep_time(struct timespec t) | ||
| 660 | { | ||
| 661 | timekeeper.total_sleep_time = t; | ||
| 662 | timekeeper.offs_boot = timespec_to_ktime(t); | ||
| 663 | } | ||
| 664 | |||
| 615 | /** | 665 | /** |
| 616 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval | 666 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval |
| 617 | * @delta: pointer to a timespec delta value | 667 | * @delta: pointer to a timespec delta value |
| @@ -619,7 +669,8 @@ static struct timespec timekeeping_suspend_time; | |||
| 619 | * Takes a timespec offset measuring a suspend interval and properly | 669 | * Takes a timespec offset measuring a suspend interval and properly |
| 620 | * adds the sleep offset to the timekeeping variables. | 670 | * adds the sleep offset to the timekeeping variables. |
| 621 | */ | 671 | */ |
| 622 | static void __timekeeping_inject_sleeptime(struct timespec *delta) | 672 | static void __timekeeping_inject_sleeptime(struct timekeeper *tk, |
| 673 | struct timespec *delta) | ||
| 623 | { | 674 | { |
| 624 | if (!timespec_valid(delta)) { | 675 | if (!timespec_valid(delta)) { |
| 625 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " | 676 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " |
| @@ -627,11 +678,9 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) | |||
| 627 | return; | 678 | return; |
| 628 | } | 679 | } |
| 629 | 680 | ||
| 630 | timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); | 681 | tk_xtime_add(tk, delta); |
| 631 | timekeeper.wall_to_monotonic = | 682 | tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); |
| 632 | timespec_sub(timekeeper.wall_to_monotonic, *delta); | 683 | update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); |
| 633 | timekeeper.total_sleep_time = timespec_add( | ||
| 634 | timekeeper.total_sleep_time, *delta); | ||
| 635 | } | 684 | } |
| 636 | 685 | ||
| 637 | 686 | ||
| @@ -657,11 +706,11 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
| 657 | 706 | ||
| 658 | write_seqlock_irqsave(&timekeeper.lock, flags); | 707 | write_seqlock_irqsave(&timekeeper.lock, flags); |
| 659 | 708 | ||
| 660 | timekeeping_forward_now(); | 709 | timekeeping_forward_now(&timekeeper); |
| 661 | 710 | ||
| 662 | __timekeeping_inject_sleeptime(delta); | 711 | __timekeeping_inject_sleeptime(&timekeeper, delta); |
| 663 | 712 | ||
| 664 | timekeeping_update(true); | 713 | timekeeping_update(&timekeeper, true); |
| 665 | 714 | ||
| 666 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 715 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
| 667 | 716 | ||
| @@ -690,12 +739,13 @@ static void timekeeping_resume(void) | |||
| 690 | 739 | ||
| 691 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 740 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { |
| 692 | ts = timespec_sub(ts, timekeeping_suspend_time); | 741 | ts = timespec_sub(ts, timekeeping_suspend_time); |
| 693 | __timekeeping_inject_sleeptime(&ts); | 742 | __timekeeping_inject_sleeptime(&timekeeper, &ts); |
| 694 | } | 743 | } |
| 695 | /* re-base the last cycle value */ | 744 | /* re-base the last cycle value */ |
| 696 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 745 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
| 697 | timekeeper.ntp_error = 0; | 746 | timekeeper.ntp_error = 0; |
| 698 | timekeeping_suspended = 0; | 747 | timekeeping_suspended = 0; |
| 748 | timekeeping_update(&timekeeper, false); | ||
| 699 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 749 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
| 700 | 750 | ||
| 701 | touch_softlockup_watchdog(); | 751 | touch_softlockup_watchdog(); |
| @@ -715,7 +765,7 @@ static int timekeeping_suspend(void) | |||
| 715 | read_persistent_clock(&timekeeping_suspend_time); | 765 | read_persistent_clock(&timekeeping_suspend_time); |
| 716 | 766 | ||
| 717 | write_seqlock_irqsave(&timekeeper.lock, flags); | 767 | write_seqlock_irqsave(&timekeeper.lock, flags); |
| 718 | timekeeping_forward_now(); | 768 | timekeeping_forward_now(&timekeeper); |
| 719 | timekeeping_suspended = 1; | 769 | timekeeping_suspended = 1; |
| 720 | 770 | ||
| 721 | /* | 771 | /* |
| @@ -724,7 +774,7 @@ static int timekeeping_suspend(void) | |||
| 724 | * try to compensate so the difference in system time | 774 | * try to compensate so the difference in system time |
| 725 | * and persistent_clock time stays close to constant. | 775 | * and persistent_clock time stays close to constant. |
| 726 | */ | 776 | */ |
| 727 | delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time); | 777 | delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); |
| 728 | delta_delta = timespec_sub(delta, old_delta); | 778 | delta_delta = timespec_sub(delta, old_delta); |
| 729 | if (abs(delta_delta.tv_sec) >= 2) { | 779 | if (abs(delta_delta.tv_sec) >= 2) { |
| 730 | /* | 780 | /* |
| @@ -763,7 +813,8 @@ device_initcall(timekeeping_init_ops); | |||
| 763 | * If the error is already larger, we look ahead even further | 813 | * If the error is already larger, we look ahead even further |
| 764 | * to compensate for late or lost adjustments. | 814 | * to compensate for late or lost adjustments. |
| 765 | */ | 815 | */ |
| 766 | static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, | 816 | static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, |
| 817 | s64 error, s64 *interval, | ||
| 767 | s64 *offset) | 818 | s64 *offset) |
| 768 | { | 819 | { |
| 769 | s64 tick_error, i; | 820 | s64 tick_error, i; |
| @@ -779,7 +830,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, | |||
| 779 | * here. This is tuned so that an error of about 1 msec is adjusted | 830 | * here. This is tuned so that an error of about 1 msec is adjusted |
| 780 | * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). | 831 | * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). |
| 781 | */ | 832 | */ |
| 782 | error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); | 833 | error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); |
| 783 | error2 = abs(error2); | 834 | error2 = abs(error2); |
| 784 | for (look_ahead = 0; error2 > 0; look_ahead++) | 835 | for (look_ahead = 0; error2 > 0; look_ahead++) |
| 785 | error2 >>= 2; | 836 | error2 >>= 2; |
| @@ -788,8 +839,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, | |||
| 788 | * Now calculate the error in (1 << look_ahead) ticks, but first | 839 | * Now calculate the error in (1 << look_ahead) ticks, but first |
| 789 | * remove the single look ahead already included in the error. | 840 | * remove the single look ahead already included in the error. |
| 790 | */ | 841 | */ |
| 791 | tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1); | 842 | tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); |
| 792 | tick_error -= timekeeper.xtime_interval >> 1; | 843 | tick_error -= tk->xtime_interval >> 1; |
| 793 | error = ((error - tick_error) >> look_ahead) + tick_error; | 844 | error = ((error - tick_error) >> look_ahead) + tick_error; |
| 794 | 845 | ||
| 795 | /* Finally calculate the adjustment shift value. */ | 846 | /* Finally calculate the adjustment shift value. */ |
| @@ -814,9 +865,9 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, | |||
| 814 | * this is optimized for the most common adjustments of -1,0,1, | 865 | * this is optimized for the most common adjustments of -1,0,1, |
| 815 | * for other values we can do a bit more work. | 866 | * for other values we can do a bit more work. |
| 816 | */ | 867 | */ |
| 817 | static void timekeeping_adjust(s64 offset) | 868 | static void timekeeping_adjust(struct timekeeper *tk, s64 offset) |
| 818 | { | 869 | { |
| 819 | s64 error, interval = timekeeper.cycle_interval; | 870 | s64 error, interval = tk->cycle_interval; |
| 820 | int adj; | 871 | int adj; |
| 821 | 872 | ||
| 822 | /* | 873 | /* |
| @@ -832,7 +883,7 @@ static void timekeeping_adjust(s64 offset) | |||
| 832 | * | 883 | * |
| 833 | * Note: It does not "save" on aggravation when reading the code. | 884 | * Note: It does not "save" on aggravation when reading the code. |
| 834 | */ | 885 | */ |
| 835 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); | 886 | error = tk->ntp_error >> (tk->ntp_error_shift - 1); |
| 836 | if (error > interval) { | 887 | if (error > interval) { |
| 837 | /* | 888 | /* |
| 838 | * We now divide error by 4(via shift), which checks if | 889 | * We now divide error by 4(via shift), which checks if |
| @@ -854,7 +905,8 @@ static void timekeeping_adjust(s64 offset) | |||
| 854 | if (likely(error <= interval)) | 905 | if (likely(error <= interval)) |
| 855 | adj = 1; | 906 | adj = 1; |
| 856 | else | 907 | else |
| 857 | adj = timekeeping_bigadjust(error, &interval, &offset); | 908 | adj = timekeeping_bigadjust(tk, error, &interval, |
| 909 | &offset); | ||
| 858 | } else if (error < -interval) { | 910 | } else if (error < -interval) { |
| 859 | /* See comment above, this is just switched for the negative */ | 911 | /* See comment above, this is just switched for the negative */ |
| 860 | error >>= 2; | 912 | error >>= 2; |
| @@ -863,18 +915,17 @@ static void timekeeping_adjust(s64 offset) | |||
| 863 | interval = -interval; | 915 | interval = -interval; |
| 864 | offset = -offset; | 916 | offset = -offset; |
| 865 | } else | 917 | } else |
| 866 | adj = timekeeping_bigadjust(error, &interval, &offset); | 918 | adj = timekeeping_bigadjust(tk, error, &interval, |
| 867 | } else /* No adjustment needed */ | 919 | &offset); |
| 920 | } else | ||
| 868 | return; | 921 | return; |
| 869 | 922 | ||
| 870 | if (unlikely(timekeeper.clock->maxadj && | 923 | if (unlikely(tk->clock->maxadj && |
| 871 | (timekeeper.mult + adj > | 924 | (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { |
| 872 | timekeeper.clock->mult + timekeeper.clock->maxadj))) { | ||
| 873 | printk_once(KERN_WARNING | 925 | printk_once(KERN_WARNING |
| 874 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 926 | "Adjusting %s more than 11%% (%ld vs %ld)\n", |
| 875 | timekeeper.clock->name, (long)timekeeper.mult + adj, | 927 | tk->clock->name, (long)tk->mult + adj, |
| 876 | (long)timekeeper.clock->mult + | 928 | (long)tk->clock->mult + tk->clock->maxadj); |
| 877 | timekeeper.clock->maxadj); | ||
| 878 | } | 929 | } |
| 879 | /* | 930 | /* |
| 880 | * So the following can be confusing. | 931 | * So the following can be confusing. |
| @@ -925,11 +976,60 @@ static void timekeeping_adjust(s64 offset) | |||
| 925 | * | 976 | * |
| 926 | * XXX - TODO: Doc ntp_error calculation. | 977 | * XXX - TODO: Doc ntp_error calculation. |
| 927 | */ | 978 | */ |
| 928 | timekeeper.mult += adj; | 979 | tk->mult += adj; |
| 929 | timekeeper.xtime_interval += interval; | 980 | tk->xtime_interval += interval; |
| 930 | timekeeper.xtime_nsec -= offset; | 981 | tk->xtime_nsec -= offset; |
| 931 | timekeeper.ntp_error -= (interval - offset) << | 982 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; |
| 932 | timekeeper.ntp_error_shift; | 983 | |
| 984 | /* | ||
| 985 | * It may be possible that when we entered this function, xtime_nsec | ||
| 986 | * was very small. Further, if we're slightly speeding the clocksource | ||
| 987 | * in the code above, it's possible the required corrective factor to | ||
| 988 | * xtime_nsec could cause it to underflow. | ||
| 989 | * | ||
| 990 | * Now, since we already accumulated the second, we cannot simply roll | ||
| 991 | * the accumulated second back, since the NTP subsystem has been | ||
| 992 | * notified via second_overflow. So instead we push xtime_nsec forward | ||
| 993 | * by the amount we underflowed, and add that amount into the error. | ||
| 994 | * | ||
| 995 | * We'll correct this error next time through this function, when | ||
| 996 | * xtime_nsec is not as small. | ||
| 997 | */ | ||
| 998 | if (unlikely((s64)tk->xtime_nsec < 0)) { | ||
| 999 | s64 neg = -(s64)tk->xtime_nsec; | ||
| 1000 | tk->xtime_nsec = 0; | ||
| 1001 | tk->ntp_error += neg << tk->ntp_error_shift; | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | } | ||
| 1005 | |||
| 1006 | |||
| 1007 | /** | ||
| 1008 | * accumulate_nsecs_to_secs - Accumulates nsecs into secs | ||
| 1009 | * | ||
| 1010 | * Helper function that accumulates the nsecs greater than a second | ||
| 1011 | * from the xtime_nsec field to the xtime_sec field. | ||
| 1012 | * It also calls into the NTP code to handle leapsecond processing. | ||
| 1013 | * | ||
| 1014 | */ | ||
| 1015 | static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) | ||
| 1016 | { | ||
| 1017 | u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; | ||
| 1018 | |||
| 1019 | while (tk->xtime_nsec >= nsecps) { | ||
| 1020 | int leap; | ||
| 1021 | |||
| 1022 | tk->xtime_nsec -= nsecps; | ||
| 1023 | tk->xtime_sec++; | ||
| 1024 | |||
| 1025 | /* Figure out if it's a leap sec and apply if needed */ | ||
| 1026 | leap = second_overflow(tk->xtime_sec); | ||
| 1027 | tk->xtime_sec += leap; | ||
| 1028 | tk->wall_to_monotonic.tv_sec -= leap; | ||
| 1029 | if (leap) | ||
| 1030 | clock_was_set_delayed(); | ||
| 1031 | |||
| 1032 | } | ||
| 933 | } | 1033 | } |
| 934 | 1034 | ||
| 935 | 1035 | ||
| @@ -942,44 +1042,36 @@ static void timekeeping_adjust(s64 offset) | |||
| 942 | * | 1042 | * |
| 943 | * Returns the unconsumed cycles. | 1043 | * Returns the unconsumed cycles. |
| 944 | */ | 1044 | */ |
| 945 | static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | 1045 | static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, |
| 1046 | u32 shift) | ||
| 946 | { | 1047 | { |
| 947 | u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; | ||
| 948 | u64 raw_nsecs; | 1048 | u64 raw_nsecs; |
| 949 | 1049 | ||
| 950 | /* If the offset is smaller than a shifted interval, do nothing */ | 1050 | /* If the offset is smaller than a shifted interval, do nothing */ |
| 951 | if (offset < timekeeper.cycle_interval<<shift) | 1051 | if (offset < tk->cycle_interval<<shift) |
| 952 | return offset; | 1052 | return offset; |
| 953 | 1053 | ||
| 954 | /* Accumulate one shifted interval */ | 1054 | /* Accumulate one shifted interval */ |
| 955 | offset -= timekeeper.cycle_interval << shift; | 1055 | offset -= tk->cycle_interval << shift; |
| 956 | timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; | 1056 | tk->clock->cycle_last += tk->cycle_interval << shift; |
| 957 | 1057 | ||
| 958 | timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; | 1058 | tk->xtime_nsec += tk->xtime_interval << shift; |
| 959 | while (timekeeper.xtime_nsec >= nsecps) { | 1059 | accumulate_nsecs_to_secs(tk); |
| 960 | int leap; | ||
| 961 | timekeeper.xtime_nsec -= nsecps; | ||
| 962 | timekeeper.xtime.tv_sec++; | ||
| 963 | leap = second_overflow(timekeeper.xtime.tv_sec); | ||
| 964 | timekeeper.xtime.tv_sec += leap; | ||
| 965 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
| 966 | } | ||
| 967 | 1060 | ||
| 968 | /* Accumulate raw time */ | 1061 | /* Accumulate raw time */ |
| 969 | raw_nsecs = timekeeper.raw_interval << shift; | 1062 | raw_nsecs = tk->raw_interval << shift; |
| 970 | raw_nsecs += timekeeper.raw_time.tv_nsec; | 1063 | raw_nsecs += tk->raw_time.tv_nsec; |
| 971 | if (raw_nsecs >= NSEC_PER_SEC) { | 1064 | if (raw_nsecs >= NSEC_PER_SEC) { |
| 972 | u64 raw_secs = raw_nsecs; | 1065 | u64 raw_secs = raw_nsecs; |
| 973 | raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); | 1066 | raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); |
| 974 | timekeeper.raw_time.tv_sec += raw_secs; | 1067 | tk->raw_time.tv_sec += raw_secs; |
| 975 | } | 1068 | } |
| 976 | timekeeper.raw_time.tv_nsec = raw_nsecs; | 1069 | tk->raw_time.tv_nsec = raw_nsecs; |
| 977 | 1070 | ||
| 978 | /* Accumulate error between NTP and clock interval */ | 1071 | /* Accumulate error between NTP and clock interval */ |
| 979 | timekeeper.ntp_error += ntp_tick_length() << shift; | 1072 | tk->ntp_error += ntp_tick_length() << shift; |
| 980 | timekeeper.ntp_error -= | 1073 | tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << |
| 981 | (timekeeper.xtime_interval + timekeeper.xtime_remainder) << | 1074 | (tk->ntp_error_shift + shift); |
| 982 | (timekeeper.ntp_error_shift + shift); | ||
| 983 | 1075 | ||
| 984 | return offset; | 1076 | return offset; |
| 985 | } | 1077 | } |
| @@ -995,6 +1087,7 @@ static void update_wall_time(void) | |||
| 995 | cycle_t offset; | 1087 | cycle_t offset; |
| 996 | int shift = 0, maxshift; | 1088 | int shift = 0, maxshift; |
| 997 | unsigned long flags; | 1089 | unsigned long flags; |
| 1090 | s64 remainder; | ||
| 998 | 1091 | ||
| 999 | write_seqlock_irqsave(&timekeeper.lock, flags); | 1092 | write_seqlock_irqsave(&timekeeper.lock, flags); |
| 1000 | 1093 | ||
| @@ -1009,8 +1102,6 @@ static void update_wall_time(void) | |||
| 1009 | #else | 1102 | #else |
| 1010 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | 1103 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; |
| 1011 | #endif | 1104 | #endif |
| 1012 | timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec << | ||
| 1013 | timekeeper.shift; | ||
| 1014 | 1105 | ||
| 1015 | /* | 1106 | /* |
| 1016 | * With NO_HZ we may have to accumulate many cycle_intervals | 1107 | * With NO_HZ we may have to accumulate many cycle_intervals |
| @@ -1026,62 +1117,36 @@ static void update_wall_time(void) | |||
| 1026 | maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; | 1117 | maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; |
| 1027 | shift = min(shift, maxshift); | 1118 | shift = min(shift, maxshift); |
| 1028 | while (offset >= timekeeper.cycle_interval) { | 1119 | while (offset >= timekeeper.cycle_interval) { |
| 1029 | offset = logarithmic_accumulation(offset, shift); | 1120 | offset = logarithmic_accumulation(&timekeeper, offset, shift); |
| 1030 | if(offset < timekeeper.cycle_interval<<shift) | 1121 | if(offset < timekeeper.cycle_interval<<shift) |
| 1031 | shift--; | 1122 | shift--; |
| 1032 | } | 1123 | } |
| 1033 | 1124 | ||
| 1034 | /* correct the clock when NTP error is too big */ | 1125 | /* correct the clock when NTP error is too big */ |
| 1035 | timekeeping_adjust(offset); | 1126 | timekeeping_adjust(&timekeeper, offset); |
| 1036 | |||
| 1037 | /* | ||
| 1038 | * Since in the loop above, we accumulate any amount of time | ||
| 1039 | * in xtime_nsec over a second into xtime.tv_sec, its possible for | ||
| 1040 | * xtime_nsec to be fairly small after the loop. Further, if we're | ||
| 1041 | * slightly speeding the clocksource up in timekeeping_adjust(), | ||
| 1042 | * its possible the required corrective factor to xtime_nsec could | ||
| 1043 | * cause it to underflow. | ||
| 1044 | * | ||
| 1045 | * Now, we cannot simply roll the accumulated second back, since | ||
| 1046 | * the NTP subsystem has been notified via second_overflow. So | ||
| 1047 | * instead we push xtime_nsec forward by the amount we underflowed, | ||
| 1048 | * and add that amount into the error. | ||
| 1049 | * | ||
| 1050 | * We'll correct this error next time through this function, when | ||
| 1051 | * xtime_nsec is not as small. | ||
| 1052 | */ | ||
| 1053 | if (unlikely((s64)timekeeper.xtime_nsec < 0)) { | ||
| 1054 | s64 neg = -(s64)timekeeper.xtime_nsec; | ||
| 1055 | timekeeper.xtime_nsec = 0; | ||
| 1056 | timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; | ||
| 1057 | } | ||
| 1058 | 1127 | ||
| 1059 | 1128 | ||
| 1060 | /* | 1129 | /* |
| 1061 | * Store full nanoseconds into xtime after rounding it up and | 1130 | * Store only full nanoseconds into xtime_nsec after rounding |
| 1062 | * add the remainder to the error difference. | 1131 | * it up and add the remainder to the error difference. |
| 1063 | */ | 1132 | * XXX - This is necessary to avoid small 1ns inconsistencies caused |
| 1064 | timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> | 1133 | * by truncating the remainder in vsyscalls. However, it causes |
| 1065 | timekeeper.shift) + 1; | 1134 | * additional work to be done in timekeeping_adjust(). Once |
| 1066 | timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << | 1135 | * the vsyscall implementations are converted to use xtime_nsec |
| 1067 | timekeeper.shift; | 1136 | * (shifted nanoseconds), this can be killed. |
| 1068 | timekeeper.ntp_error += timekeeper.xtime_nsec << | 1137 | */ |
| 1069 | timekeeper.ntp_error_shift; | 1138 | remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1); |
| 1139 | timekeeper.xtime_nsec -= remainder; | ||
| 1140 | timekeeper.xtime_nsec += 1 << timekeeper.shift; | ||
| 1141 | timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift; | ||
| 1070 | 1142 | ||
| 1071 | /* | 1143 | /* |
| 1072 | * Finally, make sure that after the rounding | 1144 | * Finally, make sure that after the rounding |
| 1073 | * xtime.tv_nsec isn't larger than NSEC_PER_SEC | 1145 | * xtime_nsec isn't larger than NSEC_PER_SEC |
| 1074 | */ | 1146 | */ |
| 1075 | if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { | 1147 | accumulate_nsecs_to_secs(&timekeeper); |
| 1076 | int leap; | ||
| 1077 | timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; | ||
| 1078 | timekeeper.xtime.tv_sec++; | ||
| 1079 | leap = second_overflow(timekeeper.xtime.tv_sec); | ||
| 1080 | timekeeper.xtime.tv_sec += leap; | ||
| 1081 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
| 1082 | } | ||
| 1083 | 1148 | ||
| 1084 | timekeeping_update(false); | 1149 | timekeeping_update(&timekeeper, false); |
| 1085 | 1150 | ||
| 1086 | out: | 1151 | out: |
| 1087 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 1152 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
| @@ -1126,21 +1191,20 @@ void get_monotonic_boottime(struct timespec *ts) | |||
| 1126 | { | 1191 | { |
| 1127 | struct timespec tomono, sleep; | 1192 | struct timespec tomono, sleep; |
| 1128 | unsigned int seq; | 1193 | unsigned int seq; |
| 1129 | s64 nsecs; | ||
| 1130 | 1194 | ||
| 1131 | WARN_ON(timekeeping_suspended); | 1195 | WARN_ON(timekeeping_suspended); |
| 1132 | 1196 | ||
| 1133 | do { | 1197 | do { |
| 1134 | seq = read_seqbegin(&timekeeper.lock); | 1198 | seq = read_seqbegin(&timekeeper.lock); |
| 1135 | *ts = timekeeper.xtime; | 1199 | ts->tv_sec = timekeeper.xtime_sec; |
| 1200 | ts->tv_nsec = timekeeping_get_ns(&timekeeper); | ||
| 1136 | tomono = timekeeper.wall_to_monotonic; | 1201 | tomono = timekeeper.wall_to_monotonic; |
| 1137 | sleep = timekeeper.total_sleep_time; | 1202 | sleep = timekeeper.total_sleep_time; |
| 1138 | nsecs = timekeeping_get_ns(); | ||
| 1139 | 1203 | ||
| 1140 | } while (read_seqretry(&timekeeper.lock, seq)); | 1204 | } while (read_seqretry(&timekeeper.lock, seq)); |
| 1141 | 1205 | ||
| 1142 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, | 1206 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, |
| 1143 | ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); | 1207 | ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); |
| 1144 | } | 1208 | } |
| 1145 | EXPORT_SYMBOL_GPL(get_monotonic_boottime); | 1209 | EXPORT_SYMBOL_GPL(get_monotonic_boottime); |
| 1146 | 1210 | ||
| @@ -1173,13 +1237,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased); | |||
| 1173 | 1237 | ||
| 1174 | unsigned long get_seconds(void) | 1238 | unsigned long get_seconds(void) |
| 1175 | { | 1239 | { |
| 1176 | return timekeeper.xtime.tv_sec; | 1240 | return timekeeper.xtime_sec; |
| 1177 | } | 1241 | } |
| 1178 | EXPORT_SYMBOL(get_seconds); | 1242 | EXPORT_SYMBOL(get_seconds); |
| 1179 | 1243 | ||
| 1180 | struct timespec __current_kernel_time(void) | 1244 | struct timespec __current_kernel_time(void) |
| 1181 | { | 1245 | { |
| 1182 | return timekeeper.xtime; | 1246 | return tk_xtime(&timekeeper); |
| 1183 | } | 1247 | } |
| 1184 | 1248 | ||
| 1185 | struct timespec current_kernel_time(void) | 1249 | struct timespec current_kernel_time(void) |
| @@ -1190,7 +1254,7 @@ struct timespec current_kernel_time(void) | |||
| 1190 | do { | 1254 | do { |
| 1191 | seq = read_seqbegin(&timekeeper.lock); | 1255 | seq = read_seqbegin(&timekeeper.lock); |
| 1192 | 1256 | ||
| 1193 | now = timekeeper.xtime; | 1257 | now = tk_xtime(&timekeeper); |
| 1194 | } while (read_seqretry(&timekeeper.lock, seq)); | 1258 | } while (read_seqretry(&timekeeper.lock, seq)); |
| 1195 | 1259 | ||
| 1196 | return now; | 1260 | return now; |
| @@ -1205,7 +1269,7 @@ struct timespec get_monotonic_coarse(void) | |||
| 1205 | do { | 1269 | do { |
| 1206 | seq = read_seqbegin(&timekeeper.lock); | 1270 | seq = read_seqbegin(&timekeeper.lock); |
| 1207 | 1271 | ||
| 1208 | now = timekeeper.xtime; | 1272 | now = tk_xtime(&timekeeper); |
| 1209 | mono = timekeeper.wall_to_monotonic; | 1273 | mono = timekeeper.wall_to_monotonic; |
| 1210 | } while (read_seqretry(&timekeeper.lock, seq)); | 1274 | } while (read_seqretry(&timekeeper.lock, seq)); |
| 1211 | 1275 | ||
| @@ -1240,12 +1304,43 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
| 1240 | 1304 | ||
| 1241 | do { | 1305 | do { |
| 1242 | seq = read_seqbegin(&timekeeper.lock); | 1306 | seq = read_seqbegin(&timekeeper.lock); |
| 1243 | *xtim = timekeeper.xtime; | 1307 | *xtim = tk_xtime(&timekeeper); |
| 1244 | *wtom = timekeeper.wall_to_monotonic; | 1308 | *wtom = timekeeper.wall_to_monotonic; |
| 1245 | *sleep = timekeeper.total_sleep_time; | 1309 | *sleep = timekeeper.total_sleep_time; |
| 1246 | } while (read_seqretry(&timekeeper.lock, seq)); | 1310 | } while (read_seqretry(&timekeeper.lock, seq)); |
| 1247 | } | 1311 | } |
| 1248 | 1312 | ||
| 1313 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
| 1314 | /** | ||
| 1315 | * ktime_get_update_offsets - hrtimer helper | ||
| 1316 | * @offs_real: pointer to storage for monotonic -> realtime offset | ||
| 1317 | * @offs_boot: pointer to storage for monotonic -> boottime offset | ||
| 1318 | * | ||
| 1319 | * Returns current monotonic time and updates the offsets | ||
| 1320 | * Called from hrtimer_interrupt() or retrigger_next_event() | ||
| 1321 | */ | ||
| 1322 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) | ||
| 1323 | { | ||
| 1324 | ktime_t now; | ||
| 1325 | unsigned int seq; | ||
| 1326 | u64 secs, nsecs; | ||
| 1327 | |||
| 1328 | do { | ||
| 1329 | seq = read_seqbegin(&timekeeper.lock); | ||
| 1330 | |||
| 1331 | secs = timekeeper.xtime_sec; | ||
| 1332 | nsecs = timekeeping_get_ns(&timekeeper); | ||
| 1333 | |||
| 1334 | *offs_real = timekeeper.offs_real; | ||
| 1335 | *offs_boot = timekeeper.offs_boot; | ||
| 1336 | } while (read_seqretry(&timekeeper.lock, seq)); | ||
| 1337 | |||
| 1338 | now = ktime_add_ns(ktime_set(secs, 0), nsecs); | ||
| 1339 | now = ktime_sub(now, *offs_real); | ||
| 1340 | return now; | ||
| 1341 | } | ||
| 1342 | #endif | ||
| 1343 | |||
| 1249 | /** | 1344 | /** |
| 1250 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format | 1345 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format |
| 1251 | */ | 1346 | */ |
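The timekeeping.c hunks above do two related things: every helper now takes an explicit `struct timekeeper *tk` instead of touching the global `timekeeper` directly, and the wall clock is split from a `struct timespec xtime` into `xtime_sec` plus a shifted `xtime_nsec`, with the second rollover (and the leap-second call into `second_overflow()`) factored out into `accumulate_nsecs_to_secs()`. The following is a minimal userspace sketch of that shifted-nanosecond bookkeeping only; the struct and function names are simplified stand-ins, and the NTP-error, leap-second and locking details of the real code are omitted.

```c
/*
 * Minimal userspace sketch of the shifted-nanosecond bookkeeping that the
 * timekeeping.c hunks above move into accumulate_nsecs_to_secs().  The real
 * timekeeper carries many more fields (ntp_error, wall_to_monotonic, ...);
 * this only shows how xtime_nsec is kept left-shifted by 'shift' and rolled
 * over into xtime_sec.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct tk_sketch {
	uint64_t xtime_sec;	/* whole seconds */
	uint64_t xtime_nsec;	/* nanoseconds << shift */
	uint32_t shift;		/* clocksource shift factor */
};

/* Roll any whole seconds held in xtime_nsec over into xtime_sec. */
static void accumulate_nsecs_to_secs(struct tk_sketch *tk)
{
	uint64_t nsecps = NSEC_PER_SEC << tk->shift;

	while (tk->xtime_nsec >= nsecps) {
		tk->xtime_nsec -= nsecps;
		tk->xtime_sec++;
		/* the kernel calls second_overflow() here for leap seconds */
	}
}

/* Advance the sketch by delta_ns nanoseconds, mimicking one accumulation step. */
static void advance(struct tk_sketch *tk, uint64_t delta_ns)
{
	tk->xtime_nsec += delta_ns << tk->shift;
	accumulate_nsecs_to_secs(tk);
}

int main(void)
{
	struct tk_sketch tk = { .xtime_sec = 0, .xtime_nsec = 0, .shift = 10 };

	advance(&tk, 999999999);	/* just under one second */
	advance(&tk, 2);		/* pushes us over the second boundary */
	printf("sec=%llu nsec=%llu\n",
	       (unsigned long long)tk.xtime_sec,
	       (unsigned long long)(tk.xtime_nsec >> tk.shift));
	return 0;
}
```

Keeping the nanoseconds left-shifted by the clocksource shift is what lets update_wall_time() round to whole nanoseconds only at the very end, pushing the sub-nanosecond remainder into ntp_error instead of truncating it.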
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3258455549f4..af5a7e9f164b 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
| @@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) | |||
| 167 | { | 167 | { |
| 168 | struct tick_sched *ts = tick_get_tick_sched(cpu); | 168 | struct tick_sched *ts = tick_get_tick_sched(cpu); |
| 169 | P(nohz_mode); | 169 | P(nohz_mode); |
| 170 | P_ns(idle_tick); | 170 | P_ns(last_tick); |
| 171 | P(tick_stopped); | 171 | P(tick_stopped); |
| 172 | P(idle_jiffies); | 172 | P(idle_jiffies); |
| 173 | P(idle_calls); | 173 | P(idle_calls); |
| @@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v) | |||
| 259 | u64 now = ktime_to_ns(ktime_get()); | 259 | u64 now = ktime_to_ns(ktime_get()); |
| 260 | int cpu; | 260 | int cpu; |
| 261 | 261 | ||
| 262 | SEQ_printf(m, "Timer List Version: v0.6\n"); | 262 | SEQ_printf(m, "Timer List Version: v0.7\n"); |
| 263 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); | 263 | SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); |
| 264 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); | 264 | SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); |
| 265 | 265 | ||
diff --git a/kernel/timer.c b/kernel/timer.c index 6ec7e7e0db43..a61c09374eba 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -77,6 +77,7 @@ struct tvec_base { | |||
| 77 | struct timer_list *running_timer; | 77 | struct timer_list *running_timer; |
| 78 | unsigned long timer_jiffies; | 78 | unsigned long timer_jiffies; |
| 79 | unsigned long next_timer; | 79 | unsigned long next_timer; |
| 80 | unsigned long active_timers; | ||
| 80 | struct tvec_root tv1; | 81 | struct tvec_root tv1; |
| 81 | struct tvec tv2; | 82 | struct tvec tv2; |
| 82 | struct tvec tv3; | 83 | struct tvec tv3; |
| @@ -330,7 +331,8 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) | |||
| 330 | } | 331 | } |
| 331 | EXPORT_SYMBOL_GPL(set_timer_slack); | 332 | EXPORT_SYMBOL_GPL(set_timer_slack); |
| 332 | 333 | ||
| 333 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | 334 | static void |
| 335 | __internal_add_timer(struct tvec_base *base, struct timer_list *timer) | ||
| 334 | { | 336 | { |
| 335 | unsigned long expires = timer->expires; | 337 | unsigned long expires = timer->expires; |
| 336 | unsigned long idx = expires - base->timer_jiffies; | 338 | unsigned long idx = expires - base->timer_jiffies; |
| @@ -372,6 +374,19 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | |||
| 372 | list_add_tail(&timer->entry, vec); | 374 | list_add_tail(&timer->entry, vec); |
| 373 | } | 375 | } |
| 374 | 376 | ||
| 377 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | ||
| 378 | { | ||
| 379 | __internal_add_timer(base, timer); | ||
| 380 | /* | ||
| 381 | * Update base->active_timers and base->next_timer | ||
| 382 | */ | ||
| 383 | if (!tbase_get_deferrable(timer->base)) { | ||
| 384 | if (time_before(timer->expires, base->next_timer)) | ||
| 385 | base->next_timer = timer->expires; | ||
| 386 | base->active_timers++; | ||
| 387 | } | ||
| 388 | } | ||
| 389 | |||
| 375 | #ifdef CONFIG_TIMER_STATS | 390 | #ifdef CONFIG_TIMER_STATS |
| 376 | void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) | 391 | void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) |
| 377 | { | 392 | { |
| @@ -654,8 +669,7 @@ void init_timer_deferrable_key(struct timer_list *timer, | |||
| 654 | } | 669 | } |
| 655 | EXPORT_SYMBOL(init_timer_deferrable_key); | 670 | EXPORT_SYMBOL(init_timer_deferrable_key); |
| 656 | 671 | ||
| 657 | static inline void detach_timer(struct timer_list *timer, | 672 | static inline void detach_timer(struct timer_list *timer, bool clear_pending) |
| 658 | int clear_pending) | ||
| 659 | { | 673 | { |
| 660 | struct list_head *entry = &timer->entry; | 674 | struct list_head *entry = &timer->entry; |
| 661 | 675 | ||
| @@ -667,6 +681,29 @@ static inline void detach_timer(struct timer_list *timer, | |||
| 667 | entry->prev = LIST_POISON2; | 681 | entry->prev = LIST_POISON2; |
| 668 | } | 682 | } |
| 669 | 683 | ||
| 684 | static inline void | ||
| 685 | detach_expired_timer(struct timer_list *timer, struct tvec_base *base) | ||
| 686 | { | ||
| 687 | detach_timer(timer, true); | ||
| 688 | if (!tbase_get_deferrable(timer->base)) | ||
| 689 | timer->base->active_timers--; | ||
| 690 | } | ||
| 691 | |||
| 692 | static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, | ||
| 693 | bool clear_pending) | ||
| 694 | { | ||
| 695 | if (!timer_pending(timer)) | ||
| 696 | return 0; | ||
| 697 | |||
| 698 | detach_timer(timer, clear_pending); | ||
| 699 | if (!tbase_get_deferrable(timer->base)) { | ||
| 700 | timer->base->active_timers--; | ||
| 701 | if (timer->expires == base->next_timer) | ||
| 702 | base->next_timer = base->timer_jiffies; | ||
| 703 | } | ||
| 704 | return 1; | ||
| 705 | } | ||
| 706 | |||
| 670 | /* | 707 | /* |
| 671 | * We are using hashed locking: holding per_cpu(tvec_bases).lock | 708 | * We are using hashed locking: holding per_cpu(tvec_bases).lock |
| 672 | * means that all timers which are tied to this base via timer->base are | 709 | * means that all timers which are tied to this base via timer->base are |
| @@ -712,16 +749,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
| 712 | 749 | ||
| 713 | base = lock_timer_base(timer, &flags); | 750 | base = lock_timer_base(timer, &flags); |
| 714 | 751 | ||
| 715 | if (timer_pending(timer)) { | 752 | ret = detach_if_pending(timer, base, false); |
| 716 | detach_timer(timer, 0); | 753 | if (!ret && pending_only) |
| 717 | if (timer->expires == base->next_timer && | 754 | goto out_unlock; |
| 718 | !tbase_get_deferrable(timer->base)) | ||
| 719 | base->next_timer = base->timer_jiffies; | ||
| 720 | ret = 1; | ||
| 721 | } else { | ||
| 722 | if (pending_only) | ||
| 723 | goto out_unlock; | ||
| 724 | } | ||
| 725 | 755 | ||
| 726 | debug_activate(timer, expires); | 756 | debug_activate(timer, expires); |
| 727 | 757 | ||
| @@ -752,9 +782,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
| 752 | } | 782 | } |
| 753 | 783 | ||
| 754 | timer->expires = expires; | 784 | timer->expires = expires; |
| 755 | if (time_before(timer->expires, base->next_timer) && | ||
| 756 | !tbase_get_deferrable(timer->base)) | ||
| 757 | base->next_timer = timer->expires; | ||
| 758 | internal_add_timer(base, timer); | 785 | internal_add_timer(base, timer); |
| 759 | 786 | ||
| 760 | out_unlock: | 787 | out_unlock: |
| @@ -920,9 +947,6 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
| 920 | spin_lock_irqsave(&base->lock, flags); | 947 | spin_lock_irqsave(&base->lock, flags); |
| 921 | timer_set_base(timer, base); | 948 | timer_set_base(timer, base); |
| 922 | debug_activate(timer, timer->expires); | 949 | debug_activate(timer, timer->expires); |
| 923 | if (time_before(timer->expires, base->next_timer) && | ||
| 924 | !tbase_get_deferrable(timer->base)) | ||
| 925 | base->next_timer = timer->expires; | ||
| 926 | internal_add_timer(base, timer); | 950 | internal_add_timer(base, timer); |
| 927 | /* | 951 | /* |
| 928 | * Check whether the other CPU is idle and needs to be | 952 | * Check whether the other CPU is idle and needs to be |
| @@ -959,13 +983,7 @@ int del_timer(struct timer_list *timer) | |||
| 959 | timer_stats_timer_clear_start_info(timer); | 983 | timer_stats_timer_clear_start_info(timer); |
| 960 | if (timer_pending(timer)) { | 984 | if (timer_pending(timer)) { |
| 961 | base = lock_timer_base(timer, &flags); | 985 | base = lock_timer_base(timer, &flags); |
| 962 | if (timer_pending(timer)) { | 986 | ret = detach_if_pending(timer, base, true); |
| 963 | detach_timer(timer, 1); | ||
| 964 | if (timer->expires == base->next_timer && | ||
| 965 | !tbase_get_deferrable(timer->base)) | ||
| 966 | base->next_timer = base->timer_jiffies; | ||
| 967 | ret = 1; | ||
| 968 | } | ||
| 969 | spin_unlock_irqrestore(&base->lock, flags); | 987 | spin_unlock_irqrestore(&base->lock, flags); |
| 970 | } | 988 | } |
| 971 | 989 | ||
| @@ -990,19 +1008,10 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
| 990 | 1008 | ||
| 991 | base = lock_timer_base(timer, &flags); | 1009 | base = lock_timer_base(timer, &flags); |
| 992 | 1010 | ||
| 993 | if (base->running_timer == timer) | 1011 | if (base->running_timer != timer) { |
| 994 | goto out; | 1012 | timer_stats_timer_clear_start_info(timer); |
| 995 | 1013 | ret = detach_if_pending(timer, base, true); | |
| 996 | timer_stats_timer_clear_start_info(timer); | ||
| 997 | ret = 0; | ||
| 998 | if (timer_pending(timer)) { | ||
| 999 | detach_timer(timer, 1); | ||
| 1000 | if (timer->expires == base->next_timer && | ||
| 1001 | !tbase_get_deferrable(timer->base)) | ||
| 1002 | base->next_timer = base->timer_jiffies; | ||
| 1003 | ret = 1; | ||
| 1004 | } | 1014 | } |
| 1005 | out: | ||
| 1006 | spin_unlock_irqrestore(&base->lock, flags); | 1015 | spin_unlock_irqrestore(&base->lock, flags); |
| 1007 | 1016 | ||
| 1008 | return ret; | 1017 | return ret; |
| @@ -1089,7 +1098,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) | |||
| 1089 | */ | 1098 | */ |
| 1090 | list_for_each_entry_safe(timer, tmp, &tv_list, entry) { | 1099 | list_for_each_entry_safe(timer, tmp, &tv_list, entry) { |
| 1091 | BUG_ON(tbase_get_base(timer->base) != base); | 1100 | BUG_ON(tbase_get_base(timer->base) != base); |
| 1092 | internal_add_timer(base, timer); | 1101 | /* No accounting, while moving them */ |
| 1102 | __internal_add_timer(base, timer); | ||
| 1093 | } | 1103 | } |
| 1094 | 1104 | ||
| 1095 | return index; | 1105 | return index; |
| @@ -1178,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
| 1178 | timer_stats_account_timer(timer); | 1188 | timer_stats_account_timer(timer); |
| 1179 | 1189 | ||
| 1180 | base->running_timer = timer; | 1190 | base->running_timer = timer; |
| 1181 | detach_timer(timer, 1); | 1191 | detach_expired_timer(timer, base); |
| 1182 | 1192 | ||
| 1183 | spin_unlock_irq(&base->lock); | 1193 | spin_unlock_irq(&base->lock); |
| 1184 | call_timer_fn(timer, fn, data); | 1194 | call_timer_fn(timer, fn, data); |
| @@ -1316,18 +1326,21 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, | |||
| 1316 | unsigned long get_next_timer_interrupt(unsigned long now) | 1326 | unsigned long get_next_timer_interrupt(unsigned long now) |
| 1317 | { | 1327 | { |
| 1318 | struct tvec_base *base = __this_cpu_read(tvec_bases); | 1328 | struct tvec_base *base = __this_cpu_read(tvec_bases); |
| 1319 | unsigned long expires; | 1329 | unsigned long expires = now + NEXT_TIMER_MAX_DELTA; |
| 1320 | 1330 | ||
| 1321 | /* | 1331 | /* |
| 1322 | * Pretend that there is no timer pending if the cpu is offline. | 1332 | * Pretend that there is no timer pending if the cpu is offline. |
| 1323 | * Possible pending timers will be migrated later to an active cpu. | 1333 | * Possible pending timers will be migrated later to an active cpu. |
| 1324 | */ | 1334 | */ |
| 1325 | if (cpu_is_offline(smp_processor_id())) | 1335 | if (cpu_is_offline(smp_processor_id())) |
| 1326 | return now + NEXT_TIMER_MAX_DELTA; | 1336 | return expires; |
| 1337 | |||
| 1327 | spin_lock(&base->lock); | 1338 | spin_lock(&base->lock); |
| 1328 | if (time_before_eq(base->next_timer, base->timer_jiffies)) | 1339 | if (base->active_timers) { |
| 1329 | base->next_timer = __next_timer_interrupt(base); | 1340 | if (time_before_eq(base->next_timer, base->timer_jiffies)) |
| 1330 | expires = base->next_timer; | 1341 | base->next_timer = __next_timer_interrupt(base); |
| 1342 | expires = base->next_timer; | ||
| 1343 | } | ||
| 1331 | spin_unlock(&base->lock); | 1344 | spin_unlock(&base->lock); |
| 1332 | 1345 | ||
| 1333 | if (time_before_eq(expires, now)) | 1346 | if (time_before_eq(expires, now)) |
| @@ -1704,6 +1717,7 @@ static int __cpuinit init_timers_cpu(int cpu) | |||
| 1704 | 1717 | ||
| 1705 | base->timer_jiffies = jiffies; | 1718 | base->timer_jiffies = jiffies; |
| 1706 | base->next_timer = base->timer_jiffies; | 1719 | base->next_timer = base->timer_jiffies; |
| 1720 | base->active_timers = 0; | ||
| 1707 | return 0; | 1721 | return 0; |
| 1708 | } | 1722 | } |
| 1709 | 1723 | ||
| @@ -1714,11 +1728,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea | |||
| 1714 | 1728 | ||
| 1715 | while (!list_empty(head)) { | 1729 | while (!list_empty(head)) { |
| 1716 | timer = list_first_entry(head, struct timer_list, entry); | 1730 | timer = list_first_entry(head, struct timer_list, entry); |
| 1717 | detach_timer(timer, 0); | 1731 | /* We ignore the accounting on the dying cpu */ |
| 1732 | detach_timer(timer, false); | ||
| 1718 | timer_set_base(timer, new_base); | 1733 | timer_set_base(timer, new_base); |
| 1719 | if (time_before(timer->expires, new_base->next_timer) && | ||
| 1720 | !tbase_get_deferrable(timer->base)) | ||
| 1721 | new_base->next_timer = timer->expires; | ||
| 1722 | internal_add_timer(new_base, timer); | 1734 | internal_add_timer(new_base, timer); |
| 1723 | } | 1735 | } |
| 1724 | } | 1736 | } |
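The timer.c changes above replace the open-coded "detach and maybe reset base->next_timer" sequences in __mod_timer(), del_timer() and try_to_del_timer_sync() with detach_if_pending(), and add an active_timers count so get_next_timer_interrupt() can skip scanning the wheel when no non-deferrable timer is pending. Below is a rough userspace sketch of that accounting pattern with hypothetical names; the deferrable-timer flag, per-cpu bases and locking of the real code are left out.

```c
/*
 * Sketch of the accounting that detach_if_pending() centralises: the base
 * tracks how many timers are pending and caches the earliest expiry, and
 * removing the earliest timer only invalidates the cache (forcing a rescan)
 * rather than recomputing it on the spot.
 */
#include <stdbool.h>
#include <stdio.h>

struct sketch_timer {
	unsigned long expires;
	bool pending;
};

struct sketch_base {
	unsigned long timer_jiffies;	/* current time in jiffies */
	unsigned long next_timer;	/* cached earliest pending expiry */
	unsigned long active_timers;	/* number of pending timers */
};

static void add_timer(struct sketch_base *base, struct sketch_timer *t,
		      unsigned long expires)
{
	t->expires = expires;
	t->pending = true;
	if (expires < base->next_timer)
		base->next_timer = expires;
	base->active_timers++;
}

/* Returns 1 if the timer was pending and has been detached, 0 otherwise. */
static int detach_if_pending(struct sketch_timer *t, struct sketch_base *base)
{
	if (!t->pending)
		return 0;

	t->pending = false;
	base->active_timers--;
	/* if this was the cached earliest timer, force a rescan next time */
	if (t->expires == base->next_timer)
		base->next_timer = base->timer_jiffies;
	return 1;
}

int main(void)
{
	struct sketch_base base = { .timer_jiffies = 100, .next_timer = ~0UL };
	struct sketch_timer t;

	add_timer(&base, &t, 150);
	printf("detached=%d active=%lu\n",
	       detach_if_pending(&t, &base), base.active_timers);
	return 0;
}
```

Resetting next_timer to timer_jiffies on removal works because the cached value is only a hint: the next caller that sees next_timer <= timer_jiffies recomputes it via __next_timer_interrupt(), and with active_timers == 0 that scan is skipped entirely.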
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a008663d86c8..b4f20fba09fc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, | |||
| 312 | 312 | ||
| 313 | static int __register_ftrace_function(struct ftrace_ops *ops) | 313 | static int __register_ftrace_function(struct ftrace_ops *ops) |
| 314 | { | 314 | { |
| 315 | if (ftrace_disabled) | 315 | if (unlikely(ftrace_disabled)) |
| 316 | return -ENODEV; | 316 | return -ENODEV; |
| 317 | 317 | ||
| 318 | if (FTRACE_WARN_ON(ops == &global_ops)) | 318 | if (FTRACE_WARN_ON(ops == &global_ops)) |
| @@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops) | |||
| 4299 | 4299 | ||
| 4300 | mutex_lock(&ftrace_lock); | 4300 | mutex_lock(&ftrace_lock); |
| 4301 | 4301 | ||
| 4302 | if (unlikely(ftrace_disabled)) | ||
| 4303 | goto out_unlock; | ||
| 4304 | |||
| 4305 | ret = __register_ftrace_function(ops); | 4302 | ret = __register_ftrace_function(ops); |
| 4306 | if (!ret) | 4303 | if (!ret) |
| 4307 | ret = ftrace_startup(ops, 0); | 4304 | ret = ftrace_startup(ops, 0); |
| 4308 | 4305 | ||
| 4309 | |||
| 4310 | out_unlock: | ||
| 4311 | mutex_unlock(&ftrace_lock); | 4306 | mutex_unlock(&ftrace_lock); |
| 4307 | |||
| 4312 | return ret; | 4308 | return ret; |
| 4313 | } | 4309 | } |
| 4314 | EXPORT_SYMBOL_GPL(register_ftrace_function); | 4310 | EXPORT_SYMBOL_GPL(register_ftrace_function); |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..49491fa7daa2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) | |||
| 1075 | rb_init_page(bpage->page); | 1075 | rb_init_page(bpage->page); |
| 1076 | 1076 | ||
| 1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
| 1078 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
| 1078 | 1079 | ||
| 1079 | ret = rb_allocate_pages(cpu_buffer, nr_pages); | 1080 | ret = rb_allocate_pages(cpu_buffer, nr_pages); |
| 1080 | if (ret < 0) | 1081 | if (ret < 0) |
| @@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) | |||
| 1346 | * If something was added to this page, it was full | 1347 | * If something was added to this page, it was full |
| 1347 | * since it is not the tail page. So we deduct the | 1348 | * since it is not the tail page. So we deduct the |
| 1348 | * bytes consumed in ring buffer from here. | 1349 | * bytes consumed in ring buffer from here. |
| 1349 | * No need to update overruns, since this page is | 1350 | * Increment overrun to account for the lost events. |
| 1350 | * deleted from ring buffer and its entries are | ||
| 1351 | * already accounted for. | ||
| 1352 | */ | 1351 | */ |
| 1352 | local_add(page_entries, &cpu_buffer->overrun); | ||
| 1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | 1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); |
| 1354 | } | 1354 | } |
| 1355 | 1355 | ||
| @@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
| 3239 | if (cpu_buffer->commit_page == cpu_buffer->reader_page) | 3239 | if (cpu_buffer->commit_page == cpu_buffer->reader_page) |
| 3240 | goto out; | 3240 | goto out; |
| 3241 | 3241 | ||
| 3242 | /* Don't bother swapping if the ring buffer is empty */ | ||
| 3243 | if (rb_num_of_entries(cpu_buffer) == 0) | ||
| 3244 | goto out; | ||
| 3245 | |||
| 3242 | /* | 3246 | /* |
| 3243 | * Reset the reader page to size zero. | 3247 | * Reset the reader page to size zero. |
| 3244 | */ | 3248 | */ |
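The rb_remove_pages() hunk above changes the accounting for pages dropped while shrinking the ring buffer: their entries are now added to cpu_buffer->overrun rather than silently disappearing, so the readable-entry count derived from the written/overrun/read counters stays consistent. A small sketch of that arithmetic, using simplified counters in place of the real per-cpu structure:

```c
/*
 * Sketch of the book-keeping fix in rb_remove_pages(): when a page full of
 * events is dropped, its entries must be counted as overrun, otherwise
 * "entries written - (overrun + read)" would keep reporting events that can
 * never be read.
 */
#include <stdio.h>

struct rb_sketch {
	unsigned long entries_written;	/* total events ever written */
	unsigned long overrun;		/* events lost (overwritten or removed) */
	unsigned long read;		/* events already consumed */
};

static unsigned long rb_num_of_entries(const struct rb_sketch *rb)
{
	return rb->entries_written - (rb->overrun + rb->read);
}

/* Drop a page holding page_entries events, as rb_remove_pages() does. */
static void remove_page(struct rb_sketch *rb, unsigned long page_entries)
{
	rb->overrun += page_entries;	/* the fix: account the lost events */
}

int main(void)
{
	struct rb_sketch rb = { .entries_written = 100, .overrun = 0, .read = 40 };

	remove_page(&rb, 25);
	printf("readable entries: %lu\n", rb_num_of_entries(&rb));	/* 35 */
	return 0;
}
```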
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a7fa0702be1c..a120f98c4112 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -830,6 +830,8 @@ int register_tracer(struct tracer *type) | |||
| 830 | current_trace = saved_tracer; | 830 | current_trace = saved_tracer; |
| 831 | if (ret) { | 831 | if (ret) { |
| 832 | printk(KERN_CONT "FAILED!\n"); | 832 | printk(KERN_CONT "FAILED!\n"); |
| 833 | /* Add the warning after printing 'FAILED' */ | ||
| 834 | WARN_ON(1); | ||
| 833 | goto out; | 835 | goto out; |
| 834 | } | 836 | } |
| 835 | /* Only reset on passing, to avoid touching corrupted buffers */ | 837 | /* Only reset on passing, to avoid touching corrupted buffers */ |
| @@ -1708,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk); | |||
| 1708 | 1710 | ||
| 1709 | static void trace_iterator_increment(struct trace_iterator *iter) | 1711 | static void trace_iterator_increment(struct trace_iterator *iter) |
| 1710 | { | 1712 | { |
| 1713 | struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); | ||
| 1714 | |||
| 1711 | iter->idx++; | 1715 | iter->idx++; |
| 1712 | if (iter->buffer_iter[iter->cpu]) | 1716 | if (buf_iter) |
| 1713 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); | 1717 | ring_buffer_read(buf_iter, NULL); |
| 1714 | } | 1718 | } |
| 1715 | 1719 | ||
| 1716 | static struct trace_entry * | 1720 | static struct trace_entry * |
| @@ -1718,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, | |||
| 1718 | unsigned long *lost_events) | 1722 | unsigned long *lost_events) |
| 1719 | { | 1723 | { |
| 1720 | struct ring_buffer_event *event; | 1724 | struct ring_buffer_event *event; |
| 1721 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; | 1725 | struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); |
| 1722 | 1726 | ||
| 1723 | if (buf_iter) | 1727 | if (buf_iter) |
| 1724 | event = ring_buffer_iter_peek(buf_iter, ts); | 1728 | event = ring_buffer_iter_peek(buf_iter, ts); |
| @@ -1856,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) | |||
| 1856 | 1860 | ||
| 1857 | tr->data[cpu]->skipped_entries = 0; | 1861 | tr->data[cpu]->skipped_entries = 0; |
| 1858 | 1862 | ||
| 1859 | if (!iter->buffer_iter[cpu]) | 1863 | buf_iter = trace_buffer_iter(iter, cpu); |
| 1864 | if (!buf_iter) | ||
| 1860 | return; | 1865 | return; |
| 1861 | 1866 | ||
| 1862 | buf_iter = iter->buffer_iter[cpu]; | ||
| 1863 | ring_buffer_iter_reset(buf_iter); | 1867 | ring_buffer_iter_reset(buf_iter); |
| 1864 | 1868 | ||
| 1865 | /* | 1869 | /* |
| @@ -2205,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) | |||
| 2205 | 2209 | ||
| 2206 | int trace_empty(struct trace_iterator *iter) | 2210 | int trace_empty(struct trace_iterator *iter) |
| 2207 | { | 2211 | { |
| 2212 | struct ring_buffer_iter *buf_iter; | ||
| 2208 | int cpu; | 2213 | int cpu; |
| 2209 | 2214 | ||
| 2210 | /* If we are looking at one CPU buffer, only check that one */ | 2215 | /* If we are looking at one CPU buffer, only check that one */ |
| 2211 | if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { | 2216 | if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { |
| 2212 | cpu = iter->cpu_file; | 2217 | cpu = iter->cpu_file; |
| 2213 | if (iter->buffer_iter[cpu]) { | 2218 | buf_iter = trace_buffer_iter(iter, cpu); |
| 2214 | if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) | 2219 | if (buf_iter) { |
| 2220 | if (!ring_buffer_iter_empty(buf_iter)) | ||
| 2215 | return 0; | 2221 | return 0; |
| 2216 | } else { | 2222 | } else { |
| 2217 | if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) | 2223 | if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) |
| @@ -2221,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter) | |||
| 2221 | } | 2227 | } |
| 2222 | 2228 | ||
| 2223 | for_each_tracing_cpu(cpu) { | 2229 | for_each_tracing_cpu(cpu) { |
| 2224 | if (iter->buffer_iter[cpu]) { | 2230 | buf_iter = trace_buffer_iter(iter, cpu); |
| 2225 | if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) | 2231 | if (buf_iter) { |
| 2232 | if (!ring_buffer_iter_empty(buf_iter)) | ||
| 2226 | return 0; | 2233 | return 0; |
| 2227 | } else { | 2234 | } else { |
| 2228 | if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) | 2235 | if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) |
| @@ -2381,6 +2388,11 @@ __tracing_open(struct inode *inode, struct file *file) | |||
| 2381 | if (!iter) | 2388 | if (!iter) |
| 2382 | return ERR_PTR(-ENOMEM); | 2389 | return ERR_PTR(-ENOMEM); |
| 2383 | 2390 | ||
| 2391 | iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), | ||
| 2392 | GFP_KERNEL); | ||
| 2393 | if (!iter->buffer_iter) | ||
| 2394 | goto release; | ||
| 2395 | |||
| 2384 | /* | 2396 | /* |
| 2385 | * We make a copy of the current tracer to avoid concurrent | 2397 | * We make a copy of the current tracer to avoid concurrent |
| 2386 | * changes on it while we are reading. | 2398 | * changes on it while we are reading. |
| @@ -2441,6 +2453,8 @@ __tracing_open(struct inode *inode, struct file *file) | |||
| 2441 | fail: | 2453 | fail: |
| 2442 | mutex_unlock(&trace_types_lock); | 2454 | mutex_unlock(&trace_types_lock); |
| 2443 | kfree(iter->trace); | 2455 | kfree(iter->trace); |
| 2456 | kfree(iter->buffer_iter); | ||
| 2457 | release: | ||
| 2444 | seq_release_private(inode, file); | 2458 | seq_release_private(inode, file); |
| 2445 | return ERR_PTR(-ENOMEM); | 2459 | return ERR_PTR(-ENOMEM); |
| 2446 | } | 2460 | } |
| @@ -2481,6 +2495,7 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
| 2481 | mutex_destroy(&iter->mutex); | 2495 | mutex_destroy(&iter->mutex); |
| 2482 | free_cpumask_var(iter->started); | 2496 | free_cpumask_var(iter->started); |
| 2483 | kfree(iter->trace); | 2497 | kfree(iter->trace); |
| 2498 | kfree(iter->buffer_iter); | ||
| 2484 | seq_release_private(inode, file); | 2499 | seq_release_private(inode, file); |
| 2485 | return 0; | 2500 | return 0; |
| 2486 | } | 2501 | } |
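In __tracing_open() the per-cpu buffer_iter array is no longer a fixed-size member but a separate kzalloc() of num_possible_cpus() pointers, which adds a second failure point and a matching kfree() in both the error path and tracing_release(). Below is a stripped-down userspace sketch of that allocate/unwind pattern; plain calloc/malloc stand in for the kernel allocators and all names are placeholders.

```c
/*
 * Sketch of the two-stage allocation and unwind order used by the
 * __tracing_open() hunk: the iterator array is freed on the 'fail' path,
 * the iterator itself on the 'release' path, and tracing_release() frees both.
 */
#include <stdlib.h>

struct iter_sketch {
	void **buffer_iter;	/* one slot per possible CPU */
	void *trace;		/* copy of the current tracer */
};

static struct iter_sketch *open_iter(int num_possible_cpus, size_t tracer_size)
{
	struct iter_sketch *iter = calloc(1, sizeof(*iter));

	if (!iter)
		return NULL;

	iter->buffer_iter = calloc(num_possible_cpus, sizeof(*iter->buffer_iter));
	if (!iter->buffer_iter)
		goto release;

	iter->trace = malloc(tracer_size);
	if (!iter->trace)
		goto fail;

	return iter;

fail:
	free(iter->buffer_iter);
release:
	free(iter);
	return NULL;
}

static void release_iter(struct iter_sketch *iter)
{
	/* mirrors tracing_release(): both allocations are freed */
	free(iter->trace);
	free(iter->buffer_iter);
	free(iter);
}

int main(void)
{
	struct iter_sketch *iter = open_iter(8, 64);

	if (iter)
		release_iter(iter);
	return 0;
}
```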
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5aec220d2de0..55e1f7f0db12 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -317,6 +317,14 @@ struct tracer { | |||
| 317 | 317 | ||
| 318 | #define TRACE_PIPE_ALL_CPU -1 | 318 | #define TRACE_PIPE_ALL_CPU -1 |
| 319 | 319 | ||
| 320 | static inline struct ring_buffer_iter * | ||
| 321 | trace_buffer_iter(struct trace_iterator *iter, int cpu) | ||
| 322 | { | ||
| 323 | if (iter->buffer_iter && iter->buffer_iter[cpu]) | ||
| 324 | return iter->buffer_iter[cpu]; | ||
| 325 | return NULL; | ||
| 326 | } | ||
| 327 | |||
| 320 | int tracer_init(struct tracer *t, struct trace_array *tr); | 328 | int tracer_init(struct tracer *t, struct trace_array *tr); |
| 321 | int tracing_is_enabled(void); | 329 | int tracing_is_enabled(void); |
| 322 | void trace_wake_up(void); | 330 | void trace_wake_up(void); |
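The new trace_buffer_iter() helper exists because buffer_iter can now legitimately be NULL (the allocation may have failed or not happened yet), so every reader in trace.c and trace_functions_graph.c goes through one NULL-safe accessor instead of indexing iter->buffer_iter[cpu] directly. A minimal sketch of the same pattern with stand-in types:

```c
/*
 * Sketch of the NULL-safe accessor added to trace.h: check the array pointer
 * and the per-cpu slot before handing back an iterator.
 */
#include <stddef.h>
#include <stdio.h>

struct buf_iter_sketch { int cpu; };

struct trace_iter_sketch {
	struct buf_iter_sketch **buffer_iter;	/* may be NULL */
};

static struct buf_iter_sketch *
trace_buffer_iter(struct trace_iter_sketch *iter, int cpu)
{
	if (iter->buffer_iter && iter->buffer_iter[cpu])
		return iter->buffer_iter[cpu];
	return NULL;
}

int main(void)
{
	struct trace_iter_sketch iter = { .buffer_iter = NULL };

	/* safe even before the array is allocated */
	printf("%p\n", (void *)trace_buffer_iter(&iter, 0));
	return 0;
}
```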
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a7d2a4c653d8..ce27c8ba8d31 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter, | |||
| 538 | next = &data->ret; | 538 | next = &data->ret; |
| 539 | } else { | 539 | } else { |
| 540 | 540 | ||
| 541 | ring_iter = iter->buffer_iter[iter->cpu]; | 541 | ring_iter = trace_buffer_iter(iter, iter->cpu); |
| 542 | 542 | ||
| 543 | /* First peek to compare current entry and the next one */ | 543 | /* First peek to compare current entry and the next one */ |
| 544 | if (ring_iter) | 544 | if (ring_iter) |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index df611a0e76c5..123b189c732c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -1325,4 +1325,4 @@ __init static int init_events(void) | |||
| 1325 | 1325 | ||
| 1326 | return 0; | 1326 | return 0; |
| 1327 | } | 1327 | } |
| 1328 | device_initcall(init_events); | 1328 | early_initcall(init_events); |
