path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpu.c                   |   2
-rw-r--r--  kernel/cpuset.c                | 130
-rw-r--r--  kernel/events/uprobes.c        | 213
-rw-r--r--  kernel/exit.c                  |   2
-rw-r--r--  kernel/fork.c                  |  46
-rw-r--r--  kernel/irq/handle.c            |   7
-rw-r--r--  kernel/irq/manage.c            |  17
-rw-r--r--  kernel/kexec.c                 |   2
-rw-r--r--  kernel/kmod.c                  |  37
-rw-r--r--  kernel/panic.c                 |   8
-rw-r--r--  kernel/power/suspend.c         |   3
-rw-r--r--  kernel/printk.c                | 191
-rw-r--r--  kernel/resource.c              |  24
-rw-r--r--  kernel/sched/core.c            |  94
-rw-r--r--  kernel/sched/fair.c            | 113
-rw-r--r--  kernel/sched/sched.h           |  23
-rw-r--r--  kernel/softirq.c               |   9
-rw-r--r--  kernel/sys.c                   |  57
-rw-r--r--  kernel/sysctl.c                |  51
-rw-r--r--  kernel/sysctl_binary.c         |   2
-rw-r--r--  kernel/taskstats.c             |   5
-rw-r--r--  kernel/trace/trace.c           |   7
-rw-r--r--  kernel/trace/trace_functions.c |  36
-rw-r--r--  kernel/watchdog.c              |  21
24 files changed, 742 insertions, 358 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb5227a19e..14d32588cccd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu)
416 416
417 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 417 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
418 mutex_lock(&zonelists_mutex); 418 mutex_lock(&zonelists_mutex);
419 build_all_zonelists(NULL); 419 build_all_zonelists(NULL, NULL);
420 mutex_unlock(&zonelists_mutex); 420 mutex_unlock(&zonelists_mutex);
421 } 421 }
422#endif 422#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd12..f33c7153b6d7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
147 CS_SPREAD_SLAB, 147 CS_SPREAD_SLAB,
148} cpuset_flagbits_t; 148} cpuset_flagbits_t;
149 149
150/* the type of hotplug event */
151enum hotplug_event {
152 CPUSET_CPU_OFFLINE,
153 CPUSET_MEM_OFFLINE,
154};
155
150/* convenient tests for these bits */ 156/* convenient tests for these bits */
151static inline int is_cpu_exclusive(const struct cpuset *cs) 157static inline int is_cpu_exclusive(const struct cpuset *cs)
152{ 158{
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1990} 1996}
1991 1997
1992/* 1998/*
1993 * Walk the specified cpuset subtree and look for empty cpusets. 1999 * Helper function to traverse cpusets.
1994 * The tasks of such cpuset must be moved to a parent cpuset. 2000 * It can be used to walk the cpuset tree from top to bottom, completing
2001 * one layer before dropping down to the next (thus always processing a
2002 * node before any of its children).
2003 */
2004static struct cpuset *cpuset_next(struct list_head *queue)
2005{
2006 struct cpuset *cp;
2007 struct cpuset *child; /* scans child cpusets of cp */
2008 struct cgroup *cont;
2009
2010 if (list_empty(queue))
2011 return NULL;
2012
2013 cp = list_first_entry(queue, struct cpuset, stack_list);
2014 list_del(queue->next);
2015 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2016 child = cgroup_cs(cont);
2017 list_add_tail(&child->stack_list, queue);
2018 }
2019
2020 return cp;
2021}
2022
2023
2024/*
2025 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
2026 * online/offline) and update the cpusets accordingly.
2027 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
2028 * cpuset must be moved to a parent cpuset.
1995 * 2029 *
1996 * Called with cgroup_mutex held. We take callback_mutex to modify 2030 * Called with cgroup_mutex held. We take callback_mutex to modify
1997 * cpus_allowed and mems_allowed. 2031 * cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2000 * before dropping down to the next. It always processes a node before 2034 * before dropping down to the next. It always processes a node before
2001 * any of its children. 2035 * any of its children.
2002 * 2036 *
2003 * For now, since we lack memory hot unplug, we'll never see a cpuset 2037 * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
2004 * that has tasks along with an empty 'mems'. But if we did see such 2038 * if all present pages from a node are offlined.
2005 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
2006 */ 2039 */
2007static void scan_for_empty_cpusets(struct cpuset *root) 2040static void
2041scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2008{ 2042{
2009 LIST_HEAD(queue); 2043 LIST_HEAD(queue);
2010 struct cpuset *cp; /* scans cpusets being updated */ 2044 struct cpuset *cp; /* scans cpusets being updated */
2011 struct cpuset *child; /* scans child cpusets of cp */
2012 struct cgroup *cont;
2013 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2045 static nodemask_t oldmems; /* protected by cgroup_mutex */
2014 2046
2015 list_add_tail((struct list_head *)&root->stack_list, &queue); 2047 list_add_tail((struct list_head *)&root->stack_list, &queue);
2016 2048
2017 while (!list_empty(&queue)) { 2049 switch (event) {
2018 cp = list_first_entry(&queue, struct cpuset, stack_list); 2050 case CPUSET_CPU_OFFLINE:
2019 list_del(queue.next); 2051 while ((cp = cpuset_next(&queue)) != NULL) {
2020 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 2052
2021 child = cgroup_cs(cont); 2053 /* Continue past cpusets with all cpus online */
2022 list_add_tail(&child->stack_list, &queue); 2054 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
2055 continue;
2056
2057 /* Remove offline cpus from this cpuset. */
2058 mutex_lock(&callback_mutex);
2059 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2060 cpu_active_mask);
2061 mutex_unlock(&callback_mutex);
2062
2063 /* Move tasks from the empty cpuset to a parent */
2064 if (cpumask_empty(cp->cpus_allowed))
2065 remove_tasks_in_empty_cpuset(cp);
2066 else
2067 update_tasks_cpumask(cp, NULL);
2023 } 2068 }
2069 break;
2024 2070
2025 /* Continue past cpusets with all cpus, mems online */ 2071 case CPUSET_MEM_OFFLINE:
2026 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && 2072 while ((cp = cpuset_next(&queue)) != NULL) {
2027 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2028 continue;
2029 2073
2030 oldmems = cp->mems_allowed; 2074 /* Continue past cpusets with all mems online */
2075 if (nodes_subset(cp->mems_allowed,
2076 node_states[N_HIGH_MEMORY]))
2077 continue;
2031 2078
2032 /* Remove offline cpus and mems from this cpuset. */ 2079 oldmems = cp->mems_allowed;
2033 mutex_lock(&callback_mutex); 2080
2034 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2081 /* Remove offline mems from this cpuset. */
2035 cpu_active_mask); 2082 mutex_lock(&callback_mutex);
2036 nodes_and(cp->mems_allowed, cp->mems_allowed, 2083 nodes_and(cp->mems_allowed, cp->mems_allowed,
2037 node_states[N_HIGH_MEMORY]); 2084 node_states[N_HIGH_MEMORY]);
2038 mutex_unlock(&callback_mutex); 2085 mutex_unlock(&callback_mutex);
2039 2086
2040 /* Move tasks from the empty cpuset to a parent */ 2087 /* Move tasks from the empty cpuset to a parent */
2041 if (cpumask_empty(cp->cpus_allowed) || 2088 if (nodes_empty(cp->mems_allowed))
2042 nodes_empty(cp->mems_allowed)) 2089 remove_tasks_in_empty_cpuset(cp);
2043 remove_tasks_in_empty_cpuset(cp); 2090 else
2044 else { 2091 update_tasks_nodemask(cp, &oldmems, NULL);
2045 update_tasks_cpumask(cp, NULL);
2046 update_tasks_nodemask(cp, &oldmems, NULL);
2047 } 2092 }
2048 } 2093 }
2049} 2094}
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2054 * (of no affect) on systems that are actively using CPU hotplug 2099 * (of no affect) on systems that are actively using CPU hotplug
2055 * but making no active use of cpusets. 2100 * but making no active use of cpusets.
2056 * 2101 *
2102 * The only exception to this is suspend/resume, where we don't
2103 * modify cpusets at all.
2104 *
2057 * This routine ensures that top_cpuset.cpus_allowed tracks 2105 * This routine ensures that top_cpuset.cpus_allowed tracks
2058 * cpu_active_mask on each CPU hotplug (cpuhp) event. 2106 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2059 * 2107 *
2060 * Called within get_online_cpus(). Needs to call cgroup_lock() 2108 * Called within get_online_cpus(). Needs to call cgroup_lock()
2061 * before calling generate_sched_domains(). 2109 * before calling generate_sched_domains().
2110 *
2111 * @cpu_online: Indicates whether this is a CPU online event (true) or
2112 * a CPU offline event (false).
2062 */ 2113 */
2063void cpuset_update_active_cpus(void) 2114void cpuset_update_active_cpus(bool cpu_online)
2064{ 2115{
2065 struct sched_domain_attr *attr; 2116 struct sched_domain_attr *attr;
2066 cpumask_var_t *doms; 2117 cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
2070 mutex_lock(&callback_mutex); 2121 mutex_lock(&callback_mutex);
2071 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2122 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2072 mutex_unlock(&callback_mutex); 2123 mutex_unlock(&callback_mutex);
2073 scan_for_empty_cpusets(&top_cpuset); 2124
2125 if (!cpu_online)
2126 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
2127
2074 ndoms = generate_sched_domains(&doms, &attr); 2128 ndoms = generate_sched_domains(&doms, &attr);
2075 cgroup_unlock(); 2129 cgroup_unlock();
2076 2130
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
2082/* 2136/*
2083 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2137 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
2084 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2138 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
2085 * See also the previous routine cpuset_track_online_cpus(). 2139 * See cpuset_update_active_cpus() for CPU hotplug handling.
2086 */ 2140 */
2087static int cpuset_track_online_nodes(struct notifier_block *self, 2141static int cpuset_track_online_nodes(struct notifier_block *self,
2088 unsigned long action, void *arg) 2142 unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2101 case MEM_OFFLINE: 2155 case MEM_OFFLINE:
2102 /* 2156 /*
2103 * needn't update top_cpuset.mems_allowed explicitly because 2157 * needn't update top_cpuset.mems_allowed explicitly because
2104 * scan_for_empty_cpusets() will update it. 2158 * scan_cpusets_upon_hotplug() will update it.
2105 */ 2159 */
2106 scan_for_empty_cpusets(&top_cpuset); 2160 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
2107 break; 2161 break;
2108 default: 2162 default:
2109 break; 2163 break;
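The new cpuset_next() helper above replaces the open-coded queue handling with a top-down walk that always returns a cpuset before any of its children. A minimal user-space sketch of that queue-based traversal, using hypothetical node types rather than the kernel's struct cpuset/cgroup:

/*
 * Minimal user-space sketch of the breadth-first walk that
 * cpuset_next() performs: pop the head of a queue, enqueue its
 * children, return the popped node. The struct and field names
 * below are illustrative only; they are not the kernel's types.
 */
#include <stdio.h>

struct node {
	const char *name;
	struct node **children;
	int nr_children;
	struct node *next_in_queue;	/* plays the role of stack_list */
};

static struct node *queue_head, *queue_tail;

static void enqueue(struct node *n)
{
	n->next_in_queue = NULL;
	if (queue_tail)
		queue_tail->next_in_queue = n;
	else
		queue_head = n;
	queue_tail = n;
}

/* Analogue of cpuset_next(): parents are always returned before children. */
static struct node *next_node(void)
{
	struct node *n = queue_head;
	int i;

	if (!n)
		return NULL;
	queue_head = n->next_in_queue;
	if (!queue_head)
		queue_tail = NULL;
	for (i = 0; i < n->nr_children; i++)
		enqueue(n->children[i]);
	return n;
}

int main(void)
{
	struct node c1 = { .name = "child1" }, c2 = { .name = "child2" };
	struct node *kids[] = { &c1, &c2 };
	struct node root = { .name = "root", .children = kids, .nr_children = 2 };
	struct node *n;

	enqueue(&root);
	while ((n = next_node()) != NULL)
		printf("%s\n", n->name);	/* prints root, child1, child2 */
	return 0;
}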
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f93532748bca..c08a22d02f72 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -32,6 +32,7 @@
32#include <linux/swap.h> /* try_to_free_swap */ 32#include <linux/swap.h> /* try_to_free_swap */
33#include <linux/ptrace.h> /* user_enable_single_step */ 33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */ 34#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */
35 36
36#include <linux/uprobes.h> 37#include <linux/uprobes.h>
37 38
@@ -112,14 +113,14 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register)
112 return false; 113 return false;
113} 114}
114 115
115static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) 116static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
116{ 117{
117 loff_t vaddr; 118 return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
118 119}
119 vaddr = vma->vm_start + offset;
120 vaddr -= vma->vm_pgoff << PAGE_SHIFT;
121 120
122 return vaddr; 121static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
122{
123 return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
123} 124}
124 125
125/** 126/**
@@ -127,25 +128,27 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
127 * based on replace_page in mm/ksm.c 128 * based on replace_page in mm/ksm.c
128 * 129 *
129 * @vma: vma that holds the pte pointing to page 130 * @vma: vma that holds the pte pointing to page
131 * @addr: address the old @page is mapped at
130 * @page: the cowed page we are replacing by kpage 132 * @page: the cowed page we are replacing by kpage
131 * @kpage: the modified page we replace page by 133 * @kpage: the modified page we replace page by
132 * 134 *
133 * Returns 0 on success, -EFAULT on failure. 135 * Returns 0 on success, -EFAULT on failure.
134 */ 136 */
135static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) 137static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
138 struct page *page, struct page *kpage)
136{ 139{
137 struct mm_struct *mm = vma->vm_mm; 140 struct mm_struct *mm = vma->vm_mm;
138 unsigned long addr;
139 spinlock_t *ptl; 141 spinlock_t *ptl;
140 pte_t *ptep; 142 pte_t *ptep;
143 int err;
141 144
142 addr = page_address_in_vma(page, vma); 145 /* For try_to_free_swap() and munlock_vma_page() below */
143 if (addr == -EFAULT) 146 lock_page(page);
144 return -EFAULT;
145 147
148 err = -EAGAIN;
146 ptep = page_check_address(page, mm, addr, &ptl, 0); 149 ptep = page_check_address(page, mm, addr, &ptl, 0);
147 if (!ptep) 150 if (!ptep)
148 return -EAGAIN; 151 goto unlock;
149 152
150 get_page(kpage); 153 get_page(kpage);
151 page_add_new_anon_rmap(kpage, vma, addr); 154 page_add_new_anon_rmap(kpage, vma, addr);
@@ -162,10 +165,16 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct
162 page_remove_rmap(page); 165 page_remove_rmap(page);
163 if (!page_mapped(page)) 166 if (!page_mapped(page))
164 try_to_free_swap(page); 167 try_to_free_swap(page);
165 put_page(page);
166 pte_unmap_unlock(ptep, ptl); 168 pte_unmap_unlock(ptep, ptl);
167 169
168 return 0; 170 if (vma->vm_flags & VM_LOCKED)
171 munlock_vma_page(page);
172 put_page(page);
173
174 err = 0;
175 unlock:
176 unlock_page(page);
177 return err;
169} 178}
170 179
171/** 180/**
@@ -206,45 +215,23 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
206 unsigned long vaddr, uprobe_opcode_t opcode) 215 unsigned long vaddr, uprobe_opcode_t opcode)
207{ 216{
208 struct page *old_page, *new_page; 217 struct page *old_page, *new_page;
209 struct address_space *mapping;
210 void *vaddr_old, *vaddr_new; 218 void *vaddr_old, *vaddr_new;
211 struct vm_area_struct *vma; 219 struct vm_area_struct *vma;
212 struct uprobe *uprobe;
213 int ret; 220 int ret;
221
214retry: 222retry:
215 /* Read the page with vaddr into memory */ 223 /* Read the page with vaddr into memory */
216 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); 224 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
217 if (ret <= 0) 225 if (ret <= 0)
218 return ret; 226 return ret;
219 227
220 ret = -EINVAL;
221
222 /*
223 * We are interested in text pages only. Our pages of interest
224 * should be mapped for read and execute only. We desist from
225 * adding probes in write mapped pages since the breakpoints
226 * might end up in the file copy.
227 */
228 if (!valid_vma(vma, is_swbp_insn(&opcode)))
229 goto put_out;
230
231 uprobe = container_of(auprobe, struct uprobe, arch);
232 mapping = uprobe->inode->i_mapping;
233 if (mapping != vma->vm_file->f_mapping)
234 goto put_out;
235
236 ret = -ENOMEM; 228 ret = -ENOMEM;
237 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 229 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
238 if (!new_page) 230 if (!new_page)
239 goto put_out; 231 goto put_old;
240 232
241 __SetPageUptodate(new_page); 233 __SetPageUptodate(new_page);
242 234
243 /*
244 * lock page will serialize against do_wp_page()'s
245 * PageAnon() handling
246 */
247 lock_page(old_page);
248 /* copy the page now that we've got it stable */ 235 /* copy the page now that we've got it stable */
249 vaddr_old = kmap_atomic(old_page); 236 vaddr_old = kmap_atomic(old_page);
250 vaddr_new = kmap_atomic(new_page); 237 vaddr_new = kmap_atomic(new_page);
@@ -257,17 +244,13 @@ retry:
257 244
258 ret = anon_vma_prepare(vma); 245 ret = anon_vma_prepare(vma);
259 if (ret) 246 if (ret)
260 goto unlock_out; 247 goto put_new;
261 248
262 lock_page(new_page); 249 ret = __replace_page(vma, vaddr, old_page, new_page);
263 ret = __replace_page(vma, old_page, new_page);
264 unlock_page(new_page);
265 250
266unlock_out: 251put_new:
267 unlock_page(old_page);
268 page_cache_release(new_page); 252 page_cache_release(new_page);
269 253put_old:
270put_out:
271 put_page(old_page); 254 put_page(old_page);
272 255
273 if (unlikely(ret == -EAGAIN)) 256 if (unlikely(ret == -EAGAIN))
@@ -791,7 +774,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
791 curr = info; 774 curr = info;
792 775
793 info->mm = vma->vm_mm; 776 info->mm = vma->vm_mm;
794 info->vaddr = vma_address(vma, offset); 777 info->vaddr = offset_to_vaddr(vma, offset);
795 } 778 }
796 mutex_unlock(&mapping->i_mmap_mutex); 779 mutex_unlock(&mapping->i_mmap_mutex);
797 780
@@ -839,12 +822,13 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
839 goto free; 822 goto free;
840 823
841 down_write(&mm->mmap_sem); 824 down_write(&mm->mmap_sem);
842 vma = find_vma(mm, (unsigned long)info->vaddr); 825 vma = find_vma(mm, info->vaddr);
843 if (!vma || !valid_vma(vma, is_register)) 826 if (!vma || !valid_vma(vma, is_register) ||
827 vma->vm_file->f_mapping->host != uprobe->inode)
844 goto unlock; 828 goto unlock;
845 829
846 if (vma->vm_file->f_mapping->host != uprobe->inode || 830 if (vma->vm_start > info->vaddr ||
847 vma_address(vma, uprobe->offset) != info->vaddr) 831 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
848 goto unlock; 832 goto unlock;
849 833
850 if (is_register) { 834 if (is_register) {
@@ -960,59 +944,66 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
960 put_uprobe(uprobe); 944 put_uprobe(uprobe);
961} 945}
962 946
963/* 947static struct rb_node *
964 * Of all the nodes that correspond to the given inode, return the node 948find_node_in_range(struct inode *inode, loff_t min, loff_t max)
965 * with the least offset.
966 */
967static struct rb_node *find_least_offset_node(struct inode *inode)
968{ 949{
969 struct uprobe u = { .inode = inode, .offset = 0};
970 struct rb_node *n = uprobes_tree.rb_node; 950 struct rb_node *n = uprobes_tree.rb_node;
971 struct rb_node *close_node = NULL;
972 struct uprobe *uprobe;
973 int match;
974 951
975 while (n) { 952 while (n) {
976 uprobe = rb_entry(n, struct uprobe, rb_node); 953 struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
977 match = match_uprobe(&u, uprobe);
978
979 if (uprobe->inode == inode)
980 close_node = n;
981
982 if (!match)
983 return close_node;
984 954
985 if (match < 0) 955 if (inode < u->inode) {
986 n = n->rb_left; 956 n = n->rb_left;
987 else 957 } else if (inode > u->inode) {
988 n = n->rb_right; 958 n = n->rb_right;
959 } else {
960 if (max < u->offset)
961 n = n->rb_left;
962 else if (min > u->offset)
963 n = n->rb_right;
964 else
965 break;
966 }
989 } 967 }
990 968
991 return close_node; 969 return n;
992} 970}
993 971
994/* 972/*
995 * For a given inode, build a list of probes that need to be inserted. 973 * For a given range in vma, build a list of probes that need to be inserted.
996 */ 974 */
997static void build_probe_list(struct inode *inode, struct list_head *head) 975static void build_probe_list(struct inode *inode,
976 struct vm_area_struct *vma,
977 unsigned long start, unsigned long end,
978 struct list_head *head)
998{ 979{
999 struct uprobe *uprobe; 980 loff_t min, max;
1000 unsigned long flags; 981 unsigned long flags;
1001 struct rb_node *n; 982 struct rb_node *n, *t;
1002 983 struct uprobe *u;
1003 spin_lock_irqsave(&uprobes_treelock, flags);
1004
1005 n = find_least_offset_node(inode);
1006 984
1007 for (; n; n = rb_next(n)) { 985 INIT_LIST_HEAD(head);
1008 uprobe = rb_entry(n, struct uprobe, rb_node); 986 min = vaddr_to_offset(vma, start);
1009 if (uprobe->inode != inode) 987 max = min + (end - start) - 1;
1010 break;
1011 988
1012 list_add(&uprobe->pending_list, head); 989 spin_lock_irqsave(&uprobes_treelock, flags);
1013 atomic_inc(&uprobe->ref); 990 n = find_node_in_range(inode, min, max);
991 if (n) {
992 for (t = n; t; t = rb_prev(t)) {
993 u = rb_entry(t, struct uprobe, rb_node);
994 if (u->inode != inode || u->offset < min)
995 break;
996 list_add(&u->pending_list, head);
997 atomic_inc(&u->ref);
998 }
999 for (t = n; (t = rb_next(t)); ) {
1000 u = rb_entry(t, struct uprobe, rb_node);
1001 if (u->inode != inode || u->offset > max)
1002 break;
1003 list_add(&u->pending_list, head);
1004 atomic_inc(&u->ref);
1005 }
1014 } 1006 }
1015
1016 spin_unlock_irqrestore(&uprobes_treelock, flags); 1007 spin_unlock_irqrestore(&uprobes_treelock, flags);
1017} 1008}
1018 1009
@@ -1031,7 +1022,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head)
1031int uprobe_mmap(struct vm_area_struct *vma) 1022int uprobe_mmap(struct vm_area_struct *vma)
1032{ 1023{
1033 struct list_head tmp_list; 1024 struct list_head tmp_list;
1034 struct uprobe *uprobe; 1025 struct uprobe *uprobe, *u;
1035 struct inode *inode; 1026 struct inode *inode;
1036 int ret, count; 1027 int ret, count;
1037 1028
@@ -1042,21 +1033,15 @@ int uprobe_mmap(struct vm_area_struct *vma)
1042 if (!inode) 1033 if (!inode)
1043 return 0; 1034 return 0;
1044 1035
1045 INIT_LIST_HEAD(&tmp_list);
1046 mutex_lock(uprobes_mmap_hash(inode)); 1036 mutex_lock(uprobes_mmap_hash(inode));
1047 build_probe_list(inode, &tmp_list); 1037 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1048 1038
1049 ret = 0; 1039 ret = 0;
1050 count = 0; 1040 count = 0;
1051 1041
1052 list_for_each_entry(uprobe, &tmp_list, pending_list) { 1042 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1053 if (!ret) { 1043 if (!ret) {
1054 loff_t vaddr = vma_address(vma, uprobe->offset); 1044 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1055
1056 if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
1057 put_uprobe(uprobe);
1058 continue;
1059 }
1060 1045
1061 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1046 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1062 /* 1047 /*
@@ -1097,12 +1082,15 @@ int uprobe_mmap(struct vm_area_struct *vma)
1097void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1082void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1098{ 1083{
1099 struct list_head tmp_list; 1084 struct list_head tmp_list;
1100 struct uprobe *uprobe; 1085 struct uprobe *uprobe, *u;
1101 struct inode *inode; 1086 struct inode *inode;
1102 1087
1103 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1088 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
1104 return; 1089 return;
1105 1090
1091 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1092 return;
1093
1106 if (!atomic_read(&vma->vm_mm->uprobes_state.count)) 1094 if (!atomic_read(&vma->vm_mm->uprobes_state.count))
1107 return; 1095 return;
1108 1096
@@ -1110,21 +1098,17 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1110 if (!inode) 1098 if (!inode)
1111 return; 1099 return;
1112 1100
1113 INIT_LIST_HEAD(&tmp_list);
1114 mutex_lock(uprobes_mmap_hash(inode)); 1101 mutex_lock(uprobes_mmap_hash(inode));
1115 build_probe_list(inode, &tmp_list); 1102 build_probe_list(inode, vma, start, end, &tmp_list);
1116 1103
1117 list_for_each_entry(uprobe, &tmp_list, pending_list) { 1104 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1118 loff_t vaddr = vma_address(vma, uprobe->offset); 1105 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1119 1106 /*
1120 if (vaddr >= start && vaddr < end) { 1107 * An unregister could have removed the probe before
1121 /* 1108 * unmap. So check before we decrement the count.
1122 * An unregister could have removed the probe before 1109 */
1123 * unmap. So check before we decrement the count. 1110 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1124 */ 1111 atomic_dec(&vma->vm_mm->uprobes_state.count);
1125 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1126 atomic_dec(&vma->vm_mm->uprobes_state.count);
1127 }
1128 put_uprobe(uprobe); 1112 put_uprobe(uprobe);
1129 } 1113 }
1130 mutex_unlock(uprobes_mmap_hash(inode)); 1114 mutex_unlock(uprobes_mmap_hash(inode));
@@ -1463,12 +1447,9 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1463 vma = find_vma(mm, bp_vaddr); 1447 vma = find_vma(mm, bp_vaddr);
1464 if (vma && vma->vm_start <= bp_vaddr) { 1448 if (vma && vma->vm_start <= bp_vaddr) {
1465 if (valid_vma(vma, false)) { 1449 if (valid_vma(vma, false)) {
1466 struct inode *inode; 1450 struct inode *inode = vma->vm_file->f_mapping->host;
1467 loff_t offset; 1451 loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1468 1452
1469 inode = vma->vm_file->f_mapping->host;
1470 offset = bp_vaddr - vma->vm_start;
1471 offset += (vma->vm_pgoff << PAGE_SHIFT);
1472 uprobe = find_uprobe(inode, offset); 1453 uprobe = find_uprobe(inode, offset);
1473 } 1454 }
1474 1455
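Several of the uprobes changes hinge on the offset_to_vaddr()/vaddr_to_offset() pair introduced above, which convert between a file offset and the virtual address it maps to within a given vma. A small user-space sketch of that arithmetic, assuming 4 KiB pages and a stripped-down vma that carries only the two fields the math needs:

/*
 * User-space sketch of the file-offset <-> virtual-address conversion
 * that offset_to_vaddr()/vaddr_to_offset() perform. The struct below
 * mimics only vm_start and vm_pgoff; PAGE_SHIFT is assumed to be 12
 * (4 KiB pages) for the example.
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12

struct fake_vma {
	unsigned long vm_start;	/* first mapped virtual address */
	unsigned long vm_pgoff;	/* file offset of vm_start, in pages */
};

static unsigned long offset_to_vaddr(const struct fake_vma *vma, long long offset)
{
	return vma->vm_start + offset - ((long long)vma->vm_pgoff << PAGE_SHIFT);
}

static long long vaddr_to_offset(const struct fake_vma *vma, unsigned long vaddr)
{
	return ((long long)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}

int main(void)
{
	struct fake_vma vma = { .vm_start = 0x400000, .vm_pgoff = 2 };
	long long probe_offset = 0x2345;	/* offset of the probe in the file */
	unsigned long vaddr = offset_to_vaddr(&vma, probe_offset);

	assert(vaddr_to_offset(&vma, vaddr) == probe_offset);	/* round-trips */
	printf("offset 0x%llx maps to vaddr 0x%lx\n", probe_offset, vaddr);
	return 0;
}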
diff --git a/kernel/exit.c b/kernel/exit.c
index d17f6c4ddfa9..f65345f9e5bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -483,7 +483,7 @@ static void close_files(struct files_struct * files)
483 rcu_read_unlock(); 483 rcu_read_unlock();
484 for (;;) { 484 for (;;) {
485 unsigned long set; 485 unsigned long set;
486 i = j * __NFDBITS; 486 i = j * BITS_PER_LONG;
487 if (i >= fdt->max_fds) 487 if (i >= fdt->max_fds)
488 break; 488 break;
489 set = fdt->open_fds[j++]; 489 set = fdt->open_fds[j++];
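The close_files() hunk above only swaps the stride constant from __NFDBITS to BITS_PER_LONG; the loop itself walks the open-fds bitmap one long at a time, deriving the bit index of each word from j * BITS_PER_LONG. A sketch of that word-at-a-time walk with illustrative names, not the kernel's fdtable types:

/*
 * Sketch of the word-at-a-time bitmap walk close_files() relies on:
 * the index of the first bit in word j is j * BITS_PER_LONG, which
 * is why the stride constant has to match the word size.
 */
#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (CHAR_BIT * (int)sizeof(long))

static void print_set_bits(const unsigned long *words, int max_bits)
{
	int i, j = 0;

	for (;;) {
		unsigned long set;

		i = j * BITS_PER_LONG;	/* first bit index in this word */
		if (i >= max_bits)
			break;
		set = words[j++];
		while (set) {
			if (set & 1)
				printf("bit %d is set\n", i);
			i++;
			set >>= 1;
		}
	}
}

int main(void)
{
	unsigned long fds[2] = { 0x5UL, 0x1UL };	/* bits 0, 2 and BITS_PER_LONG */

	print_set_bits(fds, 2 * BITS_PER_LONG);
	return 0;
}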
diff --git a/kernel/fork.c b/kernel/fork.c
index ff1cad3b7bdc..3bd2280d79f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -114,6 +114,10 @@ int nr_processes(void)
114 return total; 114 return total;
115} 115}
116 116
117void __weak arch_release_task_struct(struct task_struct *tsk)
118{
119}
120
117#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 121#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
118static struct kmem_cache *task_struct_cachep; 122static struct kmem_cache *task_struct_cachep;
119 123
@@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node)
122 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); 126 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
123} 127}
124 128
125void __weak arch_release_task_struct(struct task_struct *tsk) { }
126
127static inline void free_task_struct(struct task_struct *tsk) 129static inline void free_task_struct(struct task_struct *tsk)
128{ 130{
129 arch_release_task_struct(tsk);
130 kmem_cache_free(task_struct_cachep, tsk); 131 kmem_cache_free(task_struct_cachep, tsk);
131} 132}
132#endif 133#endif
133 134
135void __weak arch_release_thread_info(struct thread_info *ti)
136{
137}
138
134#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR 139#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
135void __weak arch_release_thread_info(struct thread_info *ti) { }
136 140
137/* 141/*
138 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a 142 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
150 154
151static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
152{ 156{
153 arch_release_thread_info(ti);
154 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
155} 158}
156# else 159# else
@@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
164 167
165static void free_thread_info(struct thread_info *ti) 168static void free_thread_info(struct thread_info *ti)
166{ 169{
167 arch_release_thread_info(ti);
168 kmem_cache_free(thread_info_cache, ti); 170 kmem_cache_free(thread_info_cache, ti);
169} 171}
170 172
@@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
205void free_task(struct task_struct *tsk) 207void free_task(struct task_struct *tsk)
206{ 208{
207 account_kernel_stack(tsk->stack, -1); 209 account_kernel_stack(tsk->stack, -1);
210 arch_release_thread_info(tsk->stack);
208 free_thread_info(tsk->stack); 211 free_thread_info(tsk->stack);
209 rt_mutex_debug_task_free(tsk); 212 rt_mutex_debug_task_free(tsk);
210 ftrace_graph_exit_task(tsk); 213 ftrace_graph_exit_task(tsk);
211 put_seccomp_filter(tsk); 214 put_seccomp_filter(tsk);
215 arch_release_task_struct(tsk);
212 free_task_struct(tsk); 216 free_task_struct(tsk);
213} 217}
214EXPORT_SYMBOL(free_task); 218EXPORT_SYMBOL(free_task);
@@ -298,23 +302,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
298 return NULL; 302 return NULL;
299 303
300 ti = alloc_thread_info_node(tsk, node); 304 ti = alloc_thread_info_node(tsk, node);
301 if (!ti) { 305 if (!ti)
302 free_task_struct(tsk); 306 goto free_tsk;
303 return NULL;
304 }
305 307
306 err = arch_dup_task_struct(tsk, orig); 308 err = arch_dup_task_struct(tsk, orig);
309 if (err)
310 goto free_ti;
307 311
308 /*
309 * We defer looking at err, because we will need this setup
310 * for the clean up path to work correctly.
311 */
312 tsk->stack = ti; 312 tsk->stack = ti;
313 setup_thread_stack(tsk, orig);
314
315 if (err)
316 goto out;
317 313
314 setup_thread_stack(tsk, orig);
318 clear_user_return_notifier(tsk); 315 clear_user_return_notifier(tsk);
319 clear_tsk_need_resched(tsk); 316 clear_tsk_need_resched(tsk);
320 stackend = end_of_stack(tsk); 317 stackend = end_of_stack(tsk);
@@ -338,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
338 335
339 return tsk; 336 return tsk;
340 337
341out: 338free_ti:
342 free_thread_info(ti); 339 free_thread_info(ti);
340free_tsk:
343 free_task_struct(tsk); 341 free_task_struct(tsk);
344 return NULL; 342 return NULL;
345} 343}
@@ -383,16 +381,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
383 struct file *file; 381 struct file *file;
384 382
385 if (mpnt->vm_flags & VM_DONTCOPY) { 383 if (mpnt->vm_flags & VM_DONTCOPY) {
386 long pages = vma_pages(mpnt);
387 mm->total_vm -= pages;
388 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 384 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
389 -pages); 385 -vma_pages(mpnt));
390 continue; 386 continue;
391 } 387 }
392 charge = 0; 388 charge = 0;
393 if (mpnt->vm_flags & VM_ACCOUNT) { 389 if (mpnt->vm_flags & VM_ACCOUNT) {
394 unsigned long len; 390 unsigned long len = vma_pages(mpnt);
395 len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 391
396 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ 392 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
397 goto fail_nomem; 393 goto fail_nomem;
398 charge = len; 394 charge = len;
@@ -1310,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1310#ifdef CONFIG_DEBUG_MUTEXES 1306#ifdef CONFIG_DEBUG_MUTEXES
1311 p->blocked_on = NULL; /* not blocked yet */ 1307 p->blocked_on = NULL; /* not blocked yet */
1312#endif 1308#endif
1313#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1309#ifdef CONFIG_MEMCG
1314 p->memcg_batch.do_batch = 0; 1310 p->memcg_batch.do_batch = 0;
1315 p->memcg_batch.memcg = NULL; 1311 p->memcg_batch.memcg = NULL;
1316#endif 1312#endif
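The dup_task_struct() hunk above converts the early-return error handling into the usual goto-unwind pattern: each failure jumps to a label that frees only what has already been allocated, in reverse order. A generic sketch of that idiom with hypothetical allocations:

/*
 * Sketch of the goto-unwind error handling dup_task_struct() now uses.
 * The widget and its members are made-up stand-ins for the task_struct
 * and thread_info allocations.
 */
#include <stdlib.h>

struct widget { char *a; char *b; };

static struct widget *widget_create(void)
{
	struct widget *w = malloc(sizeof(*w));

	if (!w)
		return NULL;
	w->a = malloc(64);
	if (!w->a)
		goto free_w;		/* nothing else to undo yet */
	w->b = malloc(64);
	if (!w->b)
		goto free_a;		/* undo in reverse allocation order */
	return w;

free_a:
	free(w->a);
free_w:
	free(w);
	return NULL;
}

int main(void)
{
	struct widget *w = widget_create();

	if (w) {
		free(w->b);
		free(w->a);
		free(w);
	}
	return 0;
}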
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index bdb180325551..131ca176b497 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,7 +133,7 @@ irqreturn_t
133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) 133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
134{ 134{
135 irqreturn_t retval = IRQ_NONE; 135 irqreturn_t retval = IRQ_NONE;
136 unsigned int random = 0, irq = desc->irq_data.irq; 136 unsigned int flags = 0, irq = desc->irq_data.irq;
137 137
138 do { 138 do {
139 irqreturn_t res; 139 irqreturn_t res;
@@ -161,7 +161,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
161 161
162 /* Fall through to add to randomness */ 162 /* Fall through to add to randomness */
163 case IRQ_HANDLED: 163 case IRQ_HANDLED:
164 random |= action->flags; 164 flags |= action->flags;
165 break; 165 break;
166 166
167 default: 167 default:
@@ -172,8 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
172 action = action->next; 172 action = action->next;
173 } while (action); 173 } while (action);
174 174
175 if (random & IRQF_SAMPLE_RANDOM) 175 add_interrupt_randomness(irq, flags);
176 add_interrupt_randomness(irq);
177 176
178 if (!noirqdebug) 177 if (!noirqdebug)
179 note_interrupt(irq, desc, retval); 178 note_interrupt(irq, desc, retval);
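The handle_irq_event_percpu() change above accumulates the flags of every handler that returned IRQ_HANDLED and hands the combined value to add_interrupt_randomness() once, after the loop. A rough user-space sketch of that accumulation, with made-up flag values and an add_entropy() stub standing in for the kernel call:

/*
 * Sketch of the flag accumulation handle_irq_event_percpu() now does:
 * OR together the flags of every handler that ran, then pass the
 * combined value to the entropy code once, after the whole chain.
 */
#include <stdio.h>

#define EXAMPLE_FLAG 0x80	/* example flag value, not the kernel's */

struct irqaction {
	unsigned int flags;
	int (*handler)(void);
	struct irqaction *next;
};

static void add_entropy(int irq, unsigned int flags)
{
	printf("irq %d: accumulated flags 0x%x\n", irq, flags);
}

static void handle_irq_event(int irq, struct irqaction *action)
{
	unsigned int flags = 0;

	do {
		if (action->handler() == 1)	/* 1 stands in for IRQ_HANDLED */
			flags |= action->flags;
		action = action->next;
	} while (action);

	add_entropy(irq, flags);		/* one call, after the loop */
}

static int handled(void) { return 1; }

int main(void)
{
	struct irqaction b = { .flags = 0x01, .handler = handled };
	struct irqaction a = { .flags = EXAMPLE_FLAG, .handler = handled, .next = &b };

	handle_irq_event(10, &a);
	return 0;
}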
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 814c9ef6bba1..0a8e8f059627 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -893,22 +893,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
893 return -ENOSYS; 893 return -ENOSYS;
894 if (!try_module_get(desc->owner)) 894 if (!try_module_get(desc->owner))
895 return -ENODEV; 895 return -ENODEV;
896 /*
897 * Some drivers like serial.c use request_irq() heavily,
898 * so we have to be careful not to interfere with a
899 * running system.
900 */
901 if (new->flags & IRQF_SAMPLE_RANDOM) {
902 /*
903 * This function might sleep, we want to call it first,
904 * outside of the atomic block.
905 * Yes, this might clear the entropy pool if the wrong
906 * driver is attempted to be loaded, without actually
907 * installing a new handler, but is this really a problem,
908 * only the sysadmin is able to do this.
909 */
910 rand_initialize_irq(irq);
911 }
912 896
913 /* 897 /*
914 * Check whether the interrupt nests into another interrupt 898 * Check whether the interrupt nests into another interrupt
@@ -1354,7 +1338,6 @@ EXPORT_SYMBOL(free_irq);
1354 * Flags: 1338 * Flags:
1355 * 1339 *
1356 * IRQF_SHARED Interrupt is shared 1340 * IRQF_SHARED Interrupt is shared
1357 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1358 * IRQF_TRIGGER_* Specify active edge(s) or level 1341 * IRQF_TRIGGER_* Specify active edge(s) or level
1359 * 1342 *
1360 */ 1343 */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4e2e472f6aeb..0668d58d6413 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void)
1424 1424
1425void crash_save_vmcoreinfo(void) 1425void crash_save_vmcoreinfo(void)
1426{ 1426{
1427 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); 1427 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1428 update_vmcoreinfo_note(); 1428 update_vmcoreinfo_note();
1429} 1429}
1430 1430
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ff2c7cb86d77..6f99aead66c6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,6 +45,13 @@ extern int max_threads;
45 45
46static struct workqueue_struct *khelper_wq; 46static struct workqueue_struct *khelper_wq;
47 47
48/*
49 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
50 * locking to protect this global - it is private to the singleton khelper
51 * thread and should only ever be modified by that thread.
52 */
53static const struct task_struct *kmod_thread_locker;
54
48#define CAP_BSET (void *)1 55#define CAP_BSET (void *)1
49#define CAP_PI (void *)2 56#define CAP_PI (void *)2
50 57
@@ -221,6 +228,13 @@ fail:
221 return 0; 228 return 0;
222} 229}
223 230
231static int call_helper(void *data)
232{
233 /* Worker thread started blocking khelper thread. */
234 kmod_thread_locker = current;
235 return ____call_usermodehelper(data);
236}
237
224static void call_usermodehelper_freeinfo(struct subprocess_info *info) 238static void call_usermodehelper_freeinfo(struct subprocess_info *info)
225{ 239{
226 if (info->cleanup) 240 if (info->cleanup)
@@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work)
295 if (wait == UMH_WAIT_PROC) 309 if (wait == UMH_WAIT_PROC)
296 pid = kernel_thread(wait_for_helper, sub_info, 310 pid = kernel_thread(wait_for_helper, sub_info,
297 CLONE_FS | CLONE_FILES | SIGCHLD); 311 CLONE_FS | CLONE_FILES | SIGCHLD);
298 else 312 else {
299 pid = kernel_thread(____call_usermodehelper, sub_info, 313 pid = kernel_thread(call_helper, sub_info,
300 CLONE_VFORK | SIGCHLD); 314 CLONE_VFORK | SIGCHLD);
315 /* Worker thread stopped blocking khelper thread. */
316 kmod_thread_locker = NULL;
317 }
301 318
302 switch (wait) { 319 switch (wait) {
303 case UMH_NO_WAIT: 320 case UMH_NO_WAIT:
@@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
548 retval = -EBUSY; 565 retval = -EBUSY;
549 goto out; 566 goto out;
550 } 567 }
568 /*
569 * Worker thread must not wait for khelper thread at below
570 * wait_for_completion() if the thread was created with CLONE_VFORK
571 * flag, for khelper thread is already waiting for the thread at
572 * wait_for_completion() in do_fork().
573 */
574 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
575 retval = -EBUSY;
576 goto out;
577 }
551 578
552 sub_info->complete = &done; 579 sub_info->complete = &done;
553 sub_info->wait = wait; 580 sub_info->wait = wait;
@@ -577,6 +604,12 @@ unlock:
577 return retval; 604 return retval;
578} 605}
579 606
607/*
608 * call_usermodehelper_fns() will not run the caller-provided cleanup function
609 * if a memory allocation failure is experienced. So the caller might need to
610 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
611 * the necessaary cleanup within the caller.
612 */
580int call_usermodehelper_fns( 613int call_usermodehelper_fns(
581 char *path, char **argv, char **envp, int wait, 614 char *path, char **argv, char **envp, int wait,
582 int (*init)(struct subprocess_info *info, struct cred *new), 615 int (*init)(struct subprocess_info *info, struct cred *new),
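The kmod.c changes above record which task the khelper thread is currently blocked on (kmod_thread_locker) and make call_usermodehelper_exec() bail out with -EBUSY when that same task submits another synchronous request, which would deadlock. A loose user-space analogue of that self-wait check, tracking thread identity with pthread_self() purely for illustration:

/*
 * Loose analogue of the kmod_thread_locker check: remember which
 * thread the single worker is currently blocked on, and refuse a
 * request that would make that same thread wait on itself.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_t locker;
static int locker_valid;

static int submit_and_wait(void (*fn)(void))
{
	if (locker_valid && pthread_equal(locker, pthread_self()))
		return -EBUSY;		/* waiting here would deadlock */

	locker = pthread_self();
	locker_valid = 1;
	fn();				/* stand-in for running the helper */
	locker_valid = 0;
	return 0;
}

static void nested(void)
{
	/* A helper that tries to submit another request from inside. */
	int ret = submit_and_wait(NULL);

	printf("nested request returned %d (expected %d)\n", ret, -EBUSY);
}

int main(void)
{
	submit_and_wait(nested);
	return 0;
}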
diff --git a/kernel/panic.c b/kernel/panic.c
index d2a5f4ecc6dd..e1b2822fff97 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -75,6 +75,14 @@ void panic(const char *fmt, ...)
75 int state = 0; 75 int state = 0;
76 76
77 /* 77 /*
78 * Disable local interrupts. This will prevent panic_smp_self_stop
79 * from deadlocking the first cpu that invokes the panic, since
80 * there is nothing to prevent an interrupt handler (that runs
81 * after the panic_lock is acquired) from invoking panic again.
82 */
83 local_irq_disable();
84
85 /*
78 * It's possible to come here directly from a panic-assertion and 86 * It's possible to come here directly from a panic-assertion and
79 * not have preempt disabled. Some functions called from here want 87 * not have preempt disabled. Some functions called from here want
80 * preempt to be disabled. No point enabling it later though... 88 * preempt to be disabled. No point enabling it later though...
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..1da39ea248fd 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -178,6 +178,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
178 arch_suspend_enable_irqs(); 178 arch_suspend_enable_irqs();
179 BUG_ON(irqs_disabled()); 179 BUG_ON(irqs_disabled());
180 180
181 /* Kick the lockup detector */
182 lockup_detector_bootcpu_resume();
183
181 Enable_cpus: 184 Enable_cpus:
182 enable_nonboot_cpus(); 185 enable_nonboot_cpus();
183 186
diff --git a/kernel/printk.c b/kernel/printk.c
index ac4bc9e79465..6a76ab9d4476 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -216,6 +216,7 @@ struct log {
216 */ 216 */
217static DEFINE_RAW_SPINLOCK(logbuf_lock); 217static DEFINE_RAW_SPINLOCK(logbuf_lock);
218 218
219#ifdef CONFIG_PRINTK
219/* the next printk record to read by syslog(READ) or /proc/kmsg */ 220/* the next printk record to read by syslog(READ) or /proc/kmsg */
220static u64 syslog_seq; 221static u64 syslog_seq;
221static u32 syslog_idx; 222static u32 syslog_idx;
@@ -228,14 +229,19 @@ static u32 log_first_idx;
228 229
229/* index and sequence number of the next record to store in the buffer */ 230/* index and sequence number of the next record to store in the buffer */
230static u64 log_next_seq; 231static u64 log_next_seq;
231#ifdef CONFIG_PRINTK
232static u32 log_next_idx; 232static u32 log_next_idx;
233 233
234/* the next printk record to write to the console */
235static u64 console_seq;
236static u32 console_idx;
237static enum log_flags console_prev;
238
234/* the next printk record to read after the last 'clear' command */ 239/* the next printk record to read after the last 'clear' command */
235static u64 clear_seq; 240static u64 clear_seq;
236static u32 clear_idx; 241static u32 clear_idx;
237 242
238#define LOG_LINE_MAX 1024 243#define PREFIX_MAX 32
244#define LOG_LINE_MAX 1024 - PREFIX_MAX
239 245
240/* record buffer */ 246/* record buffer */
241#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 247#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -360,6 +366,7 @@ static void log_store(int facility, int level,
360struct devkmsg_user { 366struct devkmsg_user {
361 u64 seq; 367 u64 seq;
362 u32 idx; 368 u32 idx;
369 enum log_flags prev;
363 struct mutex lock; 370 struct mutex lock;
364 char buf[8192]; 371 char buf[8192];
365}; 372};
@@ -382,8 +389,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
382 389
383 line = buf; 390 line = buf;
384 for (i = 0; i < count; i++) { 391 for (i = 0; i < count; i++) {
385 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) 392 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
393 ret = -EFAULT;
386 goto out; 394 goto out;
395 }
387 line += iv[i].iov_len; 396 line += iv[i].iov_len;
388 } 397 }
389 398
@@ -425,6 +434,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
425 struct log *msg; 434 struct log *msg;
426 u64 ts_usec; 435 u64 ts_usec;
427 size_t i; 436 size_t i;
437 char cont = '-';
428 size_t len; 438 size_t len;
429 ssize_t ret; 439 ssize_t ret;
430 440
@@ -462,8 +472,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
462 msg = log_from_idx(user->idx); 472 msg = log_from_idx(user->idx);
463 ts_usec = msg->ts_nsec; 473 ts_usec = msg->ts_nsec;
464 do_div(ts_usec, 1000); 474 do_div(ts_usec, 1000);
465 len = sprintf(user->buf, "%u,%llu,%llu;", 475
466 (msg->facility << 3) | msg->level, user->seq, ts_usec); 476 /*
477 * If we couldn't merge continuation line fragments during the print,
478 * export the stored flags to allow an optional external merge of the
479 * records. Merging the records isn't always neccessarily correct, like
480 * when we hit a race during printing. In most cases though, it produces
481 * better readable output. 'c' in the record flags mark the first
482 * fragment of a line, '+' the following.
483 */
484 if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
485 cont = 'c';
486 else if ((msg->flags & LOG_CONT) ||
487 ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
488 cont = '+';
489
490 len = sprintf(user->buf, "%u,%llu,%llu,%c;",
491 (msg->facility << 3) | msg->level,
492 user->seq, ts_usec, cont);
493 user->prev = msg->flags;
467 494
468 /* escape non-printable characters */ 495 /* escape non-printable characters */
469 for (i = 0; i < msg->text_len; i++) { 496 for (i = 0; i < msg->text_len; i++) {
@@ -646,6 +673,15 @@ void log_buf_kexec_setup(void)
646 VMCOREINFO_SYMBOL(log_buf_len); 673 VMCOREINFO_SYMBOL(log_buf_len);
647 VMCOREINFO_SYMBOL(log_first_idx); 674 VMCOREINFO_SYMBOL(log_first_idx);
648 VMCOREINFO_SYMBOL(log_next_idx); 675 VMCOREINFO_SYMBOL(log_next_idx);
676 /*
677 * Export struct log size and field offsets. User space tools can
678 * parse it and detect any changes to structure down the line.
679 */
680 VMCOREINFO_STRUCT_SIZE(log);
681 VMCOREINFO_OFFSET(log, ts_nsec);
682 VMCOREINFO_OFFSET(log, len);
683 VMCOREINFO_OFFSET(log, text_len);
684 VMCOREINFO_OFFSET(log, dict_len);
649} 685}
650#endif 686#endif
651 687
@@ -876,7 +912,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
876 912
877 if (buf) { 913 if (buf) {
878 if (print_prefix(msg, syslog, NULL) + 914 if (print_prefix(msg, syslog, NULL) +
879 text_len + 1>= size - len) 915 text_len + 1 >= size - len)
880 break; 916 break;
881 917
882 if (prefix) 918 if (prefix)
@@ -907,7 +943,7 @@ static int syslog_print(char __user *buf, int size)
907 struct log *msg; 943 struct log *msg;
908 int len = 0; 944 int len = 0;
909 945
910 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 946 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
911 if (!text) 947 if (!text)
912 return -ENOMEM; 948 return -ENOMEM;
913 949
@@ -930,7 +966,8 @@ static int syslog_print(char __user *buf, int size)
930 966
931 skip = syslog_partial; 967 skip = syslog_partial;
932 msg = log_from_idx(syslog_idx); 968 msg = log_from_idx(syslog_idx);
933 n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); 969 n = msg_print_text(msg, syslog_prev, true, text,
970 LOG_LINE_MAX + PREFIX_MAX);
934 if (n - syslog_partial <= size) { 971 if (n - syslog_partial <= size) {
935 /* message fits into buffer, move forward */ 972 /* message fits into buffer, move forward */
936 syslog_idx = log_next(syslog_idx); 973 syslog_idx = log_next(syslog_idx);
@@ -969,7 +1006,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
969 char *text; 1006 char *text;
970 int len = 0; 1007 int len = 0;
971 1008
972 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 1009 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
973 if (!text) 1010 if (!text)
974 return -ENOMEM; 1011 return -ENOMEM;
975 1012
@@ -1022,7 +1059,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1022 struct log *msg = log_from_idx(idx); 1059 struct log *msg = log_from_idx(idx);
1023 int textlen; 1060 int textlen;
1024 1061
1025 textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); 1062 textlen = msg_print_text(msg, prev, true, text,
1063 LOG_LINE_MAX + PREFIX_MAX);
1026 if (textlen < 0) { 1064 if (textlen < 0) {
1027 len = textlen; 1065 len = textlen;
1028 break; 1066 break;
@@ -1349,20 +1387,36 @@ static struct cont {
1349 u64 ts_nsec; /* time of first print */ 1387 u64 ts_nsec; /* time of first print */
1350 u8 level; /* log level of first message */ 1388 u8 level; /* log level of first message */
1351 u8 facility; /* log level of first message */ 1389 u8 facility; /* log level of first message */
1390 enum log_flags flags; /* prefix, newline flags */
1352 bool flushed:1; /* buffer sealed and committed */ 1391 bool flushed:1; /* buffer sealed and committed */
1353} cont; 1392} cont;
1354 1393
1355static void cont_flush(void) 1394static void cont_flush(enum log_flags flags)
1356{ 1395{
1357 if (cont.flushed) 1396 if (cont.flushed)
1358 return; 1397 return;
1359 if (cont.len == 0) 1398 if (cont.len == 0)
1360 return; 1399 return;
1361 1400
1362 log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, 1401 if (cont.cons) {
1363 NULL, 0, cont.buf, cont.len); 1402 /*
1364 1403 * If a fragment of this line was directly flushed to the
1365 cont.flushed = true; 1404 * console; wait for the console to pick up the rest of the
1405 * line. LOG_NOCONS suppresses a duplicated output.
1406 */
1407 log_store(cont.facility, cont.level, flags | LOG_NOCONS,
1408 cont.ts_nsec, NULL, 0, cont.buf, cont.len);
1409 cont.flags = flags;
1410 cont.flushed = true;
1411 } else {
1412 /*
1413 * If no fragment of this line ever reached the console,
1414 * just submit it to the store and free the buffer.
1415 */
1416 log_store(cont.facility, cont.level, flags, 0,
1417 NULL, 0, cont.buf, cont.len);
1418 cont.len = 0;
1419 }
1366} 1420}
1367 1421
1368static bool cont_add(int facility, int level, const char *text, size_t len) 1422static bool cont_add(int facility, int level, const char *text, size_t len)
@@ -1371,7 +1425,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1371 return false; 1425 return false;
1372 1426
1373 if (cont.len + len > sizeof(cont.buf)) { 1427 if (cont.len + len > sizeof(cont.buf)) {
1374 cont_flush(); 1428 /* the line gets too long, split it up in separate records */
1429 cont_flush(LOG_CONT);
1375 return false; 1430 return false;
1376 } 1431 }
1377 1432
@@ -1380,12 +1435,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1380 cont.level = level; 1435 cont.level = level;
1381 cont.owner = current; 1436 cont.owner = current;
1382 cont.ts_nsec = local_clock(); 1437 cont.ts_nsec = local_clock();
1438 cont.flags = 0;
1383 cont.cons = 0; 1439 cont.cons = 0;
1384 cont.flushed = false; 1440 cont.flushed = false;
1385 } 1441 }
1386 1442
1387 memcpy(cont.buf + cont.len, text, len); 1443 memcpy(cont.buf + cont.len, text, len);
1388 cont.len += len; 1444 cont.len += len;
1445
1446 if (cont.len > (sizeof(cont.buf) * 80) / 100)
1447 cont_flush(LOG_CONT);
1448
1389 return true; 1449 return true;
1390} 1450}
1391 1451
@@ -1394,7 +1454,7 @@ static size_t cont_print_text(char *text, size_t size)
1394 size_t textlen = 0; 1454 size_t textlen = 0;
1395 size_t len; 1455 size_t len;
1396 1456
1397 if (cont.cons == 0) { 1457 if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
1398 textlen += print_time(cont.ts_nsec, text); 1458 textlen += print_time(cont.ts_nsec, text);
1399 size -= textlen; 1459 size -= textlen;
1400 } 1460 }
@@ -1409,7 +1469,8 @@ static size_t cont_print_text(char *text, size_t size)
1409 } 1469 }
1410 1470
1411 if (cont.flushed) { 1471 if (cont.flushed) {
1412 text[textlen++] = '\n'; 1472 if (cont.flags & LOG_NEWLINE)
1473 text[textlen++] = '\n';
1413 /* got everything, release buffer */ 1474 /* got everything, release buffer */
1414 cont.len = 0; 1475 cont.len = 0;
1415 } 1476 }
@@ -1481,17 +1542,23 @@ asmlinkage int vprintk_emit(int facility, int level,
1481 lflags |= LOG_NEWLINE; 1542 lflags |= LOG_NEWLINE;
1482 } 1543 }
1483 1544
1484 /* strip syslog prefix and extract log level or control flags */ 1545 /* strip kernel syslog prefix and extract log level or control flags */
1485 if (text[0] == '<' && text[1] && text[2] == '>') { 1546 if (facility == 0) {
1486 switch (text[1]) { 1547 int kern_level = printk_get_level(text);
1487 case '0' ... '7': 1548
1488 if (level == -1) 1549 if (kern_level) {
1489 level = text[1] - '0'; 1550 const char *end_of_header = printk_skip_level(text);
1490 case 'd': /* KERN_DEFAULT */ 1551 switch (kern_level) {
1491 lflags |= LOG_PREFIX; 1552 case '0' ... '7':
1492 case 'c': /* KERN_CONT */ 1553 if (level == -1)
1493 text += 3; 1554 level = kern_level - '0';
1494 text_len -= 3; 1555 case 'd': /* KERN_DEFAULT */
1556 lflags |= LOG_PREFIX;
1557 case 'c': /* KERN_CONT */
1558 break;
1559 }
1560 text_len -= end_of_header - text;
1561 text = (char *)end_of_header;
1495 } 1562 }
1496 } 1563 }
1497 1564
@@ -1507,7 +1574,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1507 * or another task also prints continuation lines. 1574 * or another task also prints continuation lines.
1508 */ 1575 */
1509 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) 1576 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
1510 cont_flush(); 1577 cont_flush(LOG_NEWLINE);
1511 1578
1512 /* buffer line if possible, otherwise store it right away */ 1579 /* buffer line if possible, otherwise store it right away */
1513 if (!cont_add(facility, level, text, text_len)) 1580 if (!cont_add(facility, level, text, text_len))
@@ -1525,7 +1592,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1525 if (cont.len && cont.owner == current) { 1592 if (cont.len && cont.owner == current) {
1526 if (!(lflags & LOG_PREFIX)) 1593 if (!(lflags & LOG_PREFIX))
1527 stored = cont_add(facility, level, text, text_len); 1594 stored = cont_add(facility, level, text, text_len);
1528 cont_flush(); 1595 cont_flush(LOG_NEWLINE);
1529 } 1596 }
1530 1597
1531 if (!stored) 1598 if (!stored)
@@ -1616,9 +1683,20 @@ asmlinkage int printk(const char *fmt, ...)
1616} 1683}
1617EXPORT_SYMBOL(printk); 1684EXPORT_SYMBOL(printk);
1618 1685
1619#else 1686#else /* CONFIG_PRINTK */
1620 1687
1688#define LOG_LINE_MAX 0
1689#define PREFIX_MAX 0
1621#define LOG_LINE_MAX 0 1690#define LOG_LINE_MAX 0
1691static u64 syslog_seq;
1692static u32 syslog_idx;
1693static u64 console_seq;
1694static u32 console_idx;
1695static enum log_flags syslog_prev;
1696static u64 log_first_seq;
1697static u32 log_first_idx;
1698static u64 log_next_seq;
1699static enum log_flags console_prev;
1622static struct cont { 1700static struct cont {
1623 size_t len; 1701 size_t len;
1624 size_t cons; 1702 size_t cons;
@@ -1902,10 +1980,34 @@ void wake_up_klogd(void)
1902 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1980 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1903} 1981}
1904 1982
1905/* the next printk record to write to the console */ 1983static void console_cont_flush(char *text, size_t size)
1906static u64 console_seq; 1984{
1907static u32 console_idx; 1985 unsigned long flags;
1908static enum log_flags console_prev; 1986 size_t len;
1987
1988 raw_spin_lock_irqsave(&logbuf_lock, flags);
1989
1990 if (!cont.len)
1991 goto out;
1992
1993 /*
1994 * We still queue earlier records, likely because the console was
1995 * busy. The earlier ones need to be printed before this one, we
1996 * did not flush any fragment so far, so just let it queue up.
1997 */
1998 if (console_seq < log_next_seq && !cont.cons)
1999 goto out;
2000
2001 len = cont_print_text(text, size);
2002 raw_spin_unlock(&logbuf_lock);
2003 stop_critical_timings();
2004 call_console_drivers(cont.level, text, len);
2005 start_critical_timings();
2006 local_irq_restore(flags);
2007 return;
2008out:
2009 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2010}
1909 2011
1910/** 2012/**
1911 * console_unlock - unlock the console system 2013 * console_unlock - unlock the console system
@@ -1923,7 +2025,7 @@ static enum log_flags console_prev;
1923 */ 2025 */
1924void console_unlock(void) 2026void console_unlock(void)
1925{ 2027{
1926 static char text[LOG_LINE_MAX]; 2028 static char text[LOG_LINE_MAX + PREFIX_MAX];
1927 static u64 seen_seq; 2029 static u64 seen_seq;
1928 unsigned long flags; 2030 unsigned long flags;
1929 bool wake_klogd = false; 2031 bool wake_klogd = false;
@@ -1937,19 +2039,7 @@ void console_unlock(void)
1937 console_may_schedule = 0; 2039 console_may_schedule = 0;
1938 2040
1939 /* flush buffered message fragment immediately to console */ 2041 /* flush buffered message fragment immediately to console */
1940 raw_spin_lock_irqsave(&logbuf_lock, flags); 2042 console_cont_flush(text, sizeof(text));
1941 if (cont.len && (cont.cons < cont.len || cont.flushed)) {
1942 size_t len;
1943
1944 len = cont_print_text(text, sizeof(text));
1945 raw_spin_unlock(&logbuf_lock);
1946 stop_critical_timings();
1947 call_console_drivers(cont.level, text, len);
1948 start_critical_timings();
1949 local_irq_restore(flags);
1950 } else
1951 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1952
1953again: 2043again:
1954 for (;;) { 2044 for (;;) {
1955 struct log *msg; 2045 struct log *msg;
@@ -1986,6 +2076,7 @@ skip:
1986 * will properly dump everything later. 2076 * will properly dump everything later.
1987 */ 2077 */
1988 msg->flags &= ~LOG_NOCONS; 2078 msg->flags &= ~LOG_NOCONS;
2079 console_prev = msg->flags;
1989 goto skip; 2080 goto skip;
1990 } 2081 }
1991 2082
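Among the printk changes above, devkmsg_read() now exports a continuation marker per record: '-' for a self-contained message, 'c' for the first fragment of a continuation line, '+' for follow-on fragments. A small sketch of that flag-to-character mapping, with illustrative flag values rather than the kernel's enum log_flags:

/*
 * Sketch of the continuation marker devkmsg_read() now emits, driven
 * by the current record's flags and the flags of the previous record.
 * The flag values below are illustrative only.
 */
#include <stdio.h>

enum log_flags {
	LOG_NEWLINE = 1,	/* record ends a line */
	LOG_PREFIX  = 2,	/* record starts a new line */
	LOG_CONT    = 4,	/* record is a fragment of a line */
};

static char cont_char(enum log_flags prev, enum log_flags cur)
{
	if ((cur & LOG_CONT) && !(prev & LOG_CONT))
		return 'c';			/* first fragment */
	if ((cur & LOG_CONT) ||
	    ((prev & LOG_CONT) && !(cur & LOG_PREFIX)))
		return '+';			/* follow-on fragment */
	return '-';				/* ordinary record */
}

int main(void)
{
	/* A full line, then a line emitted as two fragments plus its end. */
	enum log_flags records[] = { LOG_NEWLINE, LOG_CONT, LOG_CONT, LOG_NEWLINE };
	enum log_flags prev = LOG_NEWLINE;
	unsigned int i;

	for (i = 0; i < sizeof(records) / sizeof(records[0]); i++) {
		printf("record %u: %c\n", i, cont_char(prev, records[i]));
		prev = records[i];
	}
	return 0;
}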
diff --git a/kernel/resource.c b/kernel/resource.c
index dc8b47764443..34d45886ee84 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,6 +7,8 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/export.h> 12#include <linux/export.h>
11#include <linux/errno.h> 13#include <linux/errno.h>
12#include <linux/ioport.h> 14#include <linux/ioport.h>
@@ -791,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root,
791 resource_size_t start, resource_size_t end, 793 resource_size_t start, resource_size_t end,
792 const char *name) 794 const char *name)
793{ 795{
796 int abort = 0;
797
794 write_lock(&resource_lock); 798 write_lock(&resource_lock);
795 __reserve_region_with_split(root, start, end, name); 799 if (root->start > start || root->end < end) {
800 pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
801 (unsigned long long)start, (unsigned long long)end,
802 root);
803 if (start > root->end || end < root->start)
804 abort = 1;
805 else {
806 if (end > root->end)
807 end = root->end;
808 if (start < root->start)
809 start = root->start;
810 pr_err("fixing request to [0x%llx-0x%llx]\n",
811 (unsigned long long)start,
812 (unsigned long long)end);
813 }
814 dump_stack();
815 }
816 if (!abort)
817 __reserve_region_with_split(root, start, end, name);
796 write_unlock(&resource_lock); 818 write_unlock(&resource_lock);
797} 819}
798 820
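The reserve_region_with_split() change above refuses requests that lie entirely outside the root resource and trims requests that only partially overlap it before reserving. A small standalone sketch of that clamp-or-abort step, using plain integers instead of struct resource (illustrative only):

#include <stdio.h>

/* Clamp [start, end] into [root_start, root_end].
 * Returns 0 and updates *start/*end on (possibly partial) overlap,
 * -1 if the request lies entirely outside the root range. */
static int clamp_into_root(unsigned long long root_start,
			   unsigned long long root_end,
			   unsigned long long *start,
			   unsigned long long *end)
{
	if (*start > root_end || *end < root_start)
		return -1;			/* no overlap: abort */
	if (*end > root_end)
		*end = root_end;		/* trim the tail */
	if (*start < root_start)
		*start = root_start;		/* trim the head */
	return 0;
}

int main(void)
{
	unsigned long long s = 0x90000, e = 0x120000;

	if (clamp_into_root(0x100000, 0x1fffff, &s, &e) == 0)
		printf("fixing request to [0x%llx-0x%llx]\n", s, e);
	return 0;
}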
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd44c1ba..d325c4b2dcbb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1097 * 1097 *
1098 * sched_move_task() holds both and thus holding either pins the cgroup, 1098 * sched_move_task() holds both and thus holding either pins the cgroup,
1099 * see set_task_rq(). 1099 * see task_group().
1100 * 1100 *
1101 * Furthermore, all task_rq users should acquire both locks, see 1101 * Furthermore, all task_rq users should acquire both locks, see
1102 * task_rq_lock(). 1102 * task_rq_lock().
@@ -1910,12 +1910,12 @@ static inline void
1910prepare_task_switch(struct rq *rq, struct task_struct *prev, 1910prepare_task_switch(struct rq *rq, struct task_struct *prev,
1911 struct task_struct *next) 1911 struct task_struct *next)
1912{ 1912{
1913 trace_sched_switch(prev, next);
1913 sched_info_switch(prev, next); 1914 sched_info_switch(prev, next);
1914 perf_event_task_sched_out(prev, next); 1915 perf_event_task_sched_out(prev, next);
1915 fire_sched_out_preempt_notifiers(prev, next); 1916 fire_sched_out_preempt_notifiers(prev, next);
1916 prepare_lock_switch(rq, next); 1917 prepare_lock_switch(rq, next);
1917 prepare_arch_switch(next); 1918 prepare_arch_switch(next);
1918 trace_sched_switch(prev, next);
1919} 1919}
1920 1920
1921/** 1921/**
@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6024 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 6024 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
6025 * allows us to avoid some pointer chasing in select_idle_sibling(). 6025 * allows us to avoid some pointer chasing in select_idle_sibling().
6026 * 6026 *
6027 * Iterate domains and sched_groups downward, assigning CPUs to be
6028 * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
6029 * due to random perturbation self canceling, ie sw buddies pull
6030 * their counterpart to their CPU's hw counterpart.
6031 *
6027 * Also keep a unique ID per domain (we use the first cpu number in 6032 * Also keep a unique ID per domain (we use the first cpu number in
6028 * the cpumask of the domain), this allows us to quickly tell if 6033 * the cpumask of the domain), this allows us to quickly tell if
6029 * two cpus are in the same cache domain, see cpus_share_cache(). 6034 * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
6037 int id = cpu; 6042 int id = cpu;
6038 6043
6039 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 6044 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6040 if (sd) 6045 if (sd) {
6046 struct sched_domain *tmp = sd;
6047 struct sched_group *sg, *prev;
6048 bool right;
6049
6050 /*
6051 * Traverse to first CPU in group, and count hops
6052 * to cpu from there, switching direction on each
6053 * hop, never ever pointing the last CPU rightward.
6054 */
6055 do {
6056 id = cpumask_first(sched_domain_span(tmp));
6057 prev = sg = tmp->groups;
6058 right = 1;
6059
6060 while (cpumask_first(sched_group_cpus(sg)) != id)
6061 sg = sg->next;
6062
6063 while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
6064 prev = sg;
6065 sg = sg->next;
6066 right = !right;
6067 }
6068
6069 /* A CPU went down, never point back to domain start. */
6070 if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
6071 right = false;
6072
6073 sg = right ? sg->next : prev;
6074 tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
6075 } while ((tmp = tmp->child));
6076
6041 id = cpumask_first(sched_domain_span(sd)); 6077 id = cpumask_first(sched_domain_span(sd));
6078 }
6042 6079
6043 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 6080 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6044 per_cpu(sd_llc_id, cpu) = id; 6081 per_cpu(sd_llc_id, cpu) = id;
@@ -7097,34 +7134,66 @@ match2:
7097 mutex_unlock(&sched_domains_mutex); 7134 mutex_unlock(&sched_domains_mutex);
7098} 7135}
7099 7136
7137static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
7138
7100/* 7139/*
7101 * Update cpusets according to cpu_active mask. If cpusets are 7140 * Update cpusets according to cpu_active mask. If cpusets are
7102 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7141 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7103 * around partition_sched_domains(). 7142 * around partition_sched_domains().
7143 *
7144 * If we come here as part of a suspend/resume, don't touch cpusets because we
7145 * want to restore it back to its original state upon resume anyway.
7104 */ 7146 */
7105static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7147static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7106 void *hcpu) 7148 void *hcpu)
7107{ 7149{
7108 switch (action & ~CPU_TASKS_FROZEN) { 7150 switch (action) {
7151 case CPU_ONLINE_FROZEN:
7152 case CPU_DOWN_FAILED_FROZEN:
7153
7154 /*
7155 * num_cpus_frozen tracks how many CPUs are involved in suspend
7156 * resume sequence. As long as this is not the last online
7157 * operation in the resume sequence, just build a single sched
7158 * domain, ignoring cpusets.
7159 */
7160 num_cpus_frozen--;
7161 if (likely(num_cpus_frozen)) {
7162 partition_sched_domains(1, NULL, NULL);
7163 break;
7164 }
7165
7166 /*
7167 * This is the last CPU online operation. So fall through and
7168 * restore the original sched domains by considering the
7169 * cpuset configurations.
7170 */
7171
7109 case CPU_ONLINE: 7172 case CPU_ONLINE:
7110 case CPU_DOWN_FAILED: 7173 case CPU_DOWN_FAILED:
7111 cpuset_update_active_cpus(); 7174 cpuset_update_active_cpus(true);
7112 return NOTIFY_OK; 7175 break;
7113 default: 7176 default:
7114 return NOTIFY_DONE; 7177 return NOTIFY_DONE;
7115 } 7178 }
7179 return NOTIFY_OK;
7116} 7180}
7117 7181
7118static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7182static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7119 void *hcpu) 7183 void *hcpu)
7120{ 7184{
7121 switch (action & ~CPU_TASKS_FROZEN) { 7185 switch (action) {
7122 case CPU_DOWN_PREPARE: 7186 case CPU_DOWN_PREPARE:
7123 cpuset_update_active_cpus(); 7187 cpuset_update_active_cpus(false);
7124 return NOTIFY_OK; 7188 break;
7189 case CPU_DOWN_PREPARE_FROZEN:
7190 num_cpus_frozen++;
7191 partition_sched_domains(1, NULL, NULL);
7192 break;
7125 default: 7193 default:
7126 return NOTIFY_DONE; 7194 return NOTIFY_DONE;
7127 } 7195 }
7196 return NOTIFY_OK;
7128} 7197}
7129 7198
7130void __init sched_init_smp(void) 7199void __init sched_init_smp(void)
@@ -7589,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg)
7589 */ 7658 */
7590void sched_move_task(struct task_struct *tsk) 7659void sched_move_task(struct task_struct *tsk)
7591{ 7660{
7661 struct task_group *tg;
7592 int on_rq, running; 7662 int on_rq, running;
7593 unsigned long flags; 7663 unsigned long flags;
7594 struct rq *rq; 7664 struct rq *rq;
@@ -7603,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk)
7603 if (unlikely(running)) 7673 if (unlikely(running))
7604 tsk->sched_class->put_prev_task(rq, tsk); 7674 tsk->sched_class->put_prev_task(rq, tsk);
7605 7675
7676 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
7677 lockdep_is_held(&tsk->sighand->siglock)),
7678 struct task_group, css);
7679 tg = autogroup_task_group(tsk, tg);
7680 tsk->sched_task_group = tg;
7681
7606#ifdef CONFIG_FAIR_GROUP_SCHED 7682#ifdef CONFIG_FAIR_GROUP_SCHED
7607 if (tsk->sched_class->task_move_group) 7683 if (tsk->sched_class->task_move_group)
7608 tsk->sched_class->task_move_group(tsk, on_rq); 7684 tsk->sched_class->task_move_group(tsk, on_rq);
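The cpuset_cpu_active()/cpuset_cpu_inactive() notifiers above count how many CPUs were taken down with the _FROZEN variants so that, during resume, only the final CPU-online event rebuilds the cpuset-derived sched domains; every earlier event just keeps a single flat domain. A toy model of that counter driven by an invented event sequence (the enum and return convention below are made up for illustration):

#include <stdio.h>

enum hp_event { DOWN_FROZEN, UP_FROZEN, UP_NORMAL };

static int num_cpus_frozen;

/* Returns 1 when the full cpuset configuration should be rebuilt,
 * 0 when a single flat sched domain is enough for now. */
static int handle_event(enum hp_event ev)
{
	switch (ev) {
	case DOWN_FROZEN:
		num_cpus_frozen++;
		return 0;			/* suspend path: flat domain */
	case UP_FROZEN:
		num_cpus_frozen--;
		if (num_cpus_frozen)
			return 0;		/* more CPUs still to come up */
		/* fall through: last CPU of the resume, restore cpusets */
	case UP_NORMAL:
		return 1;
	}
	return 0;
}

int main(void)
{
	enum hp_event seq[] = { DOWN_FROZEN, DOWN_FROZEN, DOWN_FROZEN,
				UP_FROZEN, UP_FROZEN, UP_FROZEN };

	for (unsigned i = 0; i < sizeof(seq) / sizeof(seq[0]); i++)
		printf("event %u -> rebuild cpusets: %d\n",
		       i, handle_event(seq[i]));
	return 0;
}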
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..22321db64952 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
2637 int cpu = smp_processor_id(); 2637 int cpu = smp_processor_id();
2638 int prev_cpu = task_cpu(p); 2638 int prev_cpu = task_cpu(p);
2639 struct sched_domain *sd; 2639 struct sched_domain *sd;
2640 struct sched_group *sg;
2641 int i;
2642 2640
2643 /* 2641 /*
2644 * If the task is going to be woken-up on this cpu and if it is 2642 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
2655 return prev_cpu; 2653 return prev_cpu;
2656 2654
2657 /* 2655 /*
2658 * Otherwise, iterate the domains and find an eligible idle cpu. 2656 * Otherwise, check assigned siblings to find an eligible idle cpu.
2659 */ 2657 */
2660 sd = rcu_dereference(per_cpu(sd_llc, target)); 2658 sd = rcu_dereference(per_cpu(sd_llc, target));
2661 for_each_lower_domain(sd) {
2662 sg = sd->groups;
2663 do {
2664 if (!cpumask_intersects(sched_group_cpus(sg),
2665 tsk_cpus_allowed(p)))
2666 goto next;
2667
2668 for_each_cpu(i, sched_group_cpus(sg)) {
2669 if (!idle_cpu(i))
2670 goto next;
2671 }
2672 2659
2673 target = cpumask_first_and(sched_group_cpus(sg), 2660 for_each_lower_domain(sd) {
2674 tsk_cpus_allowed(p)); 2661 if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
2675 goto done; 2662 continue;
2676next: 2663 if (idle_cpu(sd->idle_buddy))
2677 sg = sg->next; 2664 return sd->idle_buddy;
2678 } while (sg != sd->groups);
2679 } 2665 }
2680done: 2666
2681 return target; 2667 return target;
2682} 2668}
2683 2669
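After the select_idle_sibling() hunk above, wake-up placement no longer walks every group of every lower domain; it only tests the idle_buddy CPU recorded per domain level. A compact sketch of that lookup, with affinity and idleness faked by plain arrays (toy_domain, cpu_allowed and cpu_idle are invented stand-ins):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* One entry per domain level, from the LLC domain downward. */
struct toy_domain {
	int idle_buddy;		/* buddy CPU chosen when the domain was built */
};

static bool cpu_allowed[NR_CPUS];	/* stand-in for tsk_cpus_allowed() */
static bool cpu_idle[NR_CPUS];		/* stand-in for idle_cpu() */

/* Check only the pre-computed buddy of each level instead of scanning
 * every group; fall back to the original target if none is idle. */
static int pick_idle_sibling(const struct toy_domain *levels, int nr, int target)
{
	for (int i = 0; i < nr; i++) {
		int buddy = levels[i].idle_buddy;

		if (!cpu_allowed[buddy])
			continue;
		if (cpu_idle[buddy])
			return buddy;
	}
	return target;
}

int main(void)
{
	struct toy_domain levels[] = { { .idle_buddy = 3 }, { .idle_buddy = 1 } };

	for (int i = 0; i < NR_CPUS; i++)
		cpu_allowed[i] = true;
	cpu_idle[1] = true;			/* only CPU 1 is idle */

	printf("chosen cpu: %d\n", pick_idle_sibling(levels, 2, 0));
	return 0;
}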
@@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3068 3054
3069#define LBF_ALL_PINNED 0x01 3055#define LBF_ALL_PINNED 0x01
3070#define LBF_NEED_BREAK 0x02 3056#define LBF_NEED_BREAK 0x02
3057#define LBF_SOME_PINNED 0x04
3071 3058
3072struct lb_env { 3059struct lb_env {
3073 struct sched_domain *sd; 3060 struct sched_domain *sd;
3074 3061
3075 int src_cpu;
3076 struct rq *src_rq; 3062 struct rq *src_rq;
3063 int src_cpu;
3077 3064
3078 int dst_cpu; 3065 int dst_cpu;
3079 struct rq *dst_rq; 3066 struct rq *dst_rq;
3080 3067
3068 struct cpumask *dst_grpmask;
3069 int new_dst_cpu;
3081 enum cpu_idle_type idle; 3070 enum cpu_idle_type idle;
3082 long imbalance; 3071 long imbalance;
3083 unsigned int flags; 3072 unsigned int flags;
@@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3145 * 3) are cache-hot on their current CPU. 3134 * 3) are cache-hot on their current CPU.
3146 */ 3135 */
3147 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3136 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3137 int new_dst_cpu;
3138
3148 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3139 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3140
3141 /*
3142 * Remember if this task can be migrated to any other cpu in
3143 * our sched_group. We may want to revisit it if we couldn't
3144 * meet load balance goals by pulling other tasks on src_cpu.
3145 *
3146 * Also avoid computing new_dst_cpu if we have already computed
3147 * one in the current iteration.
3148 */
3149 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3150 return 0;
3151
3152 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3153 tsk_cpus_allowed(p));
3154 if (new_dst_cpu < nr_cpu_ids) {
3155 env->flags |= LBF_SOME_PINNED;
3156 env->new_dst_cpu = new_dst_cpu;
3157 }
3149 return 0; 3158 return 0;
3150 } 3159 }
3160
3161 /* Record that we found at least one task that could run on dst_cpu */
3151 env->flags &= ~LBF_ALL_PINNED; 3162 env->flags &= ~LBF_ALL_PINNED;
3152 3163
3153 if (task_running(env->src_rq, p)) { 3164 if (task_running(env->src_rq, p)) {
@@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4227 struct sched_domain *sd, enum cpu_idle_type idle, 4238 struct sched_domain *sd, enum cpu_idle_type idle,
4228 int *balance) 4239 int *balance)
4229{ 4240{
4230 int ld_moved, active_balance = 0; 4241 int ld_moved, cur_ld_moved, active_balance = 0;
4242 int lb_iterations, max_lb_iterations;
4231 struct sched_group *group; 4243 struct sched_group *group;
4232 struct rq *busiest; 4244 struct rq *busiest;
4233 unsigned long flags; 4245 unsigned long flags;
@@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4237 .sd = sd, 4249 .sd = sd,
4238 .dst_cpu = this_cpu, 4250 .dst_cpu = this_cpu,
4239 .dst_rq = this_rq, 4251 .dst_rq = this_rq,
4252 .dst_grpmask = sched_group_cpus(sd->groups),
4240 .idle = idle, 4253 .idle = idle,
4241 .loop_break = sched_nr_migrate_break, 4254 .loop_break = sched_nr_migrate_break,
4242 }; 4255 };
4243 4256
4244 cpumask_copy(cpus, cpu_active_mask); 4257 cpumask_copy(cpus, cpu_active_mask);
4258 max_lb_iterations = cpumask_weight(env.dst_grpmask);
4245 4259
4246 schedstat_inc(sd, lb_count[idle]); 4260 schedstat_inc(sd, lb_count[idle]);
4247 4261
@@ -4267,6 +4281,7 @@ redo:
4267 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 4281 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4268 4282
4269 ld_moved = 0; 4283 ld_moved = 0;
4284 lb_iterations = 1;
4270 if (busiest->nr_running > 1) { 4285 if (busiest->nr_running > 1) {
4271 /* 4286 /*
4272 * Attempt to move tasks. If find_busiest_group has found 4287 * Attempt to move tasks. If find_busiest_group has found
@@ -4284,7 +4299,13 @@ more_balance:
4284 double_rq_lock(this_rq, busiest); 4299 double_rq_lock(this_rq, busiest);
4285 if (!env.loop) 4300 if (!env.loop)
4286 update_h_load(env.src_cpu); 4301 update_h_load(env.src_cpu);
4287 ld_moved += move_tasks(&env); 4302
4303 /*
4304 * cur_ld_moved - load moved in current iteration
4305 * ld_moved - cumulative load moved across iterations
4306 */
4307 cur_ld_moved = move_tasks(&env);
4308 ld_moved += cur_ld_moved;
4288 double_rq_unlock(this_rq, busiest); 4309 double_rq_unlock(this_rq, busiest);
4289 local_irq_restore(flags); 4310 local_irq_restore(flags);
4290 4311
@@ -4296,14 +4317,52 @@ more_balance:
4296 /* 4317 /*
4297 * some other cpu did the load balance for us. 4318 * some other cpu did the load balance for us.
4298 */ 4319 */
4299 if (ld_moved && this_cpu != smp_processor_id()) 4320 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
4300 resched_cpu(this_cpu); 4321 resched_cpu(env.dst_cpu);
4322
4323 /*
4324 * Revisit (affine) tasks on src_cpu that couldn't be moved to
4325 * us and move them to an alternate dst_cpu in our sched_group
4326 * where they can run. The upper limit on how many times we
4327 * iterate on same src_cpu is dependent on number of cpus in our
4328 * sched_group.
4329 *
4330 * This changes load balance semantics a bit on who can move
4331 * load to a given_cpu. In addition to the given_cpu itself
4332 * (or an ilb_cpu acting on its behalf where given_cpu is
4333 * nohz-idle), we now have balance_cpu in a position to move
4334 * load to given_cpu. In rare situations, this may cause
4335 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
4336 * _independently_ and at _same_ time to move some load to
4337 * given_cpu) causing excess load to be moved to given_cpu.
4338 * This however should not happen so much in practice and
4339 * moreover subsequent load balance cycles should correct the
4340 * excess load moved.
4341 */
4342 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4343 lb_iterations++ < max_lb_iterations) {
4344
4345 this_rq = cpu_rq(env.new_dst_cpu);
4346 env.dst_rq = this_rq;
4347 env.dst_cpu = env.new_dst_cpu;
4348 env.flags &= ~LBF_SOME_PINNED;
4349 env.loop = 0;
4350 env.loop_break = sched_nr_migrate_break;
4351 /*
4352 * Go back to "more_balance" rather than "redo" since we
4353 * need to continue with same src_cpu.
4354 */
4355 goto more_balance;
4356 }
4301 4357
4302 /* All tasks on this runqueue were pinned by CPU affinity */ 4358 /* All tasks on this runqueue were pinned by CPU affinity */
4303 if (unlikely(env.flags & LBF_ALL_PINNED)) { 4359 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4304 cpumask_clear_cpu(cpu_of(busiest), cpus); 4360 cpumask_clear_cpu(cpu_of(busiest), cpus);
4305 if (!cpumask_empty(cpus)) 4361 if (!cpumask_empty(cpus)) {
4362 env.loop = 0;
4363 env.loop_break = sched_nr_migrate_break;
4306 goto redo; 4364 goto redo;
4365 }
4307 goto out_balanced; 4366 goto out_balanced;
4308 } 4367 }
4309 } 4368 }
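The LBF_SOME_PINNED logic above lets load_balance() retry the same busy src_cpu with an alternate destination CPU from its own group when affinity-pinned tasks blocked the first attempt, bounded by the group size. A simplified standalone model of that retry loop (toy_task, pull_to() and the fixed-size group are all invented for the sketch; no runqueue locking is modelled):

#include <stdio.h>

#define GRP_SIZE 3

/* Toy tasks on a busy source CPU, pinned to a subset of the destination group. */
struct toy_task {
	int load;
	int allowed[GRP_SIZE];		/* allowed[i]: task may run on group CPU i */
	int moved;
};

static struct toy_task tasks[] = {
	{ .load = 2, .allowed = { 1, 1, 1 } },
	{ .load = 3, .allowed = { 0, 1, 0 } },	/* pinned away from CPU 0 */
};

/* Pull up to 'imbalance' load to group CPU 'dst'.  If a task cannot run
 * on dst but can run elsewhere in the group, remember that CPU so the
 * caller can retry with a different destination (the LBF_SOME_PINNED idea). */
static int pull_to(int dst, int imbalance, int *new_dst)
{
	int moved = 0;

	for (size_t i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
		struct toy_task *p = &tasks[i];

		if (p->moved || moved >= imbalance)
			continue;
		if (!p->allowed[dst]) {
			for (int c = 0; c < GRP_SIZE; c++)
				if (p->allowed[c]) {
					*new_dst = c;	/* some pinned task fits here */
					break;
				}
			continue;
		}
		p->moved = 1;
		moved += p->load;
	}
	return moved;
}

int main(void)
{
	int dst = 0, imbalance = 5, iterations = 0, new_dst = -1;

	while (imbalance > 0 && iterations++ < GRP_SIZE) {
		imbalance -= pull_to(dst, imbalance, &new_dst);
		if (imbalance > 0 && new_dst >= 0 && new_dst != dst) {
			dst = new_dst;		/* retry with an alternate dst in the group */
			new_dst = -1;
		} else {
			break;
		}
	}
	printf("remaining imbalance: %d\n", imbalance);
	return 0;
}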
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..c35a1a7dd4d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
538/* 538/*
539 * Return the group to which this task belongs. 539 * Return the group to which this task belongs.
540 * 540 *
541 * We use task_subsys_state_check() and extend the RCU verification with 541 * We cannot use task_subsys_state() and friends because the cgroup
542 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each 542 * subsystem changes that value before the cgroup_subsys::attach() method
543 * task it moves into the cgroup. Therefore by holding either of those locks, 543 * is called, therefore we cannot pin it and might observe the wrong value.
544 * we pin the task to the current cgroup. 544 *
545 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
546 * core changes this before calling sched_move_task().
547 *
548 * Instead we use a 'copy' which is updated from sched_move_task() while
549 * holding both task_struct::pi_lock and rq::lock.
545 */ 550 */
546static inline struct task_group *task_group(struct task_struct *p) 551static inline struct task_group *task_group(struct task_struct *p)
547{ 552{
548 struct task_group *tg; 553 return p->sched_task_group;
549 struct cgroup_subsys_state *css;
550
551 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
552 lockdep_is_held(&p->pi_lock) ||
553 lockdep_is_held(&task_rq(p)->lock));
554 tg = container_of(css, struct task_group, css);
555
556 return autogroup_task_group(p, tg);
557} 554}
558 555
559/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 556/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
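The rewritten task_group() above just returns a cached pointer that sched_move_task() updates while holding both ->pi_lock and the runqueue lock, so any reader holding either lock sees a stable value. A minimal illustration of that cache-under-both-locks pattern, with pthread mutexes standing in for the two kernel locks (purely a sketch, not the scheduler code):

#include <pthread.h>
#include <stdio.h>

struct toy_group { const char *name; };

static struct toy_group groups[] = { { "root" }, { "autogroup-17" } };

struct toy_task {
	pthread_mutex_t pi_lock;	/* stand-in for the task's pi_lock */
	pthread_mutex_t rq_lock;	/* stand-in for the runqueue lock */
	struct toy_group *cached_group;	/* like p->sched_task_group */
};

/* Writer: update the cached group while holding both locks, so a reader
 * holding either one of them sees a stable value. */
static void move_task(struct toy_task *p, struct toy_group *tg)
{
	pthread_mutex_lock(&p->pi_lock);
	pthread_mutex_lock(&p->rq_lock);
	p->cached_group = tg;
	pthread_mutex_unlock(&p->rq_lock);
	pthread_mutex_unlock(&p->pi_lock);
}

/* Reader: a caller that already holds pi_lock or rq_lock may simply
 * dereference the cached pointer. */
static struct toy_group *task_group(struct toy_task *p)
{
	return p->cached_group;
}

int main(void)
{
	static struct toy_task t = {
		.pi_lock = PTHREAD_MUTEX_INITIALIZER,
		.rq_lock = PTHREAD_MUTEX_INITIALIZER,
		.cached_group = &groups[0],
	};

	move_task(&t, &groups[1]);
	printf("task group: %s\n", task_group(&t)->name);
	return 0;
}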
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 671f9594e368..b73e681df09e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void)
210 __u32 pending; 210 __u32 pending;
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 unsigned long old_flags = current->flags;
214
215 /*
216 * Mask out PF_MEMALLOC as the current task context is borrowed for the
217 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
218 * again if the socket is related to swap
219 */
220 current->flags &= ~PF_MEMALLOC;
213 221
214 pending = local_softirq_pending(); 222 pending = local_softirq_pending();
215 account_system_vtime(current); 223 account_system_vtime(current);
@@ -265,6 +273,7 @@ restart:
265 273
266 account_system_vtime(current); 274 account_system_vtime(current);
267 __local_bh_enable(SOFTIRQ_OFFSET); 275 __local_bh_enable(SOFTIRQ_OFFSET);
276 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
268} 277}
269 278
270#ifndef __ARCH_HAS_DO_SOFTIRQ 279#ifndef __ARCH_HAS_DO_SOFTIRQ
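The __do_softirq() hunk above snapshots the task flags, clears PF_MEMALLOC while softirq handlers borrow the task context, and afterwards restores only that one bit, leaving any other flag changes intact. A standalone sketch of that restore-one-bit-from-a-snapshot idea, which is what tsk_restore_flags() amounts to (the flag values and helper name here are invented):

#include <stdio.h>

#define TF_MEMALLOC	0x0800		/* made-up flag bit */
#define TF_KTHREAD	0x0020

/* Restore only the bits in 'mask' from 'saved', leaving every other
 * bit at its current value. */
static unsigned long restore_flags(unsigned long cur, unsigned long saved,
				   unsigned long mask)
{
	return (cur & ~mask) | (saved & mask);
}

int main(void)
{
	unsigned long flags = TF_MEMALLOC | TF_KTHREAD;
	unsigned long old_flags = flags;

	flags &= ~TF_MEMALLOC;			/* borrow the context: mask it out */
	flags |= 0x0001;			/* unrelated bit changes meanwhile */
	flags = restore_flags(flags, old_flags, TF_MEMALLOC);

	printf("memalloc restored: %d, other bit kept: %d\n",
	       !!(flags & TF_MEMALLOC), !!(flags & 0x0001));
	return 0;
}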
diff --git a/kernel/sys.c b/kernel/sys.c
index 2d39a84cd857..241507f23eca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2015,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2015 break; 2015 break;
2016 } 2016 }
2017 me->pdeath_signal = arg2; 2017 me->pdeath_signal = arg2;
2018 error = 0;
2019 break; 2018 break;
2020 case PR_GET_PDEATHSIG: 2019 case PR_GET_PDEATHSIG:
2021 error = put_user(me->pdeath_signal, (int __user *)arg2); 2020 error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2029,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2029 break; 2028 break;
2030 } 2029 }
2031 set_dumpable(me->mm, arg2); 2030 set_dumpable(me->mm, arg2);
2032 error = 0;
2033 break; 2031 break;
2034 2032
2035 case PR_SET_UNALIGN: 2033 case PR_SET_UNALIGN:
@@ -2056,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2056 case PR_SET_TIMING: 2054 case PR_SET_TIMING:
2057 if (arg2 != PR_TIMING_STATISTICAL) 2055 if (arg2 != PR_TIMING_STATISTICAL)
2058 error = -EINVAL; 2056 error = -EINVAL;
2059 else
2060 error = 0;
2061 break; 2057 break;
2062
2063 case PR_SET_NAME: 2058 case PR_SET_NAME:
2064 comm[sizeof(me->comm)-1] = 0; 2059 comm[sizeof(me->comm)-1] = 0;
2065 if (strncpy_from_user(comm, (char __user *)arg2, 2060 if (strncpy_from_user(comm, (char __user *)arg2,
@@ -2067,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2067 return -EFAULT; 2062 return -EFAULT;
2068 set_task_comm(me, comm); 2063 set_task_comm(me, comm);
2069 proc_comm_connector(me); 2064 proc_comm_connector(me);
2070 return 0; 2065 break;
2071 case PR_GET_NAME: 2066 case PR_GET_NAME:
2072 get_task_comm(comm, me); 2067 get_task_comm(comm, me);
2073 if (copy_to_user((char __user *)arg2, comm, 2068 if (copy_to_user((char __user *)arg2, comm,
2074 sizeof(comm))) 2069 sizeof(comm)))
2075 return -EFAULT; 2070 return -EFAULT;
2076 return 0; 2071 break;
2077 case PR_GET_ENDIAN: 2072 case PR_GET_ENDIAN:
2078 error = GET_ENDIAN(me, arg2); 2073 error = GET_ENDIAN(me, arg2);
2079 break; 2074 break;
2080 case PR_SET_ENDIAN: 2075 case PR_SET_ENDIAN:
2081 error = SET_ENDIAN(me, arg2); 2076 error = SET_ENDIAN(me, arg2);
2082 break; 2077 break;
2083
2084 case PR_GET_SECCOMP: 2078 case PR_GET_SECCOMP:
2085 error = prctl_get_seccomp(); 2079 error = prctl_get_seccomp();
2086 break; 2080 break;
@@ -2108,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2108 current->default_timer_slack_ns; 2102 current->default_timer_slack_ns;
2109 else 2103 else
2110 current->timer_slack_ns = arg2; 2104 current->timer_slack_ns = arg2;
2111 error = 0;
2112 break; 2105 break;
2113 case PR_MCE_KILL: 2106 case PR_MCE_KILL:
2114 if (arg4 | arg5) 2107 if (arg4 | arg5)
@@ -2134,7 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2134 default: 2127 default:
2135 return -EINVAL; 2128 return -EINVAL;
2136 } 2129 }
2137 error = 0;
2138 break; 2130 break;
2139 case PR_MCE_KILL_GET: 2131 case PR_MCE_KILL_GET:
2140 if (arg2 | arg3 | arg4 | arg5) 2132 if (arg2 | arg3 | arg4 | arg5)
@@ -2153,7 +2145,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2153 break; 2145 break;
2154 case PR_SET_CHILD_SUBREAPER: 2146 case PR_SET_CHILD_SUBREAPER:
2155 me->signal->is_child_subreaper = !!arg2; 2147 me->signal->is_child_subreaper = !!arg2;
2156 error = 0;
2157 break; 2148 break;
2158 case PR_GET_CHILD_SUBREAPER: 2149 case PR_GET_CHILD_SUBREAPER:
2159 error = put_user(me->signal->is_child_subreaper, 2150 error = put_user(me->signal->is_child_subreaper,
@@ -2195,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info)
2195 argv_free(info->argv); 2186 argv_free(info->argv);
2196} 2187}
2197 2188
2198/** 2189static int __orderly_poweroff(void)
2199 * orderly_poweroff - Trigger an orderly system poweroff
2200 * @force: force poweroff if command execution fails
2201 *
2202 * This may be called from any context to trigger a system shutdown.
2203 * If the orderly shutdown fails, it will force an immediate shutdown.
2204 */
2205int orderly_poweroff(bool force)
2206{ 2190{
2207 int argc; 2191 int argc;
2208 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2192 char **argv;
2209 static char *envp[] = { 2193 static char *envp[] = {
2210 "HOME=/", 2194 "HOME=/",
2211 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", 2195 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2212 NULL 2196 NULL
2213 }; 2197 };
2214 int ret = -ENOMEM; 2198 int ret;
2215 2199
2200 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2216 if (argv == NULL) { 2201 if (argv == NULL) {
2217 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2202 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2218 __func__, poweroff_cmd); 2203 __func__, poweroff_cmd);
2219 goto out; 2204 return -ENOMEM;
2220 } 2205 }
2221 2206
2222 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, 2207 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
2223 NULL, argv_cleanup, NULL); 2208 NULL, argv_cleanup, NULL);
2224out:
2225 if (likely(!ret))
2226 return 0;
2227
2228 if (ret == -ENOMEM) 2209 if (ret == -ENOMEM)
2229 argv_free(argv); 2210 argv_free(argv);
2230 2211
2231 if (force) { 2212 return ret;
2213}
2214
2215/**
2216 * orderly_poweroff - Trigger an orderly system poweroff
2217 * @force: force poweroff if command execution fails
2218 *
2219 * This may be called from any context to trigger a system shutdown.
2220 * If the orderly shutdown fails, it will force an immediate shutdown.
2221 */
2222int orderly_poweroff(bool force)
2223{
2224 int ret = __orderly_poweroff();
2225
2226 if (ret && force) {
2232 printk(KERN_WARNING "Failed to start orderly shutdown: " 2227 printk(KERN_WARNING "Failed to start orderly shutdown: "
2233 "forcing the issue\n"); 2228 "forcing the issue\n");
2234 2229
2235 /* I guess this should try to kick off some daemon to 2230 /*
2236 sync and poweroff asap. Or not even bother syncing 2231 * I guess this should try to kick off some daemon to sync and
2237 if we're doing an emergency shutdown? */ 2232 * poweroff asap. Or not even bother syncing if we're doing an
2233 * emergency shutdown?
2234 */
2238 emergency_sync(); 2235 emergency_sync();
2239 kernel_power_off(); 2236 kernel_power_off();
2240 } 2237 }
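orderly_poweroff() above becomes a thin wrapper: launching the userspace command moves into __orderly_poweroff(), and the force-fallback now keys off that helper's return value alone. A toy version of the same split, with the usermode helper replaced by a stub (all names invented):

#include <stdbool.h>
#include <stdio.h>

/* Stub for spawning the userspace poweroff command. */
static int run_poweroff_cmd(bool simulate_failure)
{
	return simulate_failure ? -1 : 0;
}

/* Mirror of the new structure: the helper only reports success or
 * failure, and the caller alone decides whether to force the issue. */
static void toy_orderly_poweroff(bool force, bool simulate_failure)
{
	int ret = run_poweroff_cmd(simulate_failure);

	if (ret && force) {
		printf("Failed to start orderly shutdown: forcing the issue\n");
		/* the real code would sync and power off at this point */
	}
}

int main(void)
{
	toy_orderly_poweroff(true, true);
	return 0;
}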
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab11879aeb4..6502d35a25ba 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,6 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
32#include <linux/kmemcheck.h> 32#include <linux/kmemcheck.h>
33#include <linux/kmemleak.h>
33#include <linux/fs.h> 34#include <linux/fs.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/kernel.h> 36#include <linux/kernel.h>
@@ -174,6 +175,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, loff_t *ppos); 175 void __user *buffer, size_t *lenp, loff_t *ppos);
175#endif 176#endif
176 177
178static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
179 void __user *buffer, size_t *lenp, loff_t *ppos);
180static int proc_dostring_coredump(struct ctl_table *table, int write,
181 void __user *buffer, size_t *lenp, loff_t *ppos);
182
177#ifdef CONFIG_MAGIC_SYSRQ 183#ifdef CONFIG_MAGIC_SYSRQ
178/* Note: sysrq code uses its own private copy */ 184/* Note: sysrq code uses its own private copy */
179static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 185static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -410,7 +416,7 @@ static struct ctl_table kern_table[] = {
410 .data = core_pattern, 416 .data = core_pattern,
411 .maxlen = CORENAME_MAX_SIZE, 417 .maxlen = CORENAME_MAX_SIZE,
412 .mode = 0644, 418 .mode = 0644,
413 .proc_handler = proc_dostring, 419 .proc_handler = proc_dostring_coredump,
414 }, 420 },
415 { 421 {
416 .procname = "core_pipe_limit", 422 .procname = "core_pipe_limit",
@@ -1095,11 +1101,9 @@ static struct ctl_table vm_table[] = {
1095 .extra1 = &zero, 1101 .extra1 = &zero,
1096 }, 1102 },
1097 { 1103 {
1098 .procname = "nr_pdflush_threads", 1104 .procname = "nr_pdflush_threads",
1099 .data = &nr_pdflush_threads, 1105 .mode = 0444 /* read-only */,
1100 .maxlen = sizeof nr_pdflush_threads, 1106 .proc_handler = pdflush_proc_obsolete,
1101 .mode = 0444 /* read-only*/,
1102 .proc_handler = proc_dointvec,
1103 }, 1107 },
1104 { 1108 {
1105 .procname = "swappiness", 1109 .procname = "swappiness",
@@ -1498,7 +1502,7 @@ static struct ctl_table fs_table[] = {
1498 .data = &suid_dumpable, 1502 .data = &suid_dumpable,
1499 .maxlen = sizeof(int), 1503 .maxlen = sizeof(int),
1500 .mode = 0644, 1504 .mode = 0644,
1501 .proc_handler = proc_dointvec_minmax, 1505 .proc_handler = proc_dointvec_minmax_coredump,
1502 .extra1 = &zero, 1506 .extra1 = &zero,
1503 .extra2 = &two, 1507 .extra2 = &two,
1504 }, 1508 },
@@ -1551,7 +1555,10 @@ static struct ctl_table dev_table[] = {
1551 1555
1552int __init sysctl_init(void) 1556int __init sysctl_init(void)
1553{ 1557{
1554 register_sysctl_table(sysctl_base_table); 1558 struct ctl_table_header *hdr;
1559
1560 hdr = register_sysctl_table(sysctl_base_table);
1561 kmemleak_not_leak(hdr);
1555 return 0; 1562 return 0;
1556} 1563}
1557 1564
@@ -2009,6 +2016,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2009 do_proc_dointvec_minmax_conv, &param); 2016 do_proc_dointvec_minmax_conv, &param);
2010} 2017}
2011 2018
2019static void validate_coredump_safety(void)
2020{
2021 if (suid_dumpable == SUID_DUMPABLE_SAFE &&
2022 core_pattern[0] != '/' && core_pattern[0] != '|') {
2023 printk(KERN_WARNING "Unsafe core_pattern used with "\
2024 "suid_dumpable=2. Pipe handler or fully qualified "\
2025 "core dump path required.\n");
2026 }
2027}
2028
2029static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
2030 void __user *buffer, size_t *lenp, loff_t *ppos)
2031{
2032 int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2033 if (!error)
2034 validate_coredump_safety();
2035 return error;
2036}
2037
2038static int proc_dostring_coredump(struct ctl_table *table, int write,
2039 void __user *buffer, size_t *lenp, loff_t *ppos)
2040{
2041 int error = proc_dostring(table, write, buffer, lenp, ppos);
2042 if (!error)
2043 validate_coredump_safety();
2044 return error;
2045}
2046
2012static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2047static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2013 void __user *buffer, 2048 void __user *buffer,
2014 size_t *lenp, loff_t *ppos, 2049 size_t *lenp, loff_t *ppos,
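The two coredump proc handlers above simply delegate to the generic handlers and, on a successful write, run validate_coredump_safety() so that an unsafe core_pattern/suid_dumpable combination is reported immediately. A tiny userspace sketch of that wrap-then-validate pattern (the setters and the hard-coded value 2 standing in for SUID_DUMPABLE_SAFE are assumptions of the sketch):

#include <stdio.h>

static char core_pattern[64] = "core";
static int suid_dumpable;

/* Warn when the combination of the two settings is unsafe. */
static void validate_coredump_safety(void)
{
	if (suid_dumpable == 2 &&
	    core_pattern[0] != '/' && core_pattern[0] != '|')
		printf("warning: unsafe core_pattern used with suid_dumpable=2\n");
}

/* Generic "handler" plus a wrapper that validates after a write. */
static int set_pattern(const char *val)
{
	snprintf(core_pattern, sizeof(core_pattern), "%s", val);
	return 0;
}

static int set_pattern_coredump(const char *val)
{
	int error = set_pattern(val);

	if (!error)
		validate_coredump_safety();
	return error;
}

int main(void)
{
	suid_dumpable = 2;
	set_pattern_coredump("core.%p");		/* relative path: warns */
	set_pattern_coredump("/var/crash/core.%p");	/* absolute path: silent */
	return 0;
}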
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index a650694883a1..65bdcf198d4e 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, 147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ 148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ 149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
150 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, 150 /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, 151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
152 /* VM_PAGEBUF unused */ 152 /* VM_PAGEBUF unused */
153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ 153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e66046456f4f..d0a32796550f 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
436 436
437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
438 sizeof(struct cgroupstats)); 438 sizeof(struct cgroupstats));
439 if (na == NULL) {
440 rc = -EMSGSIZE;
441 goto err;
442 }
443
439 stats = nla_data(na); 444 stats = nla_data(na);
440 memset(stats, 0, sizeof(*stats)); 445 memset(stats, 0, sizeof(*stats));
441 446
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a120f98c4112..5c38c81496ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3187,10 +3187,10 @@ static int tracing_set_tracer(const char *buf)
3187 } 3187 }
3188 destroy_trace_option_files(topts); 3188 destroy_trace_option_files(topts);
3189 3189
3190 current_trace = t; 3190 current_trace = &nop_trace;
3191 3191
3192 topts = create_trace_option_files(current_trace); 3192 topts = create_trace_option_files(t);
3193 if (current_trace->use_max_tr) { 3193 if (t->use_max_tr) {
3194 int cpu; 3194 int cpu;
3195 /* we need to make per cpu buffer sizes equivalent */ 3195 /* we need to make per cpu buffer sizes equivalent */
3196 for_each_tracing_cpu(cpu) { 3196 for_each_tracing_cpu(cpu) {
@@ -3210,6 +3210,7 @@ static int tracing_set_tracer(const char *buf)
3210 goto out; 3210 goto out;
3211 } 3211 }
3212 3212
3213 current_trace = t;
3213 trace_branch_enable(tr); 3214 trace_branch_enable(tr);
3214 out: 3215 out:
3215 mutex_unlock(&trace_types_lock); 3216 mutex_unlock(&trace_types_lock);
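The tracing_set_tracer() reorder above parks current_trace on the nop tracer while buffers and option files are rebuilt, and only publishes the new tracer once that setup has succeeded, so readers never see a half-initialized tracer. A small sketch of that publish-last ordering with invented types:

#include <stdio.h>

struct toy_tracer {
	const char *name;
	int (*init)(void);		/* may allocate per-cpu buffers, etc. */
};

static int blk_init(void)
{
	printf("setting up blk tracer state\n");
	return 0;
}

static struct toy_tracer nop_tracer = { "nop", NULL };
static struct toy_tracer blk_tracer = { "blk", blk_init };

/* What readers of the trace files currently see. */
static struct toy_tracer *current_tracer = &nop_tracer;

static int set_tracer(struct toy_tracer *t)
{
	/* Park readers on the safe nop tracer while state is rebuilt. */
	current_tracer = &nop_tracer;

	if (t->init && t->init())
		return -1;		/* on failure, readers still see nop */

	current_tracer = t;		/* publish only after successful init */
	return 0;
}

int main(void)
{
	set_tracer(&blk_tracer);
	printf("current tracer: %s\n", current_tracer->name);
	return 0;
}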
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c7b0c6a7db09..a426f410c060 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,6 +13,7 @@
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/pstore.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17 18
18#include "trace.h" 19#include "trace.h"
@@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
74 preempt_enable_notrace(); 75 preempt_enable_notrace();
75} 76}
76 77
78/* Our two options */
79enum {
80 TRACE_FUNC_OPT_STACK = 0x1,
81 TRACE_FUNC_OPT_PSTORE = 0x2,
82};
83
84static struct tracer_flags func_flags;
85
77static void 86static void
78function_trace_call(unsigned long ip, unsigned long parent_ip) 87function_trace_call(unsigned long ip, unsigned long parent_ip)
79{ 88{
@@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
97 disabled = atomic_inc_return(&data->disabled); 106 disabled = atomic_inc_return(&data->disabled);
98 107
99 if (likely(disabled == 1)) { 108 if (likely(disabled == 1)) {
109 /*
110 * So far tracing doesn't support multiple buffers, so
111 * we make an explicit call for now.
112 */
113 if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
114 pstore_ftrace_call(ip, parent_ip);
100 pc = preempt_count(); 115 pc = preempt_count();
101 trace_function(tr, ip, parent_ip, flags, pc); 116 trace_function(tr, ip, parent_ip, flags, pc);
102 } 117 }
@@ -158,15 +173,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly =
158 .flags = FTRACE_OPS_FL_GLOBAL, 173 .flags = FTRACE_OPS_FL_GLOBAL,
159}; 174};
160 175
161/* Our two options */
162enum {
163 TRACE_FUNC_OPT_STACK = 0x1,
164};
165
166static struct tracer_opt func_opts[] = { 176static struct tracer_opt func_opts[] = {
167#ifdef CONFIG_STACKTRACE 177#ifdef CONFIG_STACKTRACE
168 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 178 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
169#endif 179#endif
180#ifdef CONFIG_PSTORE_FTRACE
181 { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
182#endif
170 { } /* Always set a last empty entry */ 183 { } /* Always set a last empty entry */
171}; 184};
172 185
@@ -204,10 +217,11 @@ static void tracing_stop_function_trace(void)
204 217
205static int func_set_flag(u32 old_flags, u32 bit, int set) 218static int func_set_flag(u32 old_flags, u32 bit, int set)
206{ 219{
207 if (bit == TRACE_FUNC_OPT_STACK) { 220 switch (bit) {
221 case TRACE_FUNC_OPT_STACK:
208 /* do nothing if already set */ 222 /* do nothing if already set */
209 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) 223 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
210 return 0; 224 break;
211 225
212 if (set) { 226 if (set) {
213 unregister_ftrace_function(&trace_ops); 227 unregister_ftrace_function(&trace_ops);
@@ -217,10 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
217 register_ftrace_function(&trace_ops); 231 register_ftrace_function(&trace_ops);
218 } 232 }
219 233
220 return 0; 234 break;
235 case TRACE_FUNC_OPT_PSTORE:
236 break;
237 default:
238 return -EINVAL;
221 } 239 }
222 240
223 return -EINVAL; 241 return 0;
224} 242}
225 243
226static struct tracer function_trace __read_mostly = 244static struct tracer function_trace __read_mostly =
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4b1dfba70f7c..69add8a9da68 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -575,7 +575,7 @@ out:
575/* 575/*
576 * Create/destroy watchdog threads as CPUs come and go: 576 * Create/destroy watchdog threads as CPUs come and go:
577 */ 577 */
578static int __cpuinit 578static int
579cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 579cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
580{ 580{
581 int hotcpu = (unsigned long)hcpu; 581 int hotcpu = (unsigned long)hcpu;
@@ -610,10 +610,27 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
610 return NOTIFY_OK; 610 return NOTIFY_OK;
611} 611}
612 612
613static struct notifier_block __cpuinitdata cpu_nfb = { 613static struct notifier_block cpu_nfb = {
614 .notifier_call = cpu_callback 614 .notifier_call = cpu_callback
615}; 615};
616 616
617#ifdef CONFIG_SUSPEND
618/*
619 * On exit from suspend we force an offline->online transition on the boot CPU
620 * so that the PMU state that was lost while in suspended state gets set up
621 * properly for the boot CPU. This information is required for restarting the
622 * NMI watchdog.
623 */
624void lockup_detector_bootcpu_resume(void)
625{
626 void *cpu = (void *)(long)smp_processor_id();
627
628 cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu);
629 cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu);
630 cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu);
631}
632#endif
633
617void __init lockup_detector_init(void) 634void __init lockup_detector_init(void)
618{ 635{
619 void *cpu = (void *)(long)smp_processor_id(); 636 void *cpu = (void *)(long)smp_processor_id();