-rw-r--r--  Documentation/trace/ftrace-design.txt | 5
-rw-r--r--  arch/arm/include/asm/xen/page.h | 15
-rw-r--r--  arch/arm/kernel/ftrace.c | 4
-rw-r--r--  arch/arm/xen/p2m.c | 32
-rw-r--r--  arch/blackfin/kernel/ftrace.c | 5
-rw-r--r--  arch/ia64/kernel/ftrace.c | 4
-rw-r--r--  arch/metag/kernel/ftrace.c | 5
-rw-r--r--  arch/microblaze/kernel/ftrace.c | 5
-rw-r--r--  arch/mips/kernel/ftrace.c | 5
-rw-r--r--  arch/powerpc/kernel/ftrace.c | 7
-rw-r--r--  arch/s390/kernel/ftrace.c | 3
-rw-r--r--  arch/sh/kernel/ftrace.c | 5
-rw-r--r--  arch/sparc/kernel/ftrace.c | 6
-rw-r--r--  arch/sparc/kernel/leon_pci_grpci2.c | 1
-rw-r--r--  arch/sparc/kernel/sun4m_irq.c | 2
-rw-r--r--  arch/tile/kernel/ftrace.c | 4
-rw-r--r--  arch/x86/include/asm/xen/page.h | 11
-rw-r--r--  arch/x86/kernel/ftrace.c | 55
-rw-r--r--  arch/x86/pci/xen.c | 29
-rw-r--r--  arch/x86/xen/Kconfig | 5
-rw-r--r--  arch/x86/xen/p2m.c | 121
-rw-r--r--  block/blk-cgroup.c | 11
-rw-r--r--  block/blk-cgroup.h | 14
-rw-r--r--  block/blk-throttle.c | 8
-rw-r--r--  block/cfq-iosched.c | 7
-rw-r--r--  drivers/vfio/Kconfig | 1
-rw-r--r--  drivers/vfio/vfio.c | 6
-rw-r--r--  drivers/vfio/vfio_iommu_type1.c | 656
-rw-r--r--  drivers/xen/events/events_base.c | 83
-rw-r--r--  drivers/xen/events/events_internal.h | 1
-rw-r--r--  drivers/xen/grant-table.c | 73
-rw-r--r--  drivers/xen/manage.c | 16
-rw-r--r--  drivers/xen/pcpu.c | 1
-rw-r--r--  drivers/xen/platform-pci.c | 2
-rw-r--r--  drivers/xen/xen-acpi-processor.c | 15
-rw-r--r--  drivers/xen/xen-pciback/pciback_ops.c | 3
-rw-r--r--  drivers/xen/xen-selfballoon.c | 1
-rw-r--r--  drivers/xen/xenbus/xenbus_client.c | 27
-rw-r--r--  fs/bio.c | 2
-rw-r--r--  fs/kernfs/dir.c | 1
-rw-r--r--  include/linux/cgroup.h | 275
-rw-r--r--  include/linux/cgroup_subsys.h | 30
-rw-r--r--  include/linux/ftrace.h | 27
-rw-r--r--  include/linux/ftrace_event.h | 28
-rw-r--r--  include/linux/hugetlb_cgroup.h | 2
-rw-r--r--  include/linux/memcontrol.h | 2
-rw-r--r--  include/linux/module.h | 2
-rw-r--r--  include/linux/tracepoint.h | 18
-rw-r--r--  include/linux/vfio.h | 2
-rw-r--r--  include/net/cls_cgroup.h | 2
-rw-r--r--  include/net/netprio_cgroup.h | 17
-rw-r--r--  include/trace/events/migrate.h | 2
-rw-r--r--  include/trace/events/writeback.h | 1
-rw-r--r--  include/trace/ftrace.h | 38
-rw-r--r--  include/uapi/linux/vfio.h | 6
-rw-r--r--  include/xen/events.h | 6
-rw-r--r--  include/xen/interface/physdev.h | 10
-rw-r--r--  include/xen/xen-ops.h | 4
-rw-r--r--  include/xen/xenbus.h | 1
-rw-r--r--  init/Kconfig | 1
-rw-r--r--  kernel/cgroup.c | 3711
-rw-r--r--  kernel/cgroup_freezer.c | 40
-rw-r--r--  kernel/cpuset.c | 262
-rw-r--r--  kernel/events/core.c | 25
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/fork.c | 5
-rw-r--r--  kernel/sched/core.c | 10
-rw-r--r--  kernel/sched/cpuacct.c | 6
-rw-r--r--  kernel/sched/debug.c | 3
-rw-r--r--  kernel/trace/blktrace.c | 3
-rw-r--r--  kernel/trace/ftrace.c | 162
-rw-r--r--  kernel/trace/trace.c | 187
-rw-r--r--  kernel/trace/trace.h | 38
-rw-r--r--  kernel/trace/trace_events.c | 30
-rw-r--r--  kernel/trace/trace_functions.c | 143
-rw-r--r--  kernel/trace/trace_functions_graph.c | 3
-rw-r--r--  kernel/trace/trace_irqsoff.c | 10
-rw-r--r--  kernel/trace/trace_kprobe.c | 17
-rw-r--r--  kernel/trace/trace_nop.c | 5
-rw-r--r--  kernel/trace/trace_output.c | 31
-rw-r--r--  kernel/trace/trace_probe.h | 17
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 10
-rw-r--r--  kernel/trace/trace_stack.c | 3
-rw-r--r--  kernel/trace/trace_uprobe.c | 191
-rw-r--r--  kernel/tracepoint.c | 251
-rw-r--r--  mm/hugetlb_cgroup.c | 11
-rw-r--r--  mm/memcontrol.c | 110
-rw-r--r--  mm/memory-failure.c | 8
-rw-r--r--  net/Kconfig | 2
-rw-r--r--  net/core/netclassid_cgroup.c | 15
-rw-r--r--  net/core/netprio_cgroup.c | 41
-rw-r--r--  net/ipv4/tcp_memcontrol.c | 4
-rw-r--r--  security/device_cgroup.c | 12
-rw-r--r--  virt/kvm/vfio.c | 27
94 files changed, 3302 insertions, 3824 deletions
diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
index 79fcafc7fd64..3f669b9e8852 100644
--- a/Documentation/trace/ftrace-design.txt
+++ b/Documentation/trace/ftrace-design.txt
@@ -358,11 +358,8 @@ Every arch has an init callback function. If you need to do something early on
 to initialize some state, this is the time to do that. Otherwise, this simple
 function below should be sufficient for most people:
 
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	/* return value is done indirectly via data */
-	*(unsigned long *)data = 0;
-
 	return 0;
 }
 
diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h
index e0965abacb7d..cf4f3e867395 100644
--- a/arch/arm/include/asm/xen/page.h
+++ b/arch/arm/include/asm/xen/page.h
@@ -97,16 +97,13 @@ static inline pte_t *lookup_address(unsigned long address, unsigned int *level)
 	return NULL;
 }
 
-static inline int m2p_add_override(unsigned long mfn, struct page *page,
-				   struct gnttab_map_grant_ref *kmap_op)
-{
-	return 0;
-}
+extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+				   struct gnttab_map_grant_ref *kmap_ops,
+				   struct page **pages, unsigned int count);
 
-static inline int m2p_remove_override(struct page *page, bool clear_pte)
-{
-	return 0;
-}
+extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+				     struct gnttab_map_grant_ref *kmap_ops,
+				     struct page **pages, unsigned int count);
 
 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 bool __set_phys_to_machine_multi(unsigned long pfn, unsigned long mfn,
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index 34e56647dcee..c108ddcb9ba4 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -156,10 +156,8 @@ int ftrace_make_nop(struct module *mod,
 	return ret;
 }
 
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	*(unsigned long *)data = 0;
-
 	return 0;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c
index b31ee1b275b0..97baf4427817 100644
--- a/arch/arm/xen/p2m.c
+++ b/arch/arm/xen/p2m.c
@@ -146,6 +146,38 @@ unsigned long __mfn_to_pfn(unsigned long mfn)
 }
 EXPORT_SYMBOL_GPL(__mfn_to_pfn);
 
+int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+			    struct gnttab_map_grant_ref *kmap_ops,
+			    struct page **pages, unsigned int count)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		if (map_ops[i].status)
+			continue;
+		set_phys_to_machine(map_ops[i].host_addr >> PAGE_SHIFT,
+				    map_ops[i].dev_bus_addr >> PAGE_SHIFT);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
+
+int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+			      struct gnttab_map_grant_ref *kmap_ops,
+			      struct page **pages, unsigned int count)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		set_phys_to_machine(unmap_ops[i].host_addr >> PAGE_SHIFT,
+				    INVALID_P2M_ENTRY);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
+
 bool __set_phys_to_machine_multi(unsigned long pfn,
 			unsigned long mfn, unsigned long nr_pages)
 {
diff --git a/arch/blackfin/kernel/ftrace.c b/arch/blackfin/kernel/ftrace.c
index 9277905b82cf..095de0fa044d 100644
--- a/arch/blackfin/kernel/ftrace.c
+++ b/arch/blackfin/kernel/ftrace.c
@@ -65,11 +65,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	return ftrace_modify_code(ip, call, sizeof(call));
 }
 
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	/* return value is done indirectly via data */
-	*(unsigned long *)data = 0;
-
 	return 0;
 }
 
diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c
index 7fc8c961b1f7..3b0c2aa07857 100644
--- a/arch/ia64/kernel/ftrace.c
+++ b/arch/ia64/kernel/ftrace.c
@@ -198,9 +198,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 }
 
 /* run from kstop_machine */
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	*(unsigned long *)data = 0;
-
 	return 0;
 }
diff --git a/arch/metag/kernel/ftrace.c b/arch/metag/kernel/ftrace.c
index a774f321643f..ed1d685157c2 100644
--- a/arch/metag/kernel/ftrace.c
+++ b/arch/metag/kernel/ftrace.c
@@ -117,10 +117,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 }
 
 /* run from kstop_machine */
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	/* The return code is returned via data */
-	writel(0, data);
-
 	return 0;
 }
diff --git a/arch/microblaze/kernel/ftrace.c b/arch/microblaze/kernel/ftrace.c
index e8a5e9cf4ed1..bbcd2533766c 100644
--- a/arch/microblaze/kernel/ftrace.c
+++ b/arch/microblaze/kernel/ftrace.c
@@ -171,11 +171,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 	return ret;
 }
 
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	/* The return code is retured via data */
-	*(unsigned long *)data = 0;
-
 	return 0;
 }
 
diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c
index 74fe73506d8f..60e7e5e45af1 100644
--- a/arch/mips/kernel/ftrace.c
+++ b/arch/mips/kernel/ftrace.c
@@ -201,7 +201,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	return ftrace_modify_code(FTRACE_CALL_IP, new);
 }
 
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
 	/* Encode the instructions when booting */
 	ftrace_dyn_arch_init_insns();
@@ -209,9 +209,6 @@ int __init ftrace_dyn_arch_init(void *data)
 	/* Remove "b ftrace_stub" to ensure ftrace_caller() is executed */
 	ftrace_modify_code(MCOUNT_ADDR, INSN_NOP);
 
-	/* The return code is retured via data */
-	*(unsigned long *)data = 0;
-
 	return 0;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index b0ded97ee4e1..6a014c763cc7 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -532,13 +532,8 @@ void arch_ftrace_update_code(int command)
 		ftrace_disable_ftrace_graph_caller();
 }
 
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	/* caller expects data to be zero */
-	unsigned long *p = data;
-
-	*p = 0;
-
 	return 0;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 224db03e9518..54d6493c4a56 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -130,9 +130,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	return 0;
 }
 
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	*(unsigned long *) data = 0;
 	return 0;
 }
 
diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c
index 30e13196d35b..3c74f53db6db 100644
--- a/arch/sh/kernel/ftrace.c
+++ b/arch/sh/kernel/ftrace.c
@@ -272,11 +272,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 	return ftrace_modify_code(rec->ip, old, new);
 }
 
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	/* The return code is retured via data */
-	__raw_writel(0, (unsigned long)data);
-
 	return 0;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c
index 03ab022e51c5..0a2d2ddff543 100644
--- a/arch/sparc/kernel/ftrace.c
+++ b/arch/sparc/kernel/ftrace.c
@@ -82,12 +82,8 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	return ftrace_modify_code(ip, old, new);
 }
 
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	unsigned long *p = data;
-
-	*p = 0;
-
 	return 0;
 }
 #endif
diff --git a/arch/sparc/kernel/leon_pci_grpci2.c b/arch/sparc/kernel/leon_pci_grpci2.c
index 5f0402aab7fb..24d6a4446349 100644
--- a/arch/sparc/kernel/leon_pci_grpci2.c
+++ b/arch/sparc/kernel/leon_pci_grpci2.c
@@ -8,6 +8,7 @@
 #include <linux/of_device.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
+#include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <asm/io.h>
diff --git a/arch/sparc/kernel/sun4m_irq.c b/arch/sparc/kernel/sun4m_irq.c
index c5ade9d27a1d..8bb3b3fddea7 100644
--- a/arch/sparc/kernel/sun4m_irq.c
+++ b/arch/sparc/kernel/sun4m_irq.c
@@ -9,6 +9,8 @@
  * Copyright (C) 1996 Dave Redman (djhr@tadpole.co.uk)
  */
 
+#include <linux/slab.h>
+
 #include <asm/timer.h>
 #include <asm/traps.h>
 #include <asm/pgalloc.h>
diff --git a/arch/tile/kernel/ftrace.c b/arch/tile/kernel/ftrace.c
index f1c452092eeb..8d52d83cc516 100644
--- a/arch/tile/kernel/ftrace.c
+++ b/arch/tile/kernel/ftrace.c
@@ -167,10 +167,8 @@ int ftrace_make_nop(struct module *mod,
 	return ret;
 }
 
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	*(unsigned long *)data = 0;
-
 	return 0;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 3e276eb23d1b..c949923a5668 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -49,10 +49,17 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 extern unsigned long set_phys_range_identity(unsigned long pfn_s,
 					     unsigned long pfn_e);
 
+extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+				   struct gnttab_map_grant_ref *kmap_ops,
+				   struct page **pages, unsigned int count);
 extern int m2p_add_override(unsigned long mfn, struct page *page,
 			    struct gnttab_map_grant_ref *kmap_op);
+extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+				     struct gnttab_map_grant_ref *kmap_ops,
+				     struct page **pages, unsigned int count);
 extern int m2p_remove_override(struct page *page,
-			       struct gnttab_map_grant_ref *kmap_op);
+			       struct gnttab_map_grant_ref *kmap_op,
+			       unsigned long mfn);
 extern struct page *m2p_find_override(unsigned long mfn);
 extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
 
@@ -121,7 +128,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 		pfn = m2p_find_override_pfn(mfn, ~0);
 	}
 
-	/* 
+	/*
 	 * pfn is ~0 if there are no entries in the m2p for mfn or if the
 	 * entry doesn't map back to the mfn and m2p_override doesn't have a
 	 * valid entry for it.
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index e6253195a301..52819e816f87 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -308,7 +308,10 @@ static int ftrace_write(unsigned long ip, const char *val, int size)
 	if (within(ip, (unsigned long)_text, (unsigned long)_etext))
 		ip = (unsigned long)__va(__pa_symbol(ip));
 
-	return probe_kernel_write((void *)ip, val, size);
+	if (probe_kernel_write((void *)ip, val, size))
+		return -EPERM;
+
+	return 0;
 }
 
 static int add_break(unsigned long ip, const char *old)
@@ -323,10 +326,7 @@ static int add_break(unsigned long ip, const char *old)
 	if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
 		return -EINVAL;
 
-	if (ftrace_write(ip, &brk, 1))
-		return -EPERM;
-
-	return 0;
+	return ftrace_write(ip, &brk, 1);
 }
 
 static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -425,7 +425,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
 
 	/* If this does not have a breakpoint, we are done */
 	if (ins[0] != brk)
-		return -1;
+		return 0;
 
 	nop = ftrace_nop_replace();
 
@@ -455,7 +455,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
 	}
 
  update:
-	return probe_kernel_write((void *)ip, &nop[0], 1);
+	return ftrace_write(ip, nop, 1);
 }
 
 static int add_update_code(unsigned long ip, unsigned const char *new)
@@ -463,9 +463,7 @@ static int add_update_code(unsigned long ip, unsigned const char *new)
 	/* skip breakpoint */
 	ip++;
 	new++;
-	if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
-		return -EPERM;
-	return 0;
+	return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);
 }
 
 static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -520,10 +518,7 @@ static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
 
 	new = ftrace_call_replace(ip, addr);
 
-	if (ftrace_write(ip, new, 1))
-		return -EPERM;
-
-	return 0;
+	return ftrace_write(ip, new, 1);
 }
 
 static int finish_update_nop(struct dyn_ftrace *rec)
@@ -533,9 +528,7 @@ static int finish_update_nop(struct dyn_ftrace *rec)
 
 	new = ftrace_nop_replace();
 
-	if (ftrace_write(ip, new, 1))
-		return -EPERM;
-	return 0;
+	return ftrace_write(ip, new, 1);
 }
 
 static int finish_update(struct dyn_ftrace *rec, int enable)
@@ -632,8 +625,14 @@ void ftrace_replace_code(int enable)
 	printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
 	for_ftrace_rec_iter(iter) {
 		rec = ftrace_rec_iter_record(iter);
-		remove_breakpoint(rec);
+		/*
+		 * Breakpoints are handled only when this function is in
+		 * progress. The system could not work with them.
+		 */
+		if (remove_breakpoint(rec))
+			BUG();
 	}
+	run_sync();
 }
 
 static int
@@ -655,16 +654,19 @@ ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
 	run_sync();
 
 	ret = ftrace_write(ip, new_code, 1);
-	if (ret) {
-		ret = -EPERM;
-		goto out;
-	}
-	run_sync();
+	/*
+	 * The breakpoint is handled only when this function is in progress.
+	 * The system could not work if we could not remove it.
+	 */
+	BUG_ON(ret);
  out:
+	run_sync();
 	return ret;
 
  fail_update:
-	probe_kernel_write((void *)ip, &old_code[0], 1);
+	/* Also here the system could not work with the breakpoint */
+	if (ftrace_write(ip, old_code, 1))
+		BUG();
 	goto out;
 }
 
@@ -678,11 +680,8 @@ void arch_ftrace_update_code(int command)
 	atomic_dec(&modifying_ftrace_code);
 }
 
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
 {
-	/* The return code is retured via data */
-	*(unsigned long *)data = 0;
-
 	return 0;
 }
 #endif
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 103e702ec5a7..905956f16465 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -178,6 +178,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	i = 0;
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
 		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
+					       (type == PCI_CAP_ID_MSI) ? nvec : 1,
 					       (type == PCI_CAP_ID_MSIX) ?
 					       "pcifront-msi-x" :
 					       "pcifront-msi",
@@ -245,6 +246,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
245 "xen: msi already bound to pirq=%d\n", pirq); 246 "xen: msi already bound to pirq=%d\n", pirq);
246 } 247 }
247 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 248 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq,
249 (type == PCI_CAP_ID_MSI) ? nvec : 1,
248 (type == PCI_CAP_ID_MSIX) ? 250 (type == PCI_CAP_ID_MSIX) ?
249 "msi-x" : "msi", 251 "msi-x" : "msi",
250 DOMID_SELF); 252 DOMID_SELF);
@@ -269,9 +271,6 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	int ret = 0;
 	struct msi_desc *msidesc;
 
-	if (type == PCI_CAP_ID_MSI && nvec > 1)
-		return 1;
-
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
 		struct physdev_map_pirq map_irq;
 		domid_t domid;
@@ -291,7 +290,10 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 				(pci_domain_nr(dev->bus) << 16);
 		map_irq.devfn = dev->devfn;
 
-		if (type == PCI_CAP_ID_MSIX) {
+		if (type == PCI_CAP_ID_MSI && nvec > 1) {
+			map_irq.type = MAP_PIRQ_TYPE_MULTI_MSI;
+			map_irq.entry_nr = nvec;
+		} else if (type == PCI_CAP_ID_MSIX) {
 			int pos;
 			u32 table_offset, bir;
 
@@ -308,6 +310,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		if (pci_seg_supported)
 			ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq,
 						    &map_irq);
+		if (type == PCI_CAP_ID_MSI && nvec > 1 && ret) {
+			/*
+			 * If MAP_PIRQ_TYPE_MULTI_MSI is not available
+			 * there's nothing else we can do in this case.
+			 * Just set ret > 0 so driver can retry with
+			 * single MSI.
+			 */
+			ret = 1;
+			goto out;
+		}
 		if (ret == -EINVAL && !pci_domain_nr(dev->bus)) {
 			map_irq.type = MAP_PIRQ_TYPE_MSI;
 			map_irq.index = -1;
@@ -324,11 +336,10 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			goto out;
 		}
 
-		ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
-					       map_irq.pirq,
-					       (type == PCI_CAP_ID_MSIX) ?
-					       "msi-x" : "msi",
-					       domid);
+		ret = xen_bind_pirq_msi_to_irq(dev, msidesc, map_irq.pirq,
+					       (type == PCI_CAP_ID_MSI) ? nvec : 1,
+					       (type == PCI_CAP_ID_MSIX) ? "msi-x" : "msi",
+					       domid);
 		if (ret < 0)
 			goto out;
 	}
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 9c50cc2e403b..e88fda867a33 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -19,11 +19,6 @@ config XEN_DOM0
 	depends on XEN && PCI_XEN && SWIOTLB_XEN
 	depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
 
-# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
-# name in tools.
-config XEN_PRIVILEGED_GUEST
-	def_bool XEN_DOM0
-
 config XEN_PVHVM
 	def_bool y
 	depends on XEN && PCI && X86_LOCAL_APIC
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 696c694986d0..85e5d78c9874 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -881,6 +881,65 @@ static unsigned long mfn_hash(unsigned long mfn)
 	return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
 }
 
+int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+			    struct gnttab_map_grant_ref *kmap_ops,
+			    struct page **pages, unsigned int count)
+{
+	int i, ret = 0;
+	bool lazy = false;
+	pte_t *pte;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return 0;
+
+	if (kmap_ops &&
+	    !in_interrupt() &&
+	    paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
+		arch_enter_lazy_mmu_mode();
+		lazy = true;
+	}
+
+	for (i = 0; i < count; i++) {
+		unsigned long mfn, pfn;
+
+		/* Do not add to override if the map failed. */
+		if (map_ops[i].status)
+			continue;
+
+		if (map_ops[i].flags & GNTMAP_contains_pte) {
+			pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+				(map_ops[i].host_addr & ~PAGE_MASK));
+			mfn = pte_mfn(*pte);
+		} else {
+			mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
+		}
+		pfn = page_to_pfn(pages[i]);
+
+		WARN_ON(PagePrivate(pages[i]));
+		SetPagePrivate(pages[i]);
+		set_page_private(pages[i], mfn);
+		pages[i]->index = pfn_to_mfn(pfn);
+
+		if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		if (kmap_ops) {
+			ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
+			if (ret)
+				goto out;
+		}
+	}
+
+out:
+	if (lazy)
+		arch_leave_lazy_mmu_mode();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
+
 /* Add an MFN override for a particular page */
 int m2p_add_override(unsigned long mfn, struct page *page,
 			struct gnttab_map_grant_ref *kmap_op)
@@ -899,13 +958,6 @@ int m2p_add_override(unsigned long mfn, struct page *page,
899 "m2p_add_override: pfn %lx not mapped", pfn)) 958 "m2p_add_override: pfn %lx not mapped", pfn))
900 return -EINVAL; 959 return -EINVAL;
901 } 960 }
902 WARN_ON(PagePrivate(page));
903 SetPagePrivate(page);
904 set_page_private(page, mfn);
905 page->index = pfn_to_mfn(pfn);
906
907 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
908 return -ENOMEM;
909 961
910 if (kmap_op != NULL) { 962 if (kmap_op != NULL) {
911 if (!PageHighMem(page)) { 963 if (!PageHighMem(page)) {
@@ -943,20 +995,62 @@ int m2p_add_override(unsigned long mfn, struct page *page,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(m2p_add_override);
+
+int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+			      struct gnttab_map_grant_ref *kmap_ops,
+			      struct page **pages, unsigned int count)
+{
+	int i, ret = 0;
+	bool lazy = false;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return 0;
+
+	if (kmap_ops &&
+	    !in_interrupt() &&
+	    paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
+		arch_enter_lazy_mmu_mode();
+		lazy = true;
+	}
+
+	for (i = 0; i < count; i++) {
+		unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i]));
+		unsigned long pfn = page_to_pfn(pages[i]);
+
+		if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		set_page_private(pages[i], INVALID_P2M_ENTRY);
+		WARN_ON(!PagePrivate(pages[i]));
+		ClearPagePrivate(pages[i]);
+		set_phys_to_machine(pfn, pages[i]->index);
+
+		if (kmap_ops)
+			ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
+		if (ret)
+			goto out;
+	}
+
+out:
+	if (lazy)
+		arch_leave_lazy_mmu_mode();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
+
 int m2p_remove_override(struct page *page,
-			struct gnttab_map_grant_ref *kmap_op)
+			struct gnttab_map_grant_ref *kmap_op,
+			unsigned long mfn)
 {
 	unsigned long flags;
-	unsigned long mfn;
 	unsigned long pfn;
 	unsigned long uninitialized_var(address);
 	unsigned level;
 	pte_t *ptep = NULL;
 
 	pfn = page_to_pfn(page);
-	mfn = get_phys_to_machine(pfn);
-	if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
-		return -EINVAL;
 
 	if (!PageHighMem(page)) {
 		address = (unsigned long)__va(pfn << PAGE_SHIFT);
@@ -970,10 +1064,7 @@ int m2p_remove_override(struct page *page,
 	spin_lock_irqsave(&m2p_override_lock, flags);
 	list_del(&page->lru);
 	spin_unlock_irqrestore(&m2p_override_lock, flags);
-	WARN_ON(!PagePrivate(page));
-	ClearPagePrivate(page);
 
-	set_phys_to_machine(pfn, page->index);
 	if (kmap_op != NULL) {
 		if (!PageHighMem(page)) {
 			struct multicall_space mcs;
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b6e95b5e262f..e4a4145926f6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -894,7 +894,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
 	int ret = 0;
 
 	/* task_lock() is needed to avoid races with exit_io_context() */
-	cgroup_taskset_for_each(task, css, tset) {
+	cgroup_taskset_for_each(task, tset) {
 		task_lock(task);
 		ioc = task->io_context;
 		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
@@ -906,17 +906,14 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
 	return ret;
 }
 
-struct cgroup_subsys blkio_subsys = {
-	.name = "blkio",
+struct cgroup_subsys blkio_cgrp_subsys = {
 	.css_alloc = blkcg_css_alloc,
 	.css_offline = blkcg_css_offline,
 	.css_free = blkcg_css_free,
 	.can_attach = blkcg_can_attach,
-	.subsys_id = blkio_subsys_id,
 	.base_cftypes = blkcg_files,
-	.module = THIS_MODULE,
 };
-EXPORT_SYMBOL_GPL(blkio_subsys);
+EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
 
 /**
  * blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -1106,7 +1103,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 
 	/* everything is in place, add intf files for the new policy */
 	if (pol->cftypes)
-		WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
+		WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes));
 	ret = 0;
 out_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 604f6d99ab92..371fe8e92ab5 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -186,7 +186,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 
 static inline struct blkcg *task_blkcg(struct task_struct *tsk)
 {
-	return css_to_blkcg(task_css(tsk, blkio_subsys_id));
+	return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
 }
 
 static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -241,12 +241,16 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
  */
 static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
 {
-	int ret;
+	char *p;
 
-	ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-	if (ret)
+	p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+	if (!p) {
 		strncpy(buf, "<unavailable>", buflen);
-	return ret;
+		return -ENAMETOOLONG;
+	}
+
+	memmove(buf, p, buf + buflen - p);
+	return 0;
 }
 
 /**
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 1474c3ab7e72..033745cd7fba 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1408,13 +1408,13 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,
 }
 
 static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft,
-			   const char *buf)
+			   char *buf)
 {
 	return tg_set_conf(css, cft, buf, true);
 }
 
 static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft,
-			    const char *buf)
+			    char *buf)
 {
 	return tg_set_conf(css, cft, buf, false);
 }
@@ -1425,28 +1425,24 @@ static struct cftype throtl_files[] = {
 		.private = offsetof(struct throtl_grp, bps[READ]),
 		.seq_show = tg_print_conf_u64,
 		.write_string = tg_set_conf_u64,
-		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.write_bps_device",
 		.private = offsetof(struct throtl_grp, bps[WRITE]),
 		.seq_show = tg_print_conf_u64,
 		.write_string = tg_set_conf_u64,
-		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.read_iops_device",
 		.private = offsetof(struct throtl_grp, iops[READ]),
 		.seq_show = tg_print_conf_uint,
 		.write_string = tg_set_conf_uint,
-		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.write_iops_device",
 		.private = offsetof(struct throtl_grp, iops[WRITE]),
 		.seq_show = tg_print_conf_uint,
 		.write_string = tg_set_conf_uint,
-		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.io_service_bytes",
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5873e4ada9eb..e0985f1955e7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1701,13 +1701,13 @@ static int __cfqg_set_weight_device(struct cgroup_subsys_state *css,
 }
 
 static int cfqg_set_weight_device(struct cgroup_subsys_state *css,
-				  struct cftype *cft, const char *buf)
+				  struct cftype *cft, char *buf)
 {
 	return __cfqg_set_weight_device(css, cft, buf, false);
 }
 
 static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css,
-				       struct cftype *cft, const char *buf)
+				       struct cftype *cft, char *buf)
 {
 	return __cfqg_set_weight_device(css, cft, buf, true);
 }
@@ -1838,7 +1838,6 @@ static struct cftype cfq_blkcg_files[] = {
 		.flags = CFTYPE_ONLY_ON_ROOT,
 		.seq_show = cfqg_print_leaf_weight_device,
 		.write_string = cfqg_set_leaf_weight_device,
-		.max_write_len = 256,
 	},
 	{
 		.name = "weight",
@@ -1853,7 +1852,6 @@ static struct cftype cfq_blkcg_files[] = {
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cfqg_print_weight_device,
 		.write_string = cfqg_set_weight_device,
-		.max_write_len = 256,
 	},
 	{
 		.name = "weight",
@@ -1866,7 +1864,6 @@ static struct cftype cfq_blkcg_files[] = {
 		.name = "leaf_weight_device",
 		.seq_show = cfqg_print_leaf_weight_device,
 		.write_string = cfqg_set_leaf_weight_device,
-		.max_write_len = 256,
 	},
 	{
 		.name = "leaf_weight",
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 26b3d9d1409f..af7b204b9215 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -13,6 +13,7 @@ menuconfig VFIO
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
 	select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
+	select ANON_INODES
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 21271d8df023..512f479d8a50 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1413,6 +1413,12 @@ int vfio_external_user_iommu_id(struct vfio_group *group)
 }
 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
 
+long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
+{
+	return vfio_ioctl_check_extension(group->container, arg);
+}
+EXPORT_SYMBOL_GPL(vfio_external_check_extension);
+
 /**
  * Module/class support
  */
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 54af4e933695..6673e7be507f 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -30,7 +30,6 @@
 #include <linux/iommu.h>
 #include <linux/module.h>
 #include <linux/mm.h>
-#include <linux/pci.h>		/* pci_bus_type */
 #include <linux/rbtree.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -55,11 +54,17 @@ MODULE_PARM_DESC(disable_hugepages,
55 "Disable VFIO IOMMU support for IOMMU hugepages."); 54 "Disable VFIO IOMMU support for IOMMU hugepages.");
56 55
57struct vfio_iommu { 56struct vfio_iommu {
58 struct iommu_domain *domain; 57 struct list_head domain_list;
59 struct mutex lock; 58 struct mutex lock;
60 struct rb_root dma_list; 59 struct rb_root dma_list;
60 bool v2;
61};
62
63struct vfio_domain {
64 struct iommu_domain *domain;
65 struct list_head next;
61 struct list_head group_list; 66 struct list_head group_list;
62 bool cache; 67 int prot; /* IOMMU_CACHE */
63}; 68};
64 69
65struct vfio_dma { 70struct vfio_dma {
@@ -99,7 +104,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
 	return NULL;
 }
 
-static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
+static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 {
 	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
 	struct vfio_dma *dma;
@@ -118,7 +123,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 	rb_insert_color(&new->node, &iommu->dma_list);
 }
 
-static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
+static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 {
 	rb_erase(&old->node, &iommu->dma_list);
 }
@@ -322,32 +327,39 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
 	return unlocked;
 }
 
-static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
-			    dma_addr_t iova, size_t *size)
+static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
-	dma_addr_t start = iova, end = iova + *size;
+	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
+	struct vfio_domain *domain, *d;
 	long unlocked = 0;
 
+	if (!dma->size)
+		return;
+	/*
+	 * We use the IOMMU to track the physical addresses, otherwise we'd
+	 * need a much more complicated tracking system. Unfortunately that
+	 * means we need to use one of the iommu domains to figure out the
+	 * pfns to unpin. The rest need to be unmapped in advance so we have
+	 * no iommu translations remaining when the pages are unpinned.
+	 */
+	domain = d = list_first_entry(&iommu->domain_list,
+				      struct vfio_domain, next);
+
+	list_for_each_entry_continue(d, &iommu->domain_list, next)
+		iommu_unmap(d->domain, dma->iova, dma->size);
+
 	while (iova < end) {
 		size_t unmapped;
 		phys_addr_t phys;
 
-		/*
-		 * We use the IOMMU to track the physical address. This
-		 * saves us from having a lot more entries in our mapping
-		 * tree. The downside is that we don't track the size
-		 * used to do the mapping. We request unmap of a single
-		 * page, but expect IOMMUs that support large pages to
-		 * unmap a larger chunk.
-		 */
-		phys = iommu_iova_to_phys(iommu->domain, iova);
+		phys = iommu_iova_to_phys(domain->domain, iova);
 		if (WARN_ON(!phys)) {
 			iova += PAGE_SIZE;
 			continue;
 		}
 
-		unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
-		if (!unmapped)
+		unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
+		if (WARN_ON(!unmapped))
 			break;
 
 		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
@@ -357,119 +369,26 @@ static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	}
 
 	vfio_lock_acct(-unlocked);
-
-	*size = iova - start;
-
-	return 0;
 }
 
-static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
-				   size_t *size, struct vfio_dma *dma)
+static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
-	size_t offset, overlap, tmp;
-	struct vfio_dma *split;
-	int ret;
-
-	if (!*size)
-		return 0;
-
-	/*
-	 * Existing dma region is completely covered, unmap all. This is
-	 * the likely case since userspace tends to map and unmap buffers
-	 * in one shot rather than multiple mappings within a buffer.
-	 */
-	if (likely(start <= dma->iova &&
-		   start + *size >= dma->iova + dma->size)) {
-		*size = dma->size;
-		ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
-		if (ret)
-			return ret;
-
-		/*
-		 * Did we remove more than we have? Should never happen
-		 * since a vfio_dma is contiguous in iova and vaddr.
-		 */
-		WARN_ON(*size != dma->size);
-
-		vfio_remove_dma(iommu, dma);
-		kfree(dma);
-		return 0;
-	}
-
-	/* Overlap low address of existing range */
-	if (start <= dma->iova) {
-		overlap = start + *size - dma->iova;
-		ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
-		if (ret)
-			return ret;
-
-		vfio_remove_dma(iommu, dma);
-
-		/*
-		 * Check, we may have removed to whole vfio_dma. If not
-		 * fixup and re-insert.
-		 */
-		if (overlap < dma->size) {
-			dma->iova += overlap;
-			dma->vaddr += overlap;
-			dma->size -= overlap;
-			vfio_insert_dma(iommu, dma);
-		} else
-			kfree(dma);
-
-		*size = overlap;
-		return 0;
-	}
-
-	/* Overlap high address of existing range */
-	if (start + *size >= dma->iova + dma->size) {
-		offset = start - dma->iova;
-		overlap = dma->size - offset;
-
-		ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
-		if (ret)
-			return ret;
-
-		dma->size -= overlap;
-		*size = overlap;
-		return 0;
-	}
-
-	/* Split existing */
-
-	/*
-	 * Allocate our tracking structure early even though it may not
-	 * be used. An Allocation failure later loses track of pages and
-	 * is more difficult to unwind.
-	 */
-	split = kzalloc(sizeof(*split), GFP_KERNEL);
-	if (!split)
-		return -ENOMEM;
-
-	offset = start - dma->iova;
-
-	ret = vfio_unmap_unpin(iommu, dma, start, size);
-	if (ret || !*size) {
-		kfree(split);
-		return ret;
-	}
-
-	tmp = dma->size;
+	vfio_unmap_unpin(iommu, dma);
+	vfio_unlink_dma(iommu, dma);
+	kfree(dma);
+}
 
-	/* Resize the lower vfio_dma in place, before the below insert */
-	dma->size = offset;
+static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
+{
+	struct vfio_domain *domain;
+	unsigned long bitmap = PAGE_MASK;
 
-	/* Insert new for remainder, assuming it didn't all get unmapped */
-	if (likely(offset + *size < tmp)) {
-		split->size = tmp - offset - *size;
-		split->iova = dma->iova + offset + *size;
-		split->vaddr = dma->vaddr + offset + *size;
-		split->prot = dma->prot;
-		vfio_insert_dma(iommu, split);
-	} else
-		kfree(split);
+	mutex_lock(&iommu->lock);
+	list_for_each_entry(domain, &iommu->domain_list, next)
+		bitmap &= domain->domain->ops->pgsize_bitmap;
+	mutex_unlock(&iommu->lock);
 
-	return 0;
+	return bitmap;
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
@@ -477,10 +396,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 {
 	uint64_t mask;
 	struct vfio_dma *dma;
-	size_t unmapped = 0, size;
+	size_t unmapped = 0;
 	int ret = 0;
 
-	mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
+	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
 
 	if (unmap->iova & mask)
 		return -EINVAL;
@@ -491,20 +410,61 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 
 	mutex_lock(&iommu->lock);
 
+	/*
+	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
+	 * avoid tracking individual mappings. This means that the granularity
+	 * of the original mapping was lost and the user was allowed to attempt
+	 * to unmap any range. Depending on the contiguousness of physical
+	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
+	 * or may not have worked. We only guaranteed unmap granularity
+	 * matching the original mapping; even though it was untracked here,
+	 * the original mappings are reflected in IOMMU mappings. This
+	 * resulted in a couple unusual behaviors. First, if a range is not
+	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
+	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
+	 * a zero sized unmap. Also, if an unmap request overlaps the first
+	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
+	 * This also returns success and the returned unmap size reflects the
+	 * actual size unmapped.
+	 *
+	 * We attempt to maintain compatibility with this "v1" interface, but
+	 * we take control out of the hands of the IOMMU. Therefore, an unmap
+	 * request offset from the beginning of the original mapping will
+	 * return success with zero sized unmap. And an unmap request covering
+	 * the first iova of mapping will unmap the entire range.
+	 *
+	 * The v2 version of this interface intends to be more deterministic.
+	 * Unmap requests must fully cover previous mappings. Multiple
+	 * mappings may still be unmaped by specifying large ranges, but there
+	 * must not be any previous mappings bisected by the range. An error
+	 * will be returned if these conditions are not met. The v2 interface
+	 * will only return success and a size of zero if there were no
+	 * mappings within the range.
+	 */
+	if (iommu->v2) {
+		dma = vfio_find_dma(iommu, unmap->iova, 0);
+		if (dma && dma->iova != unmap->iova) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
+		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+	}
+
 	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
-		size = unmap->size;
-		ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
-		if (ret || !size)
+		if (!iommu->v2 && unmap->iova > dma->iova)
 			break;
-		unmapped += size;
+		unmapped += dma->size;
+		vfio_remove_dma(iommu, dma);
 	}
 
+unlock:
 	mutex_unlock(&iommu->lock);
 
-	/*
-	 * We may unmap more than requested, update the unmap struct so
-	 * userspace can know.
-	 */
+	/* Report how much was unmapped */
 	unmap->size = unmapped;
 
 	return ret;
@@ -516,22 +476,47 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
  * soon, so this is just a temporary workaround to break mappings down into
  * PAGE_SIZE. Better to map smaller pages than nothing.
  */
-static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
+static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
 			  unsigned long pfn, long npage, int prot)
 {
 	long i;
 	int ret;
 
 	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
-		ret = iommu_map(iommu->domain, iova,
+		ret = iommu_map(domain->domain, iova,
 				(phys_addr_t)pfn << PAGE_SHIFT,
-				PAGE_SIZE, prot);
+				PAGE_SIZE, prot | domain->prot);
 		if (ret)
 			break;
 	}
 
 	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
-		iommu_unmap(iommu->domain, iova, PAGE_SIZE);
+		iommu_unmap(domain->domain, iova, PAGE_SIZE);
+
+	return ret;
+}
+
+static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
+			  unsigned long pfn, long npage, int prot)
+{
+	struct vfio_domain *d;
+	int ret;
+
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
+				npage << PAGE_SHIFT, prot | d->prot);
+		if (ret) {
+			if (ret != -EBUSY ||
+			    map_try_harder(d, iova, pfn, npage, prot))
+				goto unwind;
+		}
+	}
+
+	return 0;
+
+unwind:
+	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
+		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
 
 	return ret;
 }
@@ -545,12 +530,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	long npage;
 	int ret = 0, prot = 0;
 	uint64_t mask;
-	struct vfio_dma *dma = NULL;
+	struct vfio_dma *dma;
 	unsigned long pfn;
 
 	end = map->iova + map->size;
 
-	mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
+	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
 
 	/* READ/WRITE from device perspective */
 	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
@@ -561,9 +546,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (!prot)
 		return -EINVAL; /* No READ/WRITE? */
 
-	if (iommu->cache)
-		prot |= IOMMU_CACHE;
-
 	if (vaddr & mask)
 		return -EINVAL;
 	if (map->iova & mask)
@@ -588,180 +570,257 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 		return -EEXIST;
 	}
 
-	for (iova = map->iova; iova < end; iova += size, vaddr += size) {
-		long i;
+	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+	if (!dma) {
+		mutex_unlock(&iommu->lock);
+		return -ENOMEM;
+	}
+
+	dma->iova = map->iova;
+	dma->vaddr = map->vaddr;
+	dma->prot = prot;
+
+	/* Insert zero-sized and grow as we map chunks of it */
+	vfio_link_dma(iommu, dma);
 
+	for (iova = map->iova; iova < end; iova += size, vaddr += size) {
 		/* Pin a contiguous chunk of memory */
 		npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
 				       prot, &pfn);
 		if (npage <= 0) {
 			WARN_ON(!npage);
 			ret = (int)npage;
-			goto out;
-		}
-
-		/* Verify pages are not already mapped */
-		for (i = 0; i < npage; i++) {
-			if (iommu_iova_to_phys(iommu->domain,
-					       iova + (i << PAGE_SHIFT))) {
-				ret = -EBUSY;
-				goto out_unpin;
-			}
+			break;
 		}
 
-		ret = iommu_map(iommu->domain, iova,
-				(phys_addr_t)pfn << PAGE_SHIFT,
-				npage << PAGE_SHIFT, prot);
+		/* Map it! */
+		ret = vfio_iommu_map(iommu, iova, pfn, npage, prot);
 		if (ret) {
-			if (ret != -EBUSY ||
-			    map_try_harder(iommu, iova, pfn, npage, prot)) {
-				goto out_unpin;
-			}
+			vfio_unpin_pages(pfn, npage, prot, true);
+			break;
 		}
 
 		size = npage << PAGE_SHIFT;
+		dma->size += size;
+	}
 
-		/*
-		 * Check if we abut a region below - nothing below 0.
-		 * This is the most likely case when mapping chunks of
-		 * physically contiguous regions within a virtual address
-		 * range. Update the abutting entry in place since iova
-		 * doesn't change.
-		 */
-		if (likely(iova)) {
-			struct vfio_dma *tmp;
-			tmp = vfio_find_dma(iommu, iova - 1, 1);
-			if (tmp && tmp->prot == prot &&
-			    tmp->vaddr + tmp->size == vaddr) {
-				tmp->size += size;
-				iova = tmp->iova;
-				size = tmp->size;
-				vaddr = tmp->vaddr;
-				dma = tmp;
-			}
-		}
+	if (ret)
+		vfio_remove_dma(iommu, dma);
 
-		/*
-		 * Check if we abut a region above - nothing above ~0 + 1.
-		 * If we abut above and below, remove and free. If only
-		 * abut above, remove, modify, reinsert.
-		 */
-		if (likely(iova + size)) {
-			struct vfio_dma *tmp;
-			tmp = vfio_find_dma(iommu, iova + size, 1);
-			if (tmp && tmp->prot == prot &&
-			    tmp->vaddr == vaddr + size) {
-				vfio_remove_dma(iommu, tmp);
-				if (dma) {
-					dma->size += tmp->size;
-					kfree(tmp);
-				} else {
-					size += tmp->size;
-					tmp->size = size;
-					tmp->iova = iova;
-					tmp->vaddr = vaddr;
-					vfio_insert_dma(iommu, tmp);
-					dma = tmp;
-				}
-			}
-		}
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static int vfio_bus_type(struct device *dev, void *data)
+{
+	struct bus_type **bus = data;
+
+	if (*bus && *bus != dev->bus)
+		return -EINVAL;
+
+	*bus = dev->bus;
+
+	return 0;
+}
+
+static int vfio_iommu_replay(struct vfio_iommu *iommu,
+			     struct vfio_domain *domain)
+{
+	struct vfio_domain *d;
+	struct rb_node *n;
+	int ret;
+
+	/* Arbitrarily pick the first domain in the list for lookups */
+	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
+	n = rb_first(&iommu->dma_list);
+
+	/* If there's not a domain, there better not be any mappings */
+	if (WARN_ON(n && !d))
+		return -EINVAL;
+
+	for (; n; n = rb_next(n)) {
+		struct vfio_dma *dma;
+		dma_addr_t iova;
+
+		dma = rb_entry(n, struct vfio_dma, node);
+		iova = dma->iova;
+
+		while (iova < dma->iova + dma->size) {
+			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
+			size_t size;
 
-		if (!dma) {
-			dma = kzalloc(sizeof(*dma), GFP_KERNEL);
-			if (!dma) {
-				iommu_unmap(iommu->domain, iova, size);
-				ret = -ENOMEM;
-				goto out_unpin;
+			if (WARN_ON(!phys)) {
+				iova += PAGE_SIZE;
+				continue;
 			}
 
-			dma->size = size;
-			dma->iova = iova;
-			dma->vaddr = vaddr;
-			dma->prot = prot;
-			vfio_insert_dma(iommu, dma);
-		}
-	}
+			size = PAGE_SIZE;
 
-	WARN_ON(ret);
-	mutex_unlock(&iommu->lock);
+			while (iova + size < dma->iova + dma->size &&
+			       phys + size == iommu_iova_to_phys(d->domain,
687 return ret; 661 iova + size))
662 size += PAGE_SIZE;
688 663
689out_unpin: 664 ret = iommu_map(domain->domain, iova, phys,
690 vfio_unpin_pages(pfn, npage, prot, true); 665 size, dma->prot | domain->prot);
666 if (ret)
667 return ret;
691 668
692out: 669 iova += size;
693 iova = map->iova; 670 }
694 size = map->size;
695 while ((dma = vfio_find_dma(iommu, iova, size))) {
696 int r = vfio_remove_dma_overlap(iommu, iova,
697 &size, dma);
698 if (WARN_ON(r || !size))
699 break;
700 } 671 }
701 672
702 mutex_unlock(&iommu->lock); 673 return 0;
703 return ret;
704} 674}
705 675
706static int vfio_iommu_type1_attach_group(void *iommu_data, 676static int vfio_iommu_type1_attach_group(void *iommu_data,
707 struct iommu_group *iommu_group) 677 struct iommu_group *iommu_group)
708{ 678{
709 struct vfio_iommu *iommu = iommu_data; 679 struct vfio_iommu *iommu = iommu_data;
710 struct vfio_group *group, *tmp; 680 struct vfio_group *group, *g;
681 struct vfio_domain *domain, *d;
682 struct bus_type *bus = NULL;
711 int ret; 683 int ret;
712 684
713 group = kzalloc(sizeof(*group), GFP_KERNEL);
714 if (!group)
715 return -ENOMEM;
716
717 mutex_lock(&iommu->lock); 685 mutex_lock(&iommu->lock);
718 686
719 list_for_each_entry(tmp, &iommu->group_list, next) { 687 list_for_each_entry(d, &iommu->domain_list, next) {
720 if (tmp->iommu_group == iommu_group) { 688 list_for_each_entry(g, &d->group_list, next) {
689 if (g->iommu_group != iommu_group)
690 continue;
691
721 mutex_unlock(&iommu->lock); 692 mutex_unlock(&iommu->lock);
722 kfree(group);
723 return -EINVAL; 693 return -EINVAL;
724 } 694 }
725 } 695 }
726 696
697 group = kzalloc(sizeof(*group), GFP_KERNEL);
698 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
699 if (!group || !domain) {
700 ret = -ENOMEM;
701 goto out_free;
702 }
703
704 group->iommu_group = iommu_group;
705
706 /* Determine bus_type in order to allocate a domain */
707 ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
708 if (ret)
709 goto out_free;
710
711 domain->domain = iommu_domain_alloc(bus);
712 if (!domain->domain) {
713 ret = -EIO;
714 goto out_free;
715 }
716
717 ret = iommu_attach_group(domain->domain, iommu_group);
718 if (ret)
719 goto out_domain;
720
721 INIT_LIST_HEAD(&domain->group_list);
722 list_add(&group->next, &domain->group_list);
723
724 if (!allow_unsafe_interrupts &&
725 !iommu_domain_has_cap(domain->domain, IOMMU_CAP_INTR_REMAP)) {
726 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
727 __func__);
728 ret = -EPERM;
729 goto out_detach;
730 }
731
732 if (iommu_domain_has_cap(domain->domain, IOMMU_CAP_CACHE_COHERENCY))
733 domain->prot |= IOMMU_CACHE;
734
727 /* 735 /*
728 * TODO: Domain have capabilities that might change as we add 736 * Try to match an existing compatible domain. We don't want to
729 * groups (see iommu->cache, currently never set). Check for 737 * preclude an IOMMU driver supporting multiple bus_types and being
730 * them and potentially disallow groups to be attached when it 738 * able to include different bus_types in the same IOMMU domain, so
731 * would change capabilities (ugh). 739 * we test whether the domains use the same iommu_ops rather than
740 * testing if they're on the same bus_type.
732 */ 741 */
733 ret = iommu_attach_group(iommu->domain, iommu_group); 742 list_for_each_entry(d, &iommu->domain_list, next) {
734 if (ret) { 743 if (d->domain->ops == domain->domain->ops &&
735 mutex_unlock(&iommu->lock); 744 d->prot == domain->prot) {
736 kfree(group); 745 iommu_detach_group(domain->domain, iommu_group);
737 return ret; 746 if (!iommu_attach_group(d->domain, iommu_group)) {
747 list_add(&group->next, &d->group_list);
748 iommu_domain_free(domain->domain);
749 kfree(domain);
750 mutex_unlock(&iommu->lock);
751 return 0;
752 }
753
754 ret = iommu_attach_group(domain->domain, iommu_group);
755 if (ret)
756 goto out_domain;
757 }
738 } 758 }
739 759
740 group->iommu_group = iommu_group; 760 /* replay mappings on new domains */
741 list_add(&group->next, &iommu->group_list); 761 ret = vfio_iommu_replay(iommu, domain);
762 if (ret)
763 goto out_detach;
764
765 list_add(&domain->next, &iommu->domain_list);
742 766
743 mutex_unlock(&iommu->lock); 767 mutex_unlock(&iommu->lock);
744 768
745 return 0; 769 return 0;
770
771out_detach:
772 iommu_detach_group(domain->domain, iommu_group);
773out_domain:
774 iommu_domain_free(domain->domain);
775out_free:
776 kfree(domain);
777 kfree(group);
778 mutex_unlock(&iommu->lock);
779 return ret;
780}
781
782static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
783{
784 struct rb_node *node;
785
786 while ((node = rb_first(&iommu->dma_list)))
787 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
746} 788}
747 789
748static void vfio_iommu_type1_detach_group(void *iommu_data, 790static void vfio_iommu_type1_detach_group(void *iommu_data,
749 struct iommu_group *iommu_group) 791 struct iommu_group *iommu_group)
750{ 792{
751 struct vfio_iommu *iommu = iommu_data; 793 struct vfio_iommu *iommu = iommu_data;
794 struct vfio_domain *domain;
752 struct vfio_group *group; 795 struct vfio_group *group;
753 796
754 mutex_lock(&iommu->lock); 797 mutex_lock(&iommu->lock);
755 798
756 list_for_each_entry(group, &iommu->group_list, next) { 799 list_for_each_entry(domain, &iommu->domain_list, next) {
757 if (group->iommu_group == iommu_group) { 800 list_for_each_entry(group, &domain->group_list, next) {
758 iommu_detach_group(iommu->domain, iommu_group); 801 if (group->iommu_group != iommu_group)
802 continue;
803
804 iommu_detach_group(domain->domain, iommu_group);
759 list_del(&group->next); 805 list_del(&group->next);
760 kfree(group); 806 kfree(group);
761 break; 807 /*
808 * Group ownership provides privilege, if the group
809 * list is empty, the domain goes away. If it's the
810 * last domain, then all the mappings go away too.
811 */
812 if (list_empty(&domain->group_list)) {
813 if (list_is_singular(&iommu->domain_list))
814 vfio_iommu_unmap_unpin_all(iommu);
815 iommu_domain_free(domain->domain);
816 list_del(&domain->next);
817 kfree(domain);
818 }
819 goto done;
762 } 820 }
763 } 821 }
764 822
823done:
765 mutex_unlock(&iommu->lock); 824 mutex_unlock(&iommu->lock);
766} 825}
767 826
@@ -769,40 +828,17 @@ static void *vfio_iommu_type1_open(unsigned long arg)
769{ 828{
770 struct vfio_iommu *iommu; 829 struct vfio_iommu *iommu;
771 830
772 if (arg != VFIO_TYPE1_IOMMU) 831 if (arg != VFIO_TYPE1_IOMMU && arg != VFIO_TYPE1v2_IOMMU)
773 return ERR_PTR(-EINVAL); 832 return ERR_PTR(-EINVAL);
774 833
775 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); 834 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
776 if (!iommu) 835 if (!iommu)
777 return ERR_PTR(-ENOMEM); 836 return ERR_PTR(-ENOMEM);
778 837
779 INIT_LIST_HEAD(&iommu->group_list); 838 INIT_LIST_HEAD(&iommu->domain_list);
780 iommu->dma_list = RB_ROOT; 839 iommu->dma_list = RB_ROOT;
781 mutex_init(&iommu->lock); 840 mutex_init(&iommu->lock);
782 841 iommu->v2 = (arg == VFIO_TYPE1v2_IOMMU);
783 /*
784 * Wish we didn't have to know about bus_type here.
785 */
786 iommu->domain = iommu_domain_alloc(&pci_bus_type);
787 if (!iommu->domain) {
788 kfree(iommu);
789 return ERR_PTR(-EIO);
790 }
791
792 /*
793 * Wish we could specify required capabilities rather than create
794 * a domain, see what comes out and hope it doesn't change along
795 * the way. Fortunately we know interrupt remapping is global for
796 * our iommus.
797 */
798 if (!allow_unsafe_interrupts &&
799 !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
800 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
801 __func__);
802 iommu_domain_free(iommu->domain);
803 kfree(iommu);
804 return ERR_PTR(-EPERM);
805 }
806 842
807 return iommu; 843 return iommu;
808} 844}
@@ -810,26 +846,42 @@ static void *vfio_iommu_type1_open(unsigned long arg)
810static void vfio_iommu_type1_release(void *iommu_data) 846static void vfio_iommu_type1_release(void *iommu_data)
811{ 847{
812 struct vfio_iommu *iommu = iommu_data; 848 struct vfio_iommu *iommu = iommu_data;
849 struct vfio_domain *domain, *domain_tmp;
813 struct vfio_group *group, *group_tmp; 850 struct vfio_group *group, *group_tmp;
814 struct rb_node *node;
815 851
816 list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { 852 vfio_iommu_unmap_unpin_all(iommu);
817 iommu_detach_group(iommu->domain, group->iommu_group); 853
818 list_del(&group->next); 854 list_for_each_entry_safe(domain, domain_tmp,
819 kfree(group); 855 &iommu->domain_list, next) {
856 list_for_each_entry_safe(group, group_tmp,
857 &domain->group_list, next) {
858 iommu_detach_group(domain->domain, group->iommu_group);
859 list_del(&group->next);
860 kfree(group);
861 }
862 iommu_domain_free(domain->domain);
863 list_del(&domain->next);
864 kfree(domain);
820 } 865 }
821 866
822 while ((node = rb_first(&iommu->dma_list))) { 867 kfree(iommu);
823 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); 868}
824 size_t size = dma->size; 869
825 vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); 870static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
826 if (WARN_ON(!size)) 871{
872 struct vfio_domain *domain;
873 int ret = 1;
874
875 mutex_lock(&iommu->lock);
876 list_for_each_entry(domain, &iommu->domain_list, next) {
877 if (!(domain->prot & IOMMU_CACHE)) {
878 ret = 0;
827 break; 879 break;
880 }
828 } 881 }
882 mutex_unlock(&iommu->lock);
829 883
830 iommu_domain_free(iommu->domain); 884 return ret;
831 iommu->domain = NULL;
832 kfree(iommu);
833} 885}
834 886
835static long vfio_iommu_type1_ioctl(void *iommu_data, 887static long vfio_iommu_type1_ioctl(void *iommu_data,
@@ -841,7 +893,12 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
841 if (cmd == VFIO_CHECK_EXTENSION) { 893 if (cmd == VFIO_CHECK_EXTENSION) {
842 switch (arg) { 894 switch (arg) {
843 case VFIO_TYPE1_IOMMU: 895 case VFIO_TYPE1_IOMMU:
896 case VFIO_TYPE1v2_IOMMU:
844 return 1; 897 return 1;
898 case VFIO_DMA_CC_IOMMU:
899 if (!iommu)
900 return 0;
901 return vfio_domains_have_iommu_cache(iommu);
845 default: 902 default:
846 return 0; 903 return 0;
847 } 904 }
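
From userspace the two new extensions are visible through VFIO_CHECK_EXTENSION: VFIO_TYPE1v2_IOMMU selects the v2 semantics at VFIO_SET_IOMMU time, and VFIO_DMA_CC_IOMMU reports whether every domain in the container enforces cache coherency. A hedged sketch, assuming a container fd that already has at least one group added (VFIO_SET_IOMMU requires that); error handling trimmed:

        #include <sys/ioctl.h>
        #include <linux/vfio.h>

        static int sketch_pick_iommu(int container)
        {
                /* Prefer the v2 semantics when the kernel advertises them. */
                if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU) > 0) {
                        if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU))
                                return -1;
                } else if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
                        return -1;
                }

                /* With an IOMMU bound, 1 means all domains are cache coherent. */
                return ioctl(container, VFIO_CHECK_EXTENSION, VFIO_DMA_CC_IOMMU) > 0;
        }
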
@@ -858,7 +915,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
858 915
859 info.flags = 0; 916 info.flags = 0;
860 917
861 info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap; 918 info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
862 919
863 return copy_to_user((void __user *)arg, &info, minsz); 920 return copy_to_user((void __user *)arg, &info, minsz);
864 921
@@ -911,9 +968,6 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
911 968
912static int __init vfio_iommu_type1_init(void) 969static int __init vfio_iommu_type1_init(void)
913{ 970{
914 if (!iommu_present(&pci_bus_type))
915 return -ENODEV;
916
917 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); 971 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
918} 972}
919 973
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index c3458f58de90..d5a3de88ac59 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -388,10 +388,10 @@ static void xen_irq_init(unsigned irq)
388 list_add_tail(&info->list, &xen_irq_list_head); 388 list_add_tail(&info->list, &xen_irq_list_head);
389} 389}
390 390
391static int __must_check xen_allocate_irq_dynamic(void) 391static int __must_check xen_allocate_irqs_dynamic(int nvec)
392{ 392{
393 int first = 0; 393 int first = 0;
394 int irq; 394 int i, irq;
395 395
396#ifdef CONFIG_X86_IO_APIC 396#ifdef CONFIG_X86_IO_APIC
397 /* 397 /*
@@ -405,14 +405,22 @@ static int __must_check xen_allocate_irq_dynamic(void)
405 first = get_nr_irqs_gsi(); 405 first = get_nr_irqs_gsi();
406#endif 406#endif
407 407
408 irq = irq_alloc_desc_from(first, -1); 408 irq = irq_alloc_descs_from(first, nvec, -1);
409 409
410 if (irq >= 0) 410 if (irq >= 0) {
411 xen_irq_init(irq); 411 for (i = 0; i < nvec; i++)
412 xen_irq_init(irq + i);
413 }
412 414
413 return irq; 415 return irq;
414} 416}
415 417
418static inline int __must_check xen_allocate_irq_dynamic(void)
419{
420
421 return xen_allocate_irqs_dynamic(1);
422}
423
416static int __must_check xen_allocate_irq_gsi(unsigned gsi) 424static int __must_check xen_allocate_irq_gsi(unsigned gsi)
417{ 425{
418 int irq; 426 int irq;
@@ -466,9 +474,6 @@ static void xen_evtchn_close(unsigned int port)
466 close.port = port; 474 close.port = port;
467 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) 475 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
468 BUG(); 476 BUG();
469
470 /* Closed ports are implicitly re-bound to VCPU0. */
471 bind_evtchn_to_cpu(port, 0);
472} 477}
473 478
474static void pirq_query_unmask(int irq) 479static void pirq_query_unmask(int irq)
@@ -730,22 +735,25 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
730} 735}
731 736
732int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, 737int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
733 int pirq, const char *name, domid_t domid) 738 int pirq, int nvec, const char *name, domid_t domid)
734{ 739{
735 int irq, ret; 740 int i, irq, ret;
736 741
737 mutex_lock(&irq_mapping_update_lock); 742 mutex_lock(&irq_mapping_update_lock);
738 743
739 irq = xen_allocate_irq_dynamic(); 744 irq = xen_allocate_irqs_dynamic(nvec);
740 if (irq < 0) 745 if (irq < 0)
741 goto out; 746 goto out;
742 747
743 irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, 748 for (i = 0; i < nvec; i++) {
744 name); 749 irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name);
750
751 ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid,
752 i == 0 ? 0 : PIRQ_MSI_GROUP);
753 if (ret < 0)
754 goto error_irq;
755 }
745 756
746 ret = xen_irq_info_pirq_setup(irq, 0, pirq, 0, domid, 0);
747 if (ret < 0)
748 goto error_irq;
749 ret = irq_set_msi_desc(irq, msidesc); 757 ret = irq_set_msi_desc(irq, msidesc);
750 if (ret < 0) 758 if (ret < 0)
751 goto error_irq; 759 goto error_irq;
@@ -753,7 +761,8 @@ out:
753 mutex_unlock(&irq_mapping_update_lock); 761 mutex_unlock(&irq_mapping_update_lock);
754 return irq; 762 return irq;
755error_irq: 763error_irq:
756 __unbind_from_irq(irq); 764 for (; i >= 0; i--)
765 __unbind_from_irq(irq + i);
757 mutex_unlock(&irq_mapping_update_lock); 766 mutex_unlock(&irq_mapping_update_lock);
758 return ret; 767 return ret;
759} 768}
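
For multi-vector MSI the hunk above allocates nvec consecutive Linux irqs, points each at consecutive PIRQs, and tags every vector except the first with PIRQ_MSI_GROUP so only the group leader performs the PHYSDEVOP_unmap_pirq on teardown; the error path walks backwards over whatever was set up. The setup/unwind shape, factored out as a sketch (the function pointers stand in for xen_irq_info_pirq_setup() and __unbind_from_irq(); names are hypothetical):

        /*
         * Sketch: per-vector setup with backward unwind, as used above for
         * multi-vector MSI.  On failure at index i, everything up to and
         * including vector i is torn down again.
         */
        static int sketch_setup_vectors(int irq, int pirq, int nvec,
                                        int (*setup)(int irq, int pirq, bool first),
                                        void (*teardown)(int irq))
        {
                int i, ret = 0;

                for (i = 0; i < nvec; i++) {
                        ret = setup(irq + i, pirq + i, i == 0);
                        if (ret < 0)
                                break;
                }

                if (ret < 0) {
                        for (; i >= 0; i--)
                                teardown(irq + i);
                }

                return ret;
        }
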
@@ -767,7 +776,12 @@ int xen_destroy_irq(int irq)
767 776
768 mutex_lock(&irq_mapping_update_lock); 777 mutex_lock(&irq_mapping_update_lock);
769 778
770 if (xen_initial_domain()) { 779 /*
780 * If trying to remove a vector in a MSI group different
781 * than the first one skip the PIRQ unmap unless this vector
782 * is the first one in the group.
783 */
784 if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) {
771 unmap_irq.pirq = info->u.pirq.pirq; 785 unmap_irq.pirq = info->u.pirq.pirq;
772 unmap_irq.domid = info->u.pirq.domid; 786 unmap_irq.domid = info->u.pirq.domid;
773 rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); 787 rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
@@ -1329,26 +1343,6 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
1329 return rebind_irq_to_cpu(data->irq, tcpu); 1343 return rebind_irq_to_cpu(data->irq, tcpu);
1330} 1344}
1331 1345
1332static int retrigger_evtchn(int evtchn)
1333{
1334 int masked;
1335
1336 if (!VALID_EVTCHN(evtchn))
1337 return 0;
1338
1339 masked = test_and_set_mask(evtchn);
1340 set_evtchn(evtchn);
1341 if (!masked)
1342 unmask_evtchn(evtchn);
1343
1344 return 1;
1345}
1346
1347int resend_irq_on_evtchn(unsigned int irq)
1348{
1349 return retrigger_evtchn(evtchn_from_irq(irq));
1350}
1351
1352static void enable_dynirq(struct irq_data *data) 1346static void enable_dynirq(struct irq_data *data)
1353{ 1347{
1354 int evtchn = evtchn_from_irq(data->irq); 1348 int evtchn = evtchn_from_irq(data->irq);
@@ -1383,7 +1377,18 @@ static void mask_ack_dynirq(struct irq_data *data)
1383 1377
1384static int retrigger_dynirq(struct irq_data *data) 1378static int retrigger_dynirq(struct irq_data *data)
1385{ 1379{
1386 return retrigger_evtchn(evtchn_from_irq(data->irq)); 1380 unsigned int evtchn = evtchn_from_irq(data->irq);
1381 int masked;
1382
1383 if (!VALID_EVTCHN(evtchn))
1384 return 0;
1385
1386 masked = test_and_set_mask(evtchn);
1387 set_evtchn(evtchn);
1388 if (!masked)
1389 unmask_evtchn(evtchn);
1390
1391 return 1;
1387} 1392}
1388 1393
1389static void restore_pirqs(void) 1394static void restore_pirqs(void)
diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h
index 677f41a0fff9..50c2050a1e32 100644
--- a/drivers/xen/events/events_internal.h
+++ b/drivers/xen/events/events_internal.h
@@ -53,6 +53,7 @@ struct irq_info {
53 53
54#define PIRQ_NEEDS_EOI (1 << 0) 54#define PIRQ_NEEDS_EOI (1 << 0)
55#define PIRQ_SHAREABLE (1 << 1) 55#define PIRQ_SHAREABLE (1 << 1)
56#define PIRQ_MSI_GROUP (1 << 2)
56 57
57struct evtchn_ops { 58struct evtchn_ops {
58 unsigned (*max_channels)(void); 59 unsigned (*max_channels)(void);
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index b84e3ab839aa..6d325bda76da 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -933,9 +933,6 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
933 struct page **pages, unsigned int count) 933 struct page **pages, unsigned int count)
934{ 934{
935 int i, ret; 935 int i, ret;
936 bool lazy = false;
937 pte_t *pte;
938 unsigned long mfn;
939 936
940 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); 937 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count);
941 if (ret) 938 if (ret)
@@ -947,45 +944,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
947 gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, 944 gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i,
948 &map_ops[i].status, __func__); 945 &map_ops[i].status, __func__);
949 946
950 /* this is basically a nop on x86 */ 947 return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count);
951 if (xen_feature(XENFEAT_auto_translated_physmap)) {
952 for (i = 0; i < count; i++) {
953 if (map_ops[i].status)
954 continue;
955 set_phys_to_machine(map_ops[i].host_addr >> PAGE_SHIFT,
956 map_ops[i].dev_bus_addr >> PAGE_SHIFT);
957 }
958 return ret;
959 }
960
961 if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
962 arch_enter_lazy_mmu_mode();
963 lazy = true;
964 }
965
966 for (i = 0; i < count; i++) {
967 /* Do not add to override if the map failed. */
968 if (map_ops[i].status)
969 continue;
970
971 if (map_ops[i].flags & GNTMAP_contains_pte) {
972 pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
973 (map_ops[i].host_addr & ~PAGE_MASK));
974 mfn = pte_mfn(*pte);
975 } else {
976 mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
977 }
978 ret = m2p_add_override(mfn, pages[i], kmap_ops ?
979 &kmap_ops[i] : NULL);
980 if (ret)
981 goto out;
982 }
983
984 out:
985 if (lazy)
986 arch_leave_lazy_mmu_mode();
987
988 return ret;
989} 948}
990EXPORT_SYMBOL_GPL(gnttab_map_refs); 949EXPORT_SYMBOL_GPL(gnttab_map_refs);
991 950
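
gnttab_map_refs() and gnttab_unmap_refs() now hand the PFN-to-MFN bookkeeping to per-architecture hooks, set_foreign_p2m_mapping() and clear_foreign_p2m_mapping(), whose implementations live under arch/*/xen/ in this series. As a rough sketch of what the map-side hook has to do on an auto-translated guest, modelled on the generic loop removed above (the real per-architecture implementations differ and may also maintain override tables):

        /*
         * Sketch only: record the machine frame for each successfully
         * mapped page, as the removed XENFEAT_auto_translated_physmap
         * branch above did.  Not an actual arch implementation.
         */
        static int sketch_set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
                                                  struct gnttab_map_grant_ref *kmap_ops,
                                                  struct page **pages,
                                                  unsigned int count)
        {
                unsigned int i;

                for (i = 0; i < count; i++) {
                        if (map_ops[i].status)
                                continue;       /* skip failed maps */
                        set_phys_to_machine(map_ops[i].host_addr >> PAGE_SHIFT,
                                            map_ops[i].dev_bus_addr >> PAGE_SHIFT);
                }
                return 0;
        }
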
@@ -993,39 +952,13 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
993 struct gnttab_map_grant_ref *kmap_ops, 952 struct gnttab_map_grant_ref *kmap_ops,
994 struct page **pages, unsigned int count) 953 struct page **pages, unsigned int count)
995{ 954{
996 int i, ret; 955 int ret;
997 bool lazy = false;
998 956
999 ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); 957 ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count);
1000 if (ret) 958 if (ret)
1001 return ret; 959 return ret;
1002 960
1003 /* this is basically a nop on x86 */ 961 return clear_foreign_p2m_mapping(unmap_ops, kmap_ops, pages, count);
1004 if (xen_feature(XENFEAT_auto_translated_physmap)) {
1005 for (i = 0; i < count; i++) {
1006 set_phys_to_machine(unmap_ops[i].host_addr >> PAGE_SHIFT,
1007 INVALID_P2M_ENTRY);
1008 }
1009 return ret;
1010 }
1011
1012 if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
1013 arch_enter_lazy_mmu_mode();
1014 lazy = true;
1015 }
1016
1017 for (i = 0; i < count; i++) {
1018 ret = m2p_remove_override(pages[i], kmap_ops ?
1019 &kmap_ops[i] : NULL);
1020 if (ret)
1021 goto out;
1022 }
1023
1024 out:
1025 if (lazy)
1026 arch_leave_lazy_mmu_mode();
1027
1028 return ret;
1029} 962}
1030EXPORT_SYMBOL_GPL(gnttab_unmap_refs); 963EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
1031 964
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 624e8dc24532..fc6c94c0b436 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -46,6 +46,20 @@ struct suspend_info {
46 void (*post)(int cancelled); 46 void (*post)(int cancelled);
47}; 47};
48 48
49static RAW_NOTIFIER_HEAD(xen_resume_notifier);
50
51void xen_resume_notifier_register(struct notifier_block *nb)
52{
53 raw_notifier_chain_register(&xen_resume_notifier, nb);
54}
55EXPORT_SYMBOL_GPL(xen_resume_notifier_register);
56
57void xen_resume_notifier_unregister(struct notifier_block *nb)
58{
59 raw_notifier_chain_unregister(&xen_resume_notifier, nb);
60}
61EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister);
62
49#ifdef CONFIG_HIBERNATE_CALLBACKS 63#ifdef CONFIG_HIBERNATE_CALLBACKS
50static void xen_hvm_post_suspend(int cancelled) 64static void xen_hvm_post_suspend(int cancelled)
51{ 65{
@@ -152,6 +166,8 @@ static void do_suspend(void)
152 166
153 err = stop_machine(xen_suspend, &si, cpumask_of(0)); 167 err = stop_machine(xen_suspend, &si, cpumask_of(0));
154 168
169 raw_notifier_call_chain(&xen_resume_notifier, 0, NULL);
170
155 dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); 171 dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
156 172
157 if (err) { 173 if (err) {
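
The raw notifier chain added above gives drivers a post-resume hook that fires as soon as xen_suspend() returns, before devices finish resuming. A minimal consumer looks like the xen-acpi-processor conversion later in this patch; sketched here with hypothetical names (the register/unregister declarations are expected to live in xen/xen-ops.h, which that driver now includes):

        #include <linux/module.h>
        #include <linux/notifier.h>
        #include <xen/xen-ops.h>

        static int sketch_xen_resumed(struct notifier_block *nb,
                                      unsigned long action, void *data)
        {
                /* Re-establish whatever state the hypervisor lost across suspend. */
                return NOTIFY_OK;
        }

        static struct notifier_block sketch_resume_nb = {
                .notifier_call = sketch_xen_resumed,
        };

        static int __init sketch_init(void)
        {
                xen_resume_notifier_register(&sketch_resume_nb);
                return 0;
        }
        module_init(sketch_init);
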
diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c
index 79e1dff7ed4f..0aac403d53fd 100644
--- a/drivers/xen/pcpu.c
+++ b/drivers/xen/pcpu.c
@@ -40,6 +40,7 @@
40#include <linux/capability.h> 40#include <linux/capability.h>
41 41
42#include <xen/xen.h> 42#include <xen/xen.h>
43#include <xen/acpi.h>
43#include <xen/xenbus.h> 44#include <xen/xenbus.h>
44#include <xen/events.h> 45#include <xen/events.h>
45#include <xen/interface/platform.h> 46#include <xen/interface/platform.h>
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index a1361c312c06..3454973dc3bb 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -45,7 +45,7 @@ static unsigned long platform_mmio_alloc;
45static unsigned long platform_mmiolen; 45static unsigned long platform_mmiolen;
46static uint64_t callback_via; 46static uint64_t callback_via;
47 47
48unsigned long alloc_xen_mmio(unsigned long len) 48static unsigned long alloc_xen_mmio(unsigned long len)
49{ 49{
50 unsigned long addr; 50 unsigned long addr;
51 51
diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c
index 7231859119f1..82358d14ecf1 100644
--- a/drivers/xen/xen-acpi-processor.c
+++ b/drivers/xen/xen-acpi-processor.c
@@ -27,10 +27,10 @@
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/types.h> 29#include <linux/types.h>
30#include <linux/syscore_ops.h>
31#include <linux/acpi.h> 30#include <linux/acpi.h>
32#include <acpi/processor.h> 31#include <acpi/processor.h>
33#include <xen/xen.h> 32#include <xen/xen.h>
33#include <xen/xen-ops.h>
34#include <xen/interface/platform.h> 34#include <xen/interface/platform.h>
35#include <asm/xen/hypercall.h> 35#include <asm/xen/hypercall.h>
36 36
@@ -495,14 +495,15 @@ static int xen_upload_processor_pm_data(void)
495 return rc; 495 return rc;
496} 496}
497 497
498static void xen_acpi_processor_resume(void) 498static int xen_acpi_processor_resume(struct notifier_block *nb,
499 unsigned long action, void *data)
499{ 500{
500 bitmap_zero(acpi_ids_done, nr_acpi_bits); 501 bitmap_zero(acpi_ids_done, nr_acpi_bits);
501 xen_upload_processor_pm_data(); 502 return xen_upload_processor_pm_data();
502} 503}
503 504
504static struct syscore_ops xap_syscore_ops = { 505struct notifier_block xen_acpi_processor_resume_nb = {
505 .resume = xen_acpi_processor_resume, 506 .notifier_call = xen_acpi_processor_resume,
506}; 507};
507 508
508static int __init xen_acpi_processor_init(void) 509static int __init xen_acpi_processor_init(void)
@@ -555,7 +556,7 @@ static int __init xen_acpi_processor_init(void)
555 if (rc) 556 if (rc)
556 goto err_unregister; 557 goto err_unregister;
557 558
558 register_syscore_ops(&xap_syscore_ops); 559 xen_resume_notifier_register(&xen_acpi_processor_resume_nb);
559 560
560 return 0; 561 return 0;
561err_unregister: 562err_unregister:
@@ -574,7 +575,7 @@ static void __exit xen_acpi_processor_exit(void)
574{ 575{
575 int i; 576 int i;
576 577
577 unregister_syscore_ops(&xap_syscore_ops); 578 xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb);
578 kfree(acpi_ids_done); 579 kfree(acpi_ids_done);
579 kfree(acpi_id_present); 580 kfree(acpi_id_present);
580 kfree(acpi_id_cst_present); 581 kfree(acpi_id_cst_present);
diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
index 64eb0cd8b8af..929dd46bb40c 100644
--- a/drivers/xen/xen-pciback/pciback_ops.c
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -213,8 +213,7 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
213 entries[i].vector = op->msix_entries[i].vector; 213 entries[i].vector = op->msix_entries[i].vector;
214 } 214 }
215 215
216 result = pci_enable_msix(dev, entries, op->value); 216 result = pci_enable_msix_exact(dev, entries, op->value);
217
218 if (result == 0) { 217 if (result == 0) {
219 for (i = 0; i < op->value; i++) { 218 for (i = 0; i < op->value; i++) {
220 op->msix_entries[i].entry = entries[i].entry; 219 op->msix_entries[i].entry = entries[i].entry;
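
pci_enable_msix() could also return a positive count meaning "only this many vectors are available", which the removed call never handled; pci_enable_msix_exact() either enables exactly op->value vectors and returns 0 or fails with a negative errno, so the existing result == 0 test is now a complete success check. A small sketch of the calling convention (kernel-style fragment, hypothetical wrapper name):

        /* With the _exact variant there is no partial-success case. */
        static int sketch_enable_exact(struct pci_dev *dev,
                                       struct msix_entry *entries, int nvec)
        {
                int rc = pci_enable_msix_exact(dev, entries, nvec);

                if (rc < 0)
                        return rc;      /* nothing was enabled */

                return 0;               /* all nvec vectors are live */
        }
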
diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c
index 745ad79c1d8e..3b2bffde534f 100644
--- a/drivers/xen/xen-selfballoon.c
+++ b/drivers/xen/xen-selfballoon.c
@@ -170,6 +170,7 @@ static void frontswap_selfshrink(void)
170 tgt_frontswap_pages = cur_frontswap_pages - 170 tgt_frontswap_pages = cur_frontswap_pages -
171 (cur_frontswap_pages / frontswap_hysteresis); 171 (cur_frontswap_pages / frontswap_hysteresis);
172 frontswap_shrink(tgt_frontswap_pages); 172 frontswap_shrink(tgt_frontswap_pages);
173 frontswap_inertia_counter = frontswap_inertia;
173} 174}
174 175
175#endif /* CONFIG_FRONTSWAP */ 176#endif /* CONFIG_FRONTSWAP */
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 01d59e66565d..439c9dca9eee 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -401,33 +401,6 @@ EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
401 401
402 402
403/** 403/**
404 * Bind to an existing interdomain event channel in another domain. Returns 0
405 * on success and stores the local port in *port. On error, returns -errno,
406 * switches the device to XenbusStateClosing, and saves the error in XenStore.
407 */
408int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
409{
410 struct evtchn_bind_interdomain bind_interdomain;
411 int err;
412
413 bind_interdomain.remote_dom = dev->otherend_id;
414 bind_interdomain.remote_port = remote_port;
415
416 err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
417 &bind_interdomain);
418 if (err)
419 xenbus_dev_fatal(dev, err,
420 "binding to event channel %d from domain %d",
421 remote_port, dev->otherend_id);
422 else
423 *port = bind_interdomain.local_port;
424
425 return err;
426}
427EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
428
429
430/**
431 * Free an existing event channel. Returns 0 on success or -errno on error. 404 * Free an existing event channel. Returns 0 on success or -errno on error.
432 */ 405 */
433int xenbus_free_evtchn(struct xenbus_device *dev, int port) 406int xenbus_free_evtchn(struct xenbus_device *dev, int port)
diff --git a/fs/bio.c b/fs/bio.c
index b2dd42ed9edd..b1bc722b89aa 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1969,7 +1969,7 @@ int bio_associate_current(struct bio *bio)
1969 1969
1970 /* associate blkcg if exists */ 1970 /* associate blkcg if exists */
1971 rcu_read_lock(); 1971 rcu_read_lock();
1972 css = task_css(current, blkio_subsys_id); 1972 css = task_css(current, blkio_cgrp_id);
1973 if (css && css_tryget(css)) 1973 if (css && css_tryget(css))
1974 bio->bi_css = css; 1974 bio->bi_css = css;
1975 rcu_read_unlock(); 1975 rcu_read_unlock();
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 0bd05ab26003..78f3403300af 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -112,6 +112,7 @@ char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
112 spin_unlock_irqrestore(&kernfs_rename_lock, flags); 112 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
113 return p; 113 return p;
114} 114}
115EXPORT_SYMBOL_GPL(kernfs_path);
115 116
116/** 117/**
117 * pr_cont_kernfs_name - pr_cont name of a kernfs_node 118 * pr_cont_kernfs_name - pr_cont name of a kernfs_node
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9450f025fe0c..c2515851c1aa 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -14,18 +14,17 @@
14#include <linux/rcupdate.h> 14#include <linux/rcupdate.h>
15#include <linux/rculist.h> 15#include <linux/rculist.h>
16#include <linux/cgroupstats.h> 16#include <linux/cgroupstats.h>
17#include <linux/prio_heap.h>
18#include <linux/rwsem.h> 17#include <linux/rwsem.h>
19#include <linux/idr.h> 18#include <linux/idr.h>
20#include <linux/workqueue.h> 19#include <linux/workqueue.h>
21#include <linux/xattr.h>
22#include <linux/fs.h> 20#include <linux/fs.h>
23#include <linux/percpu-refcount.h> 21#include <linux/percpu-refcount.h>
24#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/kernfs.h>
25 24
26#ifdef CONFIG_CGROUPS 25#ifdef CONFIG_CGROUPS
27 26
28struct cgroupfs_root; 27struct cgroup_root;
29struct cgroup_subsys; 28struct cgroup_subsys;
30struct inode; 29struct inode;
31struct cgroup; 30struct cgroup;
@@ -34,31 +33,16 @@ extern int cgroup_init_early(void);
34extern int cgroup_init(void); 33extern int cgroup_init(void);
35extern void cgroup_fork(struct task_struct *p); 34extern void cgroup_fork(struct task_struct *p);
36extern void cgroup_post_fork(struct task_struct *p); 35extern void cgroup_post_fork(struct task_struct *p);
37extern void cgroup_exit(struct task_struct *p, int run_callbacks); 36extern void cgroup_exit(struct task_struct *p);
38extern int cgroupstats_build(struct cgroupstats *stats, 37extern int cgroupstats_build(struct cgroupstats *stats,
39 struct dentry *dentry); 38 struct dentry *dentry);
40extern int cgroup_load_subsys(struct cgroup_subsys *ss);
41extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
42 39
43extern int proc_cgroup_show(struct seq_file *, void *); 40extern int proc_cgroup_show(struct seq_file *, void *);
44 41
45/* 42/* define the enumeration of all cgroup subsystems */
46 * Define the enumeration of all cgroup subsystems. 43#define SUBSYS(_x) _x ## _cgrp_id,
47 *
48 * We define ids for builtin subsystems and then modular ones.
49 */
50#define SUBSYS(_x) _x ## _subsys_id,
51enum cgroup_subsys_id { 44enum cgroup_subsys_id {
52#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
53#include <linux/cgroup_subsys.h>
54#undef IS_SUBSYS_ENABLED
55 CGROUP_BUILTIN_SUBSYS_COUNT,
56
57 __CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1,
58
59#define IS_SUBSYS_ENABLED(option) IS_MODULE(option)
60#include <linux/cgroup_subsys.h> 45#include <linux/cgroup_subsys.h>
61#undef IS_SUBSYS_ENABLED
62 CGROUP_SUBSYS_COUNT, 46 CGROUP_SUBSYS_COUNT,
63}; 47};
64#undef SUBSYS 48#undef SUBSYS
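
With modular controllers gone, the SUBSYS() x-macro now generates one <name>_cgrp_id per built-in controller directly from cgroup_subsys.h (compare the fs/bio.c hunk above, which switches to blkio_cgrp_id). Roughly what the expansion looks like, assuming cpuset, memory and blkio are enabled (illustrative subset only, order follows cgroup_subsys.h):

        enum cgroup_subsys_id {
                cpuset_cgrp_id,
                memory_cgrp_id,
                blkio_cgrp_id,          /* as used by the fs/bio.c hunk above */
                /* ...one id per enabled controller... */
                CGROUP_SUBSYS_COUNT,
        };
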
@@ -153,11 +137,6 @@ enum {
153 CGRP_SANE_BEHAVIOR, 137 CGRP_SANE_BEHAVIOR,
154}; 138};
155 139
156struct cgroup_name {
157 struct rcu_head rcu_head;
158 char name[];
159};
160
161struct cgroup { 140struct cgroup {
162 unsigned long flags; /* "unsigned long" so bitops work */ 141 unsigned long flags; /* "unsigned long" so bitops work */
163 142
@@ -174,16 +153,17 @@ struct cgroup {
174 /* the number of attached css's */ 153 /* the number of attached css's */
175 int nr_css; 154 int nr_css;
176 155
156 atomic_t refcnt;
157
177 /* 158 /*
178 * We link our 'sibling' struct into our parent's 'children'. 159 * We link our 'sibling' struct into our parent's 'children'.
179 * Our children link their 'sibling' into our 'children'. 160 * Our children link their 'sibling' into our 'children'.
180 */ 161 */
181 struct list_head sibling; /* my parent's children */ 162 struct list_head sibling; /* my parent's children */
182 struct list_head children; /* my children */ 163 struct list_head children; /* my children */
183 struct list_head files; /* my files */
184 164
185 struct cgroup *parent; /* my parent */ 165 struct cgroup *parent; /* my parent */
186 struct dentry *dentry; /* cgroup fs entry, RCU protected */ 166 struct kernfs_node *kn; /* cgroup kernfs entry */
187 167
188 /* 168 /*
189 * Monotonically increasing unique serial number which defines a 169 * Monotonically increasing unique serial number which defines a
@@ -193,23 +173,13 @@ struct cgroup {
193 */ 173 */
194 u64 serial_nr; 174 u64 serial_nr;
195 175
196 /* 176 /* The bitmask of subsystems attached to this cgroup */
197 * This is a copy of dentry->d_name, and it's needed because 177 unsigned long subsys_mask;
198 * we can't use dentry->d_name in cgroup_path().
199 *
200 * You must acquire rcu_read_lock() to access cgrp->name, and
201 * the only place that can change it is rename(), which is
202 * protected by parent dir's i_mutex.
203 *
204 * Normally you should use cgroup_name() wrapper rather than
205 * access it directly.
206 */
207 struct cgroup_name __rcu *name;
208 178
209 /* Private pointers for each registered subsystem */ 179 /* Private pointers for each registered subsystem */
210 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; 180 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
211 181
212 struct cgroupfs_root *root; 182 struct cgroup_root *root;
213 183
214 /* 184 /*
215 * List of cgrp_cset_links pointing at css_sets with tasks in this 185 * List of cgrp_cset_links pointing at css_sets with tasks in this
@@ -237,14 +207,11 @@ struct cgroup {
237 /* For css percpu_ref killing and RCU-protected deletion */ 207 /* For css percpu_ref killing and RCU-protected deletion */
238 struct rcu_head rcu_head; 208 struct rcu_head rcu_head;
239 struct work_struct destroy_work; 209 struct work_struct destroy_work;
240
241 /* directory xattrs */
242 struct simple_xattrs xattrs;
243}; 210};
244 211
245#define MAX_CGROUP_ROOT_NAMELEN 64 212#define MAX_CGROUP_ROOT_NAMELEN 64
246 213
247/* cgroupfs_root->flags */ 214/* cgroup_root->flags */
248enum { 215enum {
249 /* 216 /*
250 * Unfortunately, cgroup core and various controllers are riddled 217 * Unfortunately, cgroup core and various controllers are riddled
@@ -262,8 +229,8 @@ enum {
262 * 229 *
263 * The followings are the behaviors currently affected this flag. 230 * The followings are the behaviors currently affected this flag.
264 * 231 *
265 * - Mount options "noprefix" and "clone_children" are disallowed. 232 * - Mount options "noprefix", "xattr", "clone_children",
266 * Also, cgroupfs file cgroup.clone_children is not created. 233 * "release_agent" and "name" are disallowed.
267 * 234 *
268 * - When mounting an existing superblock, mount options should 235 * - When mounting an existing superblock, mount options should
269 * match. 236 * match.
@@ -281,6 +248,11 @@ enum {
281 * - "release_agent" and "notify_on_release" are removed. 248 * - "release_agent" and "notify_on_release" are removed.
282 * Replacement notification mechanism will be implemented. 249 * Replacement notification mechanism will be implemented.
283 * 250 *
251 * - "cgroup.clone_children" is removed.
252 *
253 * - If mount is requested with sane_behavior but without any
254 * subsystem, the default unified hierarchy is mounted.
255 *
284 * - cpuset: tasks will be kept in empty cpusets when hotplug happens 256 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
285 * and take masks of ancestors with non-empty cpus/mems, instead of 257 * and take masks of ancestors with non-empty cpus/mems, instead of
286 * being moved to an ancestor. 258 * being moved to an ancestor.
@@ -300,29 +272,24 @@ enum {
300 272
301 /* mount options live below bit 16 */ 273 /* mount options live below bit 16 */
302 CGRP_ROOT_OPTION_MASK = (1 << 16) - 1, 274 CGRP_ROOT_OPTION_MASK = (1 << 16) - 1,
303
304 CGRP_ROOT_SUBSYS_BOUND = (1 << 16), /* subsystems finished binding */
305}; 275};
306 276
307/* 277/*
308 * A cgroupfs_root represents the root of a cgroup hierarchy, and may be 278 * A cgroup_root represents the root of a cgroup hierarchy, and may be
309 * associated with a superblock to form an active hierarchy. This is 279 * associated with a kernfs_root to form an active hierarchy. This is
310 * internal to cgroup core. Don't access directly from controllers. 280 * internal to cgroup core. Don't access directly from controllers.
311 */ 281 */
312struct cgroupfs_root { 282struct cgroup_root {
313 struct super_block *sb; 283 struct kernfs_root *kf_root;
314
315 /* The bitmask of subsystems attached to this hierarchy */
316 unsigned long subsys_mask;
317 284
318 /* Unique id for this hierarchy. */ 285 /* Unique id for this hierarchy. */
319 int hierarchy_id; 286 int hierarchy_id;
320 287
321 /* The root cgroup for this hierarchy */ 288 /* The root cgroup. Root is destroyed on its release. */
322 struct cgroup top_cgroup; 289 struct cgroup cgrp;
323 290
324 /* Tracks how many cgroups are currently defined in hierarchy.*/ 291 /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
325 int number_of_cgroups; 292 atomic_t nr_cgrps;
326 293
327 /* A list running through the active hierarchies */ 294 /* A list running through the active hierarchies */
328 struct list_head root_list; 295 struct list_head root_list;
@@ -360,10 +327,14 @@ struct css_set {
360 struct hlist_node hlist; 327 struct hlist_node hlist;
361 328
362 /* 329 /*
363 * List running through all tasks using this cgroup 330 * Lists running through all tasks using this cgroup group.
364 * group. Protected by css_set_lock 331 * mg_tasks lists tasks which belong to this cset but are in the
332 * process of being migrated out or in. Protected by
333 * css_set_rwsem, but, during migration, once tasks are moved to
334 * mg_tasks, it can be read safely while holding cgroup_mutex.
365 */ 335 */
366 struct list_head tasks; 336 struct list_head tasks;
337 struct list_head mg_tasks;
367 338
368 /* 339 /*
369 * List of cgrp_cset_links pointing at cgroups referenced from this 340 * List of cgrp_cset_links pointing at cgroups referenced from this
@@ -372,13 +343,29 @@ struct css_set {
372 struct list_head cgrp_links; 343 struct list_head cgrp_links;
373 344
374 /* 345 /*
375 * Set of subsystem states, one for each subsystem. This array 346 * Set of subsystem states, one for each subsystem. This array is
376 * is immutable after creation apart from the init_css_set 347 * immutable after creation apart from the init_css_set during
377 * during subsystem registration (at boot time) and modular subsystem 348 * subsystem registration (at boot time).
378 * loading/unloading.
379 */ 349 */
380 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 350 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
381 351
352 /*
353 * List of csets participating in the on-going migration either as
354 * source or destination. Protected by cgroup_mutex.
355 */
356 struct list_head mg_preload_node;
357 struct list_head mg_node;
358
359 /*
360 * If this cset is acting as the source of migration the following
361 * two fields are set. mg_src_cgrp is the source cgroup of the
362 * on-going migration and mg_dst_cset is the destination cset the
363 * target tasks on this cset should be migrated to. Protected by
364 * cgroup_mutex.
365 */
366 struct cgroup *mg_src_cgrp;
367 struct css_set *mg_dst_cset;
368
382 /* For RCU-protected deletion */ 369 /* For RCU-protected deletion */
383 struct rcu_head rcu_head; 370 struct rcu_head rcu_head;
384}; 371};
@@ -397,6 +384,7 @@ enum {
397 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ 384 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
398 CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ 385 CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */
399 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ 386 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
387 CFTYPE_ONLY_ON_DFL = (1 << 4), /* only on default hierarchy */
400}; 388};
401 389
402#define MAX_CFTYPE_NAME 64 390#define MAX_CFTYPE_NAME 64
@@ -416,8 +404,9 @@ struct cftype {
416 umode_t mode; 404 umode_t mode;
417 405
418 /* 406 /*
419 * If non-zero, defines the maximum length of string that can 407 * The maximum length of string, excluding trailing nul, that can
420 * be passed to write_string; defaults to 64 408 * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is
409 * assumed.
421 */ 410 */
422 size_t max_write_len; 411 size_t max_write_len;
423 412
@@ -425,10 +414,12 @@ struct cftype {
425 unsigned int flags; 414 unsigned int flags;
426 415
427 /* 416 /*
428 * The subsys this file belongs to. Initialized automatically 417 * Fields used for internal bookkeeping. Initialized automatically
429 * during registration. NULL for cgroup core files. 418 * during registration.
430 */ 419 */
431 struct cgroup_subsys *ss; 420 struct cgroup_subsys *ss; /* NULL for cgroup core files */
421 struct list_head node; /* anchored at ss->cfts */
422 struct kernfs_ops *kf_ops;
432 423
433 /* 424 /*
434 * read_u64() is a shortcut for the common case of returning a 425 * read_u64() is a shortcut for the common case of returning a
@@ -467,7 +458,7 @@ struct cftype {
467 * Returns 0 or -ve error code. 458 * Returns 0 or -ve error code.
468 */ 459 */
469 int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft, 460 int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft,
470 const char *buffer); 461 char *buffer);
471 /* 462 /*
472 * trigger() callback can be used to get some kick from the 463 * trigger() callback can be used to get some kick from the
473 * userspace, when the actual string written is not important 464 * userspace, when the actual string written is not important
@@ -475,37 +466,18 @@ struct cftype {
475 * kick type for multiplexing. 466 * kick type for multiplexing.
476 */ 467 */
477 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); 468 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
478};
479 469
480/* 470#ifdef CONFIG_DEBUG_LOCK_ALLOC
481 * cftype_sets describe cftypes belonging to a subsystem and are chained at 471 struct lock_class_key lockdep_key;
482 * cgroup_subsys->cftsets. Each cftset points to an array of cftypes 472#endif
483 * terminated by zero length name.
484 */
485struct cftype_set {
486 struct list_head node; /* chained at subsys->cftsets */
487 struct cftype *cfts;
488}; 473};
489 474
490/* 475extern struct cgroup_root cgrp_dfl_root;
491 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't
492 * access directly.
493 */
494struct cfent {
495 struct list_head node;
496 struct dentry *dentry;
497 struct cftype *type;
498 struct cgroup_subsys_state *css;
499
500 /* file xattrs */
501 struct simple_xattrs xattrs;
502};
503 476
504/* seq_file->private points to the following, only ->priv is public */ 477static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
505struct cgroup_open_file { 478{
506 struct cfent *cfe; 479 return cgrp->root == &cgrp_dfl_root;
507 void *priv; 480}
508};
509 481
510/* 482/*
511 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This 483 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
@@ -516,34 +488,63 @@ static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
516 return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR; 488 return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
517} 489}
518 490
519/* Caller should hold rcu_read_lock() */ 491/* no synchronization, the result can only be used as a hint */
520static inline const char *cgroup_name(const struct cgroup *cgrp) 492static inline bool cgroup_has_tasks(struct cgroup *cgrp)
521{ 493{
522 return rcu_dereference(cgrp->name)->name; 494 return !list_empty(&cgrp->cset_links);
523} 495}
524 496
525static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) 497/* returns ino associated with a cgroup, 0 indicates unmounted root */
498static inline ino_t cgroup_ino(struct cgroup *cgrp)
526{ 499{
527 struct cgroup_open_file *of = seq->private; 500 if (cgrp->kn)
528 return of->cfe->css; 501 return cgrp->kn->ino;
502 else
503 return 0;
529} 504}
530 505
531static inline struct cftype *seq_cft(struct seq_file *seq) 506static inline struct cftype *seq_cft(struct seq_file *seq)
532{ 507{
533 struct cgroup_open_file *of = seq->private; 508 struct kernfs_open_file *of = seq->private;
534 return of->cfe->type; 509
510 return of->kn->priv;
511}
512
513struct cgroup_subsys_state *seq_css(struct seq_file *seq);
514
515/*
516 * Name / path handling functions. All are thin wrappers around the kernfs
517 * counterparts and can be called under any context.
518 */
519
520static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
521{
522 return kernfs_name(cgrp->kn, buf, buflen);
535} 523}
536 524
525static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
526 size_t buflen)
527{
528 return kernfs_path(cgrp->kn, buf, buflen);
529}
530
531static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
532{
533 pr_cont_kernfs_name(cgrp->kn);
534}
535
536static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
537{
538 pr_cont_kernfs_path(cgrp->kn);
539}
540
541char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
542
537int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 543int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
538int cgroup_rm_cftypes(struct cftype *cfts); 544int cgroup_rm_cftypes(struct cftype *cfts);
539 545
540bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); 546bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
541 547
542int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
543int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
544
545int cgroup_task_count(const struct cgroup *cgrp);
546
547/* 548/*
548 * Control Group taskset, used to pass around set of tasks to cgroup_subsys 549 * Control Group taskset, used to pass around set of tasks to cgroup_subsys
549 * methods. 550 * methods.
@@ -551,22 +552,15 @@ int cgroup_task_count(const struct cgroup *cgrp);
551struct cgroup_taskset; 552struct cgroup_taskset;
552struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); 553struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
553struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); 554struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
554struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
555 int subsys_id);
556int cgroup_taskset_size(struct cgroup_taskset *tset);
557 555
558/** 556/**
559 * cgroup_taskset_for_each - iterate cgroup_taskset 557 * cgroup_taskset_for_each - iterate cgroup_taskset
560 * @task: the loop cursor 558 * @task: the loop cursor
561 * @skip_css: skip if task's css matches this, %NULL to iterate through all
562 * @tset: taskset to iterate 559 * @tset: taskset to iterate
563 */ 560 */
564#define cgroup_taskset_for_each(task, skip_css, tset) \ 561#define cgroup_taskset_for_each(task, tset) \
565 for ((task) = cgroup_taskset_first((tset)); (task); \ 562 for ((task) = cgroup_taskset_first((tset)); (task); \
566 (task) = cgroup_taskset_next((tset))) \ 563 (task) = cgroup_taskset_next((tset)))
567 if (!(skip_css) || \
568 cgroup_taskset_cur_css((tset), \
569 (skip_css)->ss->subsys_id) != (skip_css))
570 564
571/* 565/*
572 * Control Group subsystem type. 566 * Control Group subsystem type.
@@ -591,7 +585,6 @@ struct cgroup_subsys {
591 struct task_struct *task); 585 struct task_struct *task);
592 void (*bind)(struct cgroup_subsys_state *root_css); 586 void (*bind)(struct cgroup_subsys_state *root_css);
593 587
594 int subsys_id;
595 int disabled; 588 int disabled;
596 int early_init; 589 int early_init;
597 590
@@ -610,27 +603,26 @@ struct cgroup_subsys {
610 bool broken_hierarchy; 603 bool broken_hierarchy;
611 bool warned_broken_hierarchy; 604 bool warned_broken_hierarchy;
612 605
606 /* the following two fields are initialized automtically during boot */
607 int id;
613#define MAX_CGROUP_TYPE_NAMELEN 32 608#define MAX_CGROUP_TYPE_NAMELEN 32
614 const char *name; 609 const char *name;
615 610
616 /* link to parent, protected by cgroup_lock() */ 611 /* link to parent, protected by cgroup_lock() */
617 struct cgroupfs_root *root; 612 struct cgroup_root *root;
618 613
619 /* list of cftype_sets */ 614 /*
620 struct list_head cftsets; 615 * List of cftypes. Each entry is the first entry of an array
616 * terminated by zero length name.
617 */
618 struct list_head cfts;
621 619
622 /* base cftypes, automatically [de]registered with subsys itself */ 620 /* base cftypes, automatically registered with subsys itself */
623 struct cftype *base_cftypes; 621 struct cftype *base_cftypes;
624 struct cftype_set base_cftset;
625
626 /* should be defined only by modular subsystems */
627 struct module *module;
628}; 622};
629 623
630#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; 624#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
631#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
632#include <linux/cgroup_subsys.h> 625#include <linux/cgroup_subsys.h>
633#undef IS_SUBSYS_ENABLED
634#undef SUBSYS 626#undef SUBSYS
635 627
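
At the bottom of cgroup.h the SUBSYS() macro now emits extern declarations named <controller>_cgrp_subsys, so every controller definition has to be renamed to match. Roughly what the generated block looks like for a couple of built-in controllers (illustrative subset):

        /* Expansion of the SUBSYS() declaration block above, assuming the
         * freezer and blkio controllers are built in: */
        extern struct cgroup_subsys freezer_cgrp_subsys;
        extern struct cgroup_subsys blkio_cgrp_subsys;
        /* ...one declaration per SUBSYS() entry in cgroup_subsys.h; each
         * controller defines the matching <name>_cgrp_subsys symbol. */
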
636/** 628/**
@@ -661,10 +653,12 @@ struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css)
661 */ 653 */
662#ifdef CONFIG_PROVE_RCU 654#ifdef CONFIG_PROVE_RCU
663extern struct mutex cgroup_mutex; 655extern struct mutex cgroup_mutex;
656extern struct rw_semaphore css_set_rwsem;
664#define task_css_set_check(task, __c) \ 657#define task_css_set_check(task, __c) \
665 rcu_dereference_check((task)->cgroups, \ 658 rcu_dereference_check((task)->cgroups, \
666 lockdep_is_held(&(task)->alloc_lock) || \ 659 lockdep_is_held(&cgroup_mutex) || \
667 lockdep_is_held(&cgroup_mutex) || (__c)) 660 lockdep_is_held(&css_set_rwsem) || \
661 ((task)->flags & PF_EXITING) || (__c))
668#else 662#else
669#define task_css_set_check(task, __c) \ 663#define task_css_set_check(task, __c) \
670 rcu_dereference((task)->cgroups) 664 rcu_dereference((task)->cgroups)
@@ -837,16 +831,11 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
837struct task_struct *css_task_iter_next(struct css_task_iter *it); 831struct task_struct *css_task_iter_next(struct css_task_iter *it);
838void css_task_iter_end(struct css_task_iter *it); 832void css_task_iter_end(struct css_task_iter *it);
839 833
840int css_scan_tasks(struct cgroup_subsys_state *css,
841 bool (*test)(struct task_struct *, void *),
842 void (*process)(struct task_struct *, void *),
843 void *data, struct ptr_heap *heap);
844
845int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 834int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
846int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); 835int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
847 836
848struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 837struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
849 struct cgroup_subsys *ss); 838 struct cgroup_subsys *ss);
850 839
851#else /* !CONFIG_CGROUPS */ 840#else /* !CONFIG_CGROUPS */
852 841
@@ -854,7 +843,7 @@ static inline int cgroup_init_early(void) { return 0; }
854static inline int cgroup_init(void) { return 0; } 843static inline int cgroup_init(void) { return 0; }
855static inline void cgroup_fork(struct task_struct *p) {} 844static inline void cgroup_fork(struct task_struct *p) {}
856static inline void cgroup_post_fork(struct task_struct *p) {} 845static inline void cgroup_post_fork(struct task_struct *p) {}
857static inline void cgroup_exit(struct task_struct *p, int callbacks) {} 846static inline void cgroup_exit(struct task_struct *p) {}
858 847
859static inline int cgroupstats_build(struct cgroupstats *stats, 848static inline int cgroupstats_build(struct cgroupstats *stats,
860 struct dentry *dentry) 849 struct dentry *dentry)
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 7b99d717411d..768fe44e19f0 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -3,51 +3,51 @@
3 * 3 *
4 * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. 4 * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
5 */ 5 */
6#if IS_SUBSYS_ENABLED(CONFIG_CPUSETS) 6#if IS_ENABLED(CONFIG_CPUSETS)
7SUBSYS(cpuset) 7SUBSYS(cpuset)
8#endif 8#endif
9 9
10#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEBUG) 10#if IS_ENABLED(CONFIG_CGROUP_DEBUG)
11SUBSYS(debug) 11SUBSYS(debug)
12#endif 12#endif
13 13
14#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_SCHED) 14#if IS_ENABLED(CONFIG_CGROUP_SCHED)
15SUBSYS(cpu_cgroup) 15SUBSYS(cpu)
16#endif 16#endif
17 17
18#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_CPUACCT) 18#if IS_ENABLED(CONFIG_CGROUP_CPUACCT)
19SUBSYS(cpuacct) 19SUBSYS(cpuacct)
20#endif 20#endif
21 21
22#if IS_SUBSYS_ENABLED(CONFIG_MEMCG) 22#if IS_ENABLED(CONFIG_MEMCG)
23SUBSYS(mem_cgroup) 23SUBSYS(memory)
24#endif 24#endif
25 25
26#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE) 26#if IS_ENABLED(CONFIG_CGROUP_DEVICE)
27SUBSYS(devices) 27SUBSYS(devices)
28#endif 28#endif
29 29
30#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_FREEZER) 30#if IS_ENABLED(CONFIG_CGROUP_FREEZER)
31SUBSYS(freezer) 31SUBSYS(freezer)
32#endif 32#endif
33 33
34#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_CLASSID) 34#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
35SUBSYS(net_cls) 35SUBSYS(net_cls)
36#endif 36#endif
37 37
38#if IS_SUBSYS_ENABLED(CONFIG_BLK_CGROUP) 38#if IS_ENABLED(CONFIG_BLK_CGROUP)
39SUBSYS(blkio) 39SUBSYS(blkio)
40#endif 40#endif
41 41
42#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF) 42#if IS_ENABLED(CONFIG_CGROUP_PERF)
43SUBSYS(perf) 43SUBSYS(perf_event)
44#endif 44#endif
45 45
46#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_PRIO) 46#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
47SUBSYS(net_prio) 47SUBSYS(net_prio)
48#endif 48#endif
49 49
50#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_HUGETLB) 50#if IS_ENABLED(CONFIG_CGROUP_HUGETLB)
51SUBSYS(hugetlb) 51SUBSYS(hugetlb)
52#endif 52#endif
53/* 53/*
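cgroup_subsys.h remains an X-macro list: each consumer defines SUBSYS() and includes the header, and this series only renames the generated identifiers from _x##_subsys / _x##_subsys_id to _x##_cgrp_subsys / _x##_cgrp_id. A small sketch of the expansion pattern, mirroring what kernel/cgroup.c does later in this diff; the example_* names are illustrative:

#include <linux/cgroup.h>	/* declares the *_cgrp_subsys objects and *_cgrp_id IDs */

/* one string per enabled subsystem: "cpuset", "cpu", "memory", ... */
#define SUBSYS(_x) #_x,
static const char * const example_subsys_names[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* pointer table keyed by the subsystem IDs generated the same way in cgroup.h */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
static struct cgroup_subsys *example_subsys_table[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS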
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f4233b195dab..9212b017bc72 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -92,6 +92,7 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip,
92 * STUB - The ftrace_ops is just a place holder. 92 * STUB - The ftrace_ops is just a place holder.
93 * INITIALIZED - The ftrace_ops has already been initialized (first use time 93 * INITIALIZED - The ftrace_ops has already been initialized (first use time
94 * register_ftrace_function() is called, it will initialize the ops) 94 * register_ftrace_function() is called, it will initialize the ops)
95 * DELETED - The ops are being deleted, do not let them be registered again.
95 */ 96 */
96enum { 97enum {
97 FTRACE_OPS_FL_ENABLED = 1 << 0, 98 FTRACE_OPS_FL_ENABLED = 1 << 0,
@@ -103,13 +104,26 @@ enum {
103 FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6, 104 FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6,
104 FTRACE_OPS_FL_STUB = 1 << 7, 105 FTRACE_OPS_FL_STUB = 1 << 7,
105 FTRACE_OPS_FL_INITIALIZED = 1 << 8, 106 FTRACE_OPS_FL_INITIALIZED = 1 << 8,
107 FTRACE_OPS_FL_DELETED = 1 << 9,
106}; 108};
107 109
110/*
111 * Note, ftrace_ops can be referenced outside of RCU protection.
112 * (Although, for perf, the control ops prevent that). If ftrace_ops is
113 * allocated and not part of kernel core data, the unregistering of it will
114 * perform a scheduling on all CPUs to make sure that there are no more users.
115 * Depending on the load of the system that may take a bit of time.
116 *
117 * Any private data added must also take care not to be freed and if private
118 * data is added to a ftrace_ops that is in core code, the user of the
119 * ftrace_ops must perform a schedule_on_each_cpu() before freeing it.
120 */
108struct ftrace_ops { 121struct ftrace_ops {
109 ftrace_func_t func; 122 ftrace_func_t func;
110 struct ftrace_ops *next; 123 struct ftrace_ops *next;
111 unsigned long flags; 124 unsigned long flags;
112 int __percpu *disabled; 125 int __percpu *disabled;
126 void *private;
113#ifdef CONFIG_DYNAMIC_FTRACE 127#ifdef CONFIG_DYNAMIC_FTRACE
114 struct ftrace_hash *notrace_hash; 128 struct ftrace_hash *notrace_hash;
115 struct ftrace_hash *filter_hash; 129 struct ftrace_hash *filter_hash;
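A minimal sketch of how the new ->private field might be used together with the lifetime rule spelled out in the comment above. register_ftrace_function(), unregister_ftrace_function(), schedule_on_each_cpu() and FTRACE_OPS_FL_RECURSION_SAFE are existing interfaces; everything named my_* is hypothetical:

#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_trace_data {
	unsigned long hits;
};

static void my_callback(unsigned long ip, unsigned long parent_ip,
			struct ftrace_ops *op, struct pt_regs *regs)
{
	struct my_trace_data *data = op->private;

	data->hits++;		/* racy, good enough for a sketch */
}

static struct ftrace_ops my_ops = {
	.func	= my_callback,
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE,
};

static int my_setup(void)
{
	my_ops.private = kzalloc(sizeof(struct my_trace_data), GFP_KERNEL);
	if (!my_ops.private)
		return -ENOMEM;
	return register_ftrace_function(&my_ops);
}

static void my_sync(struct work_struct *work) { }	/* barrier only */

static void my_teardown(void)
{
	unregister_ftrace_function(&my_ops);
	/* my_ops is core (static) data, so wait for all CPUs before freeing ->private */
	schedule_on_each_cpu(my_sync);
	kfree(my_ops.private);
}

my_setup()/my_teardown() would be wired up from module or boot code, not shown here.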
@@ -285,7 +299,7 @@ extern void
285unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); 299unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops);
286extern void unregister_ftrace_function_probe_all(char *glob); 300extern void unregister_ftrace_function_probe_all(char *glob);
287 301
288extern int ftrace_text_reserved(void *start, void *end); 302extern int ftrace_text_reserved(const void *start, const void *end);
289 303
290extern int ftrace_nr_registered_ops(void); 304extern int ftrace_nr_registered_ops(void);
291 305
@@ -316,12 +330,9 @@ enum {
316#define FTRACE_REF_MAX ((1UL << 29) - 1) 330#define FTRACE_REF_MAX ((1UL << 29) - 1)
317 331
318struct dyn_ftrace { 332struct dyn_ftrace {
319 union { 333 unsigned long ip; /* address of mcount call-site */
320 unsigned long ip; /* address of mcount call-site */
321 struct dyn_ftrace *freelist;
322 };
323 unsigned long flags; 334 unsigned long flags;
324 struct dyn_arch_ftrace arch; 335 struct dyn_arch_ftrace arch;
325}; 336};
326 337
327int ftrace_force_update(void); 338int ftrace_force_update(void);
@@ -409,7 +420,7 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable);
409 420
410/* defined in arch */ 421/* defined in arch */
411extern int ftrace_ip_converted(unsigned long ip); 422extern int ftrace_ip_converted(unsigned long ip);
412extern int ftrace_dyn_arch_init(void *data); 423extern int ftrace_dyn_arch_init(void);
413extern void ftrace_replace_code(int enable); 424extern void ftrace_replace_code(int enable);
414extern int ftrace_update_ftrace_func(ftrace_func_t func); 425extern int ftrace_update_ftrace_func(ftrace_func_t func);
415extern void ftrace_caller(void); 426extern void ftrace_caller(void);
@@ -541,7 +552,7 @@ static inline __init int unregister_ftrace_command(char *cmd_name)
541{ 552{
542 return -EINVAL; 553 return -EINVAL;
543} 554}
544static inline int ftrace_text_reserved(void *start, void *end) 555static inline int ftrace_text_reserved(const void *start, const void *end)
545{ 556{
546 return 0; 557 return 0;
547} 558}
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 4cdb3a17bcb5..cdc30111d2f8 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -163,6 +163,8 @@ void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
163 163
164void tracing_record_cmdline(struct task_struct *tsk); 164void tracing_record_cmdline(struct task_struct *tsk);
165 165
166int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...);
167
166struct event_filter; 168struct event_filter;
167 169
168enum trace_reg { 170enum trace_reg {
@@ -197,6 +199,32 @@ struct ftrace_event_class {
197extern int ftrace_event_reg(struct ftrace_event_call *event, 199extern int ftrace_event_reg(struct ftrace_event_call *event,
198 enum trace_reg type, void *data); 200 enum trace_reg type, void *data);
199 201
202int ftrace_output_event(struct trace_iterator *iter, struct ftrace_event_call *event,
203 char *fmt, ...);
204
205int ftrace_event_define_field(struct ftrace_event_call *call,
206 char *type, int len, char *item, int offset,
207 int field_size, int sign, int filter);
208
209struct ftrace_event_buffer {
210 struct ring_buffer *buffer;
211 struct ring_buffer_event *event;
212 struct ftrace_event_file *ftrace_file;
213 void *entry;
214 unsigned long flags;
215 int pc;
216};
217
218void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
219 struct ftrace_event_file *ftrace_file,
220 unsigned long len);
221
222void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer);
223
224int ftrace_event_define_field(struct ftrace_event_call *call,
225 char *type, int len, char *item, int offset,
226 int field_size, int sign, int filter);
227
200enum { 228enum {
201 TRACE_EVENT_FL_FILTERED_BIT, 229 TRACE_EVENT_FL_FILTERED_BIT,
202 TRACE_EVENT_FL_CAP_ANY_BIT, 230 TRACE_EVENT_FL_CAP_ANY_BIT,
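The ftrace_event_buffer declarations above bundle the reserve/commit bookkeeping (flags, pc, ring buffer, event) that each generated probe previously carried by hand; the include/trace/ftrace.h hunk further down converts the generated probes to exactly this pattern. A condensed, hand-written sketch of the same flow, with hypothetical my_entry/my_probe names:

struct my_entry {			/* hypothetical on-buffer record layout */
	struct trace_entry	ent;
	unsigned long		value;
};

static void my_probe(struct ftrace_event_file *ftrace_file, unsigned long value)
{
	struct ftrace_event_buffer fbuffer;
	struct my_entry *entry;

	entry = ftrace_event_buffer_reserve(&fbuffer, ftrace_file, sizeof(*entry));
	if (!entry)			/* soft-disabled, filtered, or buffer full */
		return;

	entry->value = value;

	ftrace_event_buffer_commit(&fbuffer);
}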
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 787bba3bf552..0129f89cf98d 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -49,7 +49,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
49 49
50static inline bool hugetlb_cgroup_disabled(void) 50static inline bool hugetlb_cgroup_disabled(void)
51{ 51{
52 if (hugetlb_subsys.disabled) 52 if (hugetlb_cgrp_subsys.disabled)
53 return true; 53 return true;
54 return false; 54 return false;
55} 55}
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index abd0113b6620..eccfb4a4b379 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -162,7 +162,7 @@ extern int do_swap_account;
162 162
163static inline bool mem_cgroup_disabled(void) 163static inline bool mem_cgroup_disabled(void)
164{ 164{
165 if (mem_cgroup_subsys.disabled) 165 if (memory_cgrp_subsys.disabled)
166 return true; 166 return true;
167 return false; 167 return false;
168} 168}
diff --git a/include/linux/module.h b/include/linux/module.h
index eaf60ff9ba94..5a5053975114 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -15,7 +15,7 @@
15#include <linux/stringify.h> 15#include <linux/stringify.h>
16#include <linux/kobject.h> 16#include <linux/kobject.h>
17#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
18#include <linux/tracepoint.h> 18#include <linux/jump_label.h>
19#include <linux/export.h> 19#include <linux/export.h>
20 20
21#include <linux/percpu.h> 21#include <linux/percpu.h>
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 7159a0a933df..812b2553dfd8 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -48,12 +48,6 @@ extern int tracepoint_probe_register(const char *name, void *probe, void *data);
48extern int 48extern int
49tracepoint_probe_unregister(const char *name, void *probe, void *data); 49tracepoint_probe_unregister(const char *name, void *probe, void *data);
50 50
51extern int tracepoint_probe_register_noupdate(const char *name, void *probe,
52 void *data);
53extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
54 void *data);
55extern void tracepoint_probe_update_all(void);
56
57#ifdef CONFIG_MODULES 51#ifdef CONFIG_MODULES
58struct tp_module { 52struct tp_module {
59 struct list_head list; 53 struct list_head list;
@@ -68,18 +62,6 @@ static inline bool trace_module_has_bad_taint(struct module *mod)
68} 62}
69#endif /* CONFIG_MODULES */ 63#endif /* CONFIG_MODULES */
70 64
71struct tracepoint_iter {
72#ifdef CONFIG_MODULES
73 struct tp_module *module;
74#endif /* CONFIG_MODULES */
75 struct tracepoint * const *tracepoint;
76};
77
78extern void tracepoint_iter_start(struct tracepoint_iter *iter);
79extern void tracepoint_iter_next(struct tracepoint_iter *iter);
80extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
81extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
82
83/* 65/*
84 * tracepoint_synchronize_unregister must be called between the last tracepoint 66 * tracepoint_synchronize_unregister must be called between the last tracepoint
85 * probe unregistration and the end of module exit to make sure there is no 67 * probe unregistration and the end of module exit to make sure there is no
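With the _noupdate variants and the iterator gone, the by-name register/unregister pair plus tracepoint_synchronize_unregister() is all a module needs. A minimal sketch of the teardown ordering the surrounding comment requires; the tracepoint name and probe are hypothetical:

#include <linux/module.h>
#include <linux/tracepoint.h>

/* probe for a hypothetical tracepoint "my_subsys_event" with TP_PROTO(int val) */
static void my_probe(void *data, int val)
{
	trace_printk("my_subsys_event: %d\n", val);
}

static int __init my_init(void)
{
	return tracepoint_probe_register("my_subsys_event", (void *)my_probe, NULL);
}

static void __exit my_exit(void)
{
	tracepoint_probe_unregister("my_subsys_event", (void *)my_probe, NULL);
	/* no probe may still be running once this module's text is freed */
	tracepoint_synchronize_unregister();
}
module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");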
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 24579a0312a0..81022a52bc34 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -96,5 +96,7 @@ extern void vfio_unregister_iommu_driver(
96extern struct vfio_group *vfio_group_get_external_user(struct file *filep); 96extern struct vfio_group *vfio_group_get_external_user(struct file *filep);
97extern void vfio_group_put_external_user(struct vfio_group *group); 97extern void vfio_group_put_external_user(struct vfio_group *group);
98extern int vfio_external_user_iommu_id(struct vfio_group *group); 98extern int vfio_external_user_iommu_id(struct vfio_group *group);
99extern long vfio_external_check_extension(struct vfio_group *group,
100 unsigned long arg);
99 101
100#endif /* VFIO_H */ 102#endif /* VFIO_H */
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index 9cf2d5ef38d9..c15d39456e14 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -34,7 +34,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
34 return 0; 34 return 0;
35 35
36 rcu_read_lock(); 36 rcu_read_lock();
37 classid = container_of(task_css(p, net_cls_subsys_id), 37 classid = container_of(task_css(p, net_cls_cgrp_id),
38 struct cgroup_cls_state, css)->classid; 38 struct cgroup_cls_state, css)->classid;
39 rcu_read_unlock(); 39 rcu_read_unlock();
40 40
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index dafc09f0fdbc..f2a9597ff53c 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -27,32 +27,17 @@ struct netprio_map {
27 27
28void sock_update_netprioidx(struct sock *sk); 28void sock_update_netprioidx(struct sock *sk);
29 29
30#if IS_BUILTIN(CONFIG_CGROUP_NET_PRIO)
31static inline u32 task_netprioidx(struct task_struct *p) 30static inline u32 task_netprioidx(struct task_struct *p)
32{ 31{
33 struct cgroup_subsys_state *css; 32 struct cgroup_subsys_state *css;
34 u32 idx; 33 u32 idx;
35 34
36 rcu_read_lock(); 35 rcu_read_lock();
37 css = task_css(p, net_prio_subsys_id); 36 css = task_css(p, net_prio_cgrp_id);
38 idx = css->cgroup->id; 37 idx = css->cgroup->id;
39 rcu_read_unlock(); 38 rcu_read_unlock();
40 return idx; 39 return idx;
41} 40}
42#elif IS_MODULE(CONFIG_CGROUP_NET_PRIO)
43static inline u32 task_netprioidx(struct task_struct *p)
44{
45 struct cgroup_subsys_state *css;
46 u32 idx = 0;
47
48 rcu_read_lock();
49 css = task_css(p, net_prio_subsys_id);
50 if (css)
51 idx = css->cgroup->id;
52 rcu_read_unlock();
53 return idx;
54}
55#endif
56#else /* !CONFIG_CGROUP_NET_PRIO */ 41#else /* !CONFIG_CGROUP_NET_PRIO */
57static inline u32 task_netprioidx(struct task_struct *p) 42static inline u32 task_netprioidx(struct task_struct *p)
58{ 43{
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 3075ffbb9a83..4e4f2f8b1ac2 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -4,6 +4,8 @@
4#if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ) 4#if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_MIGRATE_H 5#define _TRACE_MIGRATE_H
6 6
7#include <linux/tracepoint.h>
8
7#define MIGRATE_MODE \ 9#define MIGRATE_MODE \
8 {MIGRATE_ASYNC, "MIGRATE_ASYNC"}, \ 10 {MIGRATE_ASYNC, "MIGRATE_ASYNC"}, \
9 {MIGRATE_SYNC_LIGHT, "MIGRATE_SYNC_LIGHT"}, \ 11 {MIGRATE_SYNC_LIGHT, "MIGRATE_SYNC_LIGHT"}, \
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 464ea82e10db..cee02d65ab3f 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -4,6 +4,7 @@
4#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) 4#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_WRITEBACK_H 5#define _TRACE_WRITEBACK_H
6 6
7#include <linux/tracepoint.h>
7#include <linux/backing-dev.h> 8#include <linux/backing-dev.h>
8#include <linux/writeback.h> 9#include <linux/writeback.h>
9 10
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 1ee19a24cc5f..8765126b328c 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -265,11 +265,9 @@ static notrace enum print_line_t \
265ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ 265ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \
266 struct trace_event *event) \ 266 struct trace_event *event) \
267{ \ 267{ \
268 struct trace_seq *s = &iter->seq; \
269 struct ftrace_raw_##template *field; \ 268 struct ftrace_raw_##template *field; \
270 struct trace_entry *entry; \ 269 struct trace_entry *entry; \
271 struct trace_seq *p = &iter->tmp_seq; \ 270 struct trace_seq *p = &iter->tmp_seq; \
272 int ret; \
273 \ 271 \
274 entry = iter->ent; \ 272 entry = iter->ent; \
275 \ 273 \
@@ -281,13 +279,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \
281 field = (typeof(field))entry; \ 279 field = (typeof(field))entry; \
282 \ 280 \
283 trace_seq_init(p); \ 281 trace_seq_init(p); \
284 ret = trace_seq_printf(s, "%s: ", #call); \ 282 return ftrace_output_call(iter, #call, print); \
285 if (ret) \
286 ret = trace_seq_printf(s, print); \
287 if (!ret) \
288 return TRACE_TYPE_PARTIAL_LINE; \
289 \
290 return TRACE_TYPE_HANDLED; \
291} \ 283} \
292static struct trace_event_functions ftrace_event_type_funcs_##call = { \ 284static struct trace_event_functions ftrace_event_type_funcs_##call = { \
293 .trace = ftrace_raw_output_##call, \ 285 .trace = ftrace_raw_output_##call, \
@@ -370,10 +362,11 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
370 362
371#undef __dynamic_array 363#undef __dynamic_array
372#define __dynamic_array(type, item, len) \ 364#define __dynamic_array(type, item, len) \
365 __item_length = (len) * sizeof(type); \
373 __data_offsets->item = __data_size + \ 366 __data_offsets->item = __data_size + \
374 offsetof(typeof(*entry), __data); \ 367 offsetof(typeof(*entry), __data); \
375 __data_offsets->item |= (len * sizeof(type)) << 16; \ 368 __data_offsets->item |= __item_length << 16; \
376 __data_size += (len) * sizeof(type); 369 __data_size += __item_length;
377 370
378#undef __string 371#undef __string
379#define __string(item, src) __dynamic_array(char, item, \ 372#define __string(item, src) __dynamic_array(char, item, \
@@ -385,6 +378,7 @@ static inline notrace int ftrace_get_offsets_##call( \
385 struct ftrace_data_offsets_##call *__data_offsets, proto) \ 378 struct ftrace_data_offsets_##call *__data_offsets, proto) \
386{ \ 379{ \
387 int __data_size = 0; \ 380 int __data_size = 0; \
381 int __maybe_unused __item_length; \
388 struct ftrace_raw_##call __maybe_unused *entry; \ 382 struct ftrace_raw_##call __maybe_unused *entry; \
389 \ 383 \
390 tstruct; \ 384 tstruct; \
@@ -541,37 +535,27 @@ static notrace void \
541ftrace_raw_event_##call(void *__data, proto) \ 535ftrace_raw_event_##call(void *__data, proto) \
542{ \ 536{ \
543 struct ftrace_event_file *ftrace_file = __data; \ 537 struct ftrace_event_file *ftrace_file = __data; \
544 struct ftrace_event_call *event_call = ftrace_file->event_call; \
545 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ 538 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
546 struct ring_buffer_event *event; \ 539 struct ftrace_event_buffer fbuffer; \
547 struct ftrace_raw_##call *entry; \ 540 struct ftrace_raw_##call *entry; \
548 struct ring_buffer *buffer; \
549 unsigned long irq_flags; \
550 int __data_size; \ 541 int __data_size; \
551 int pc; \
552 \ 542 \
553 if (ftrace_trigger_soft_disabled(ftrace_file)) \ 543 if (ftrace_trigger_soft_disabled(ftrace_file)) \
554 return; \ 544 return; \
555 \ 545 \
556 local_save_flags(irq_flags); \
557 pc = preempt_count(); \
558 \
559 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ 546 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
560 \ 547 \
561 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, \ 548 entry = ftrace_event_buffer_reserve(&fbuffer, ftrace_file, \
562 event_call->event.type, \ 549 sizeof(*entry) + __data_size); \
563 sizeof(*entry) + __data_size, \ 550 \
564 irq_flags, pc); \ 551 if (!entry) \
565 if (!event) \
566 return; \ 552 return; \
567 entry = ring_buffer_event_data(event); \
568 \ 553 \
569 tstruct \ 554 tstruct \
570 \ 555 \
571 { assign; } \ 556 { assign; } \
572 \ 557 \
573 event_trigger_unlock_commit(ftrace_file, buffer, event, entry, \ 558 ftrace_event_buffer_commit(&fbuffer); \
574 irq_flags, pc); \
575} 559}
576/* 560/*
577 * The ftrace_test_probe is compiled out, it is only here as a build time check 561 * The ftrace_test_probe is compiled out, it is only here as a build time check
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 0fd47f5bc146..cb9023d4f063 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -23,6 +23,12 @@
23 23
24#define VFIO_TYPE1_IOMMU 1 24#define VFIO_TYPE1_IOMMU 1
25#define VFIO_SPAPR_TCE_IOMMU 2 25#define VFIO_SPAPR_TCE_IOMMU 2
26#define VFIO_TYPE1v2_IOMMU 3
27/*
28 * IOMMU enforces DMA cache coherence (ex. PCIe NoSnoop stripping). This
29 * capability is subject to change as groups are added or removed.
30 */
31#define VFIO_DMA_CC_IOMMU 4
26 32
27/* 33/*
28 * The IOCTL interface is designed for extensibility by embedding the 34 * The IOCTL interface is designed for extensibility by embedding the
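The two new extension IDs are discoverable from userspace with the existing VFIO_CHECK_EXTENSION ioctl on a container fd; a minimal userspace sketch with error handling trimmed:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);

	if (container < 0)
		return 1;

	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
		printf("type1 v2 IOMMU backend available\n");
	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_DMA_CC_IOMMU))
		printf("IOMMU enforces DMA cache coherence\n");
	return 0;
}

As the header comment notes, the VFIO_DMA_CC_IOMMU answer can change as groups are added to or removed from the container.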
diff --git a/include/xen/events.h b/include/xen/events.h
index c9c85cf84895..8bee7a75e850 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -2,6 +2,9 @@
2#define _XEN_EVENTS_H 2#define _XEN_EVENTS_H
3 3
4#include <linux/interrupt.h> 4#include <linux/interrupt.h>
5#ifdef CONFIG_PCI_MSI
6#include <linux/msi.h>
7#endif
5 8
6#include <xen/interface/event_channel.h> 9#include <xen/interface/event_channel.h>
7#include <asm/xen/hypercall.h> 10#include <asm/xen/hypercall.h>
@@ -52,7 +55,6 @@ int evtchn_get(unsigned int evtchn);
52void evtchn_put(unsigned int evtchn); 55void evtchn_put(unsigned int evtchn);
53 56
54void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector); 57void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector);
55int resend_irq_on_evtchn(unsigned int irq);
56void rebind_evtchn_irq(int evtchn, int irq); 58void rebind_evtchn_irq(int evtchn, int irq);
57 59
58static inline void notify_remote_via_evtchn(int port) 60static inline void notify_remote_via_evtchn(int port)
@@ -102,7 +104,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi,
102int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc); 104int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc);
103/* Bind an MSI pirq to an irq. */ 105/* Bind an MSI pirq to an irq. */
104int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, 106int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
105 int pirq, const char *name, domid_t domid); 107 int pirq, int nvec, const char *name, domid_t domid);
106#endif 108#endif
107 109
108/* De-allocates the above mentioned physical interrupt. */ 110/* De-allocates the above mentioned physical interrupt. */
diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
index 42721d13a106..610dba9b620a 100644
--- a/include/xen/interface/physdev.h
+++ b/include/xen/interface/physdev.h
@@ -131,6 +131,7 @@ struct physdev_irq {
131#define MAP_PIRQ_TYPE_GSI 0x1 131#define MAP_PIRQ_TYPE_GSI 0x1
132#define MAP_PIRQ_TYPE_UNKNOWN 0x2 132#define MAP_PIRQ_TYPE_UNKNOWN 0x2
133#define MAP_PIRQ_TYPE_MSI_SEG 0x3 133#define MAP_PIRQ_TYPE_MSI_SEG 0x3
134#define MAP_PIRQ_TYPE_MULTI_MSI 0x4
134 135
135#define PHYSDEVOP_map_pirq 13 136#define PHYSDEVOP_map_pirq 13
136struct physdev_map_pirq { 137struct physdev_map_pirq {
@@ -141,11 +142,16 @@ struct physdev_map_pirq {
141 int index; 142 int index;
142 /* IN or OUT */ 143 /* IN or OUT */
143 int pirq; 144 int pirq;
144 /* IN - high 16 bits hold segment for MAP_PIRQ_TYPE_MSI_SEG */ 145 /* IN - high 16 bits hold segment for ..._MSI_SEG and ..._MULTI_MSI */
145 int bus; 146 int bus;
146 /* IN */ 147 /* IN */
147 int devfn; 148 int devfn;
148 /* IN */ 149 /* IN
150 * - For MSI-X contains entry number.
151 * - For MSI with ..._MULTI_MSI contains number of vectors.
152 * OUT (..._MULTI_MSI only)
153 * - Number of vectors allocated.
154 */
149 int entry_nr; 155 int entry_nr;
150 /* IN */ 156 /* IN */
151 uint64_t table_base; 157 uint64_t table_base;
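A sketch of how a caller asks for several MSI vectors with the new MAP_PIRQ_TYPE_MULTI_MSI type, loosely modeled on the arch/x86/pci/xen.c changes in this series; the field values are illustrative, not a drop-in implementation:

#include <linux/pci.h>
#include <xen/interface/physdev.h>
#include <asm/xen/hypercall.h>

static int example_map_multi_msi(struct pci_dev *dev, int nvec)
{
	struct physdev_map_pirq map = {
		.domid		= DOMID_SELF,
		.type		= MAP_PIRQ_TYPE_MULTI_MSI,
		.index		= -1,
		.pirq		= -1,
		.bus		= dev->bus->number,
		.devfn		= dev->devfn,
		.entry_nr	= nvec,		/* IN: number of vectors wanted */
	};
	int rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map);

	/* on success map.pirq is the first pirq and map.entry_nr the count granted */
	return rc ?: map.pirq;
}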
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index fb2ea8f26552..2cf47175b12b 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -2,6 +2,7 @@
2#define INCLUDE_XEN_OPS_H 2#define INCLUDE_XEN_OPS_H
3 3
4#include <linux/percpu.h> 4#include <linux/percpu.h>
5#include <linux/notifier.h>
5#include <asm/xen/interface.h> 6#include <asm/xen/interface.h>
6 7
7DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); 8DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
@@ -16,6 +17,9 @@ void xen_mm_unpin_all(void);
16void xen_timer_resume(void); 17void xen_timer_resume(void);
17void xen_arch_resume(void); 18void xen_arch_resume(void);
18 19
20void xen_resume_notifier_register(struct notifier_block *nb);
21void xen_resume_notifier_unregister(struct notifier_block *nb);
22
19int xen_setup_shutdown_event(void); 23int xen_setup_shutdown_event(void);
20 24
21extern unsigned long *xen_contiguous_bitmap; 25extern unsigned long *xen_contiguous_bitmap;
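The new resume notifier pair gives drivers a post-resume hook without touching xen/manage.c directly; a minimal sketch where the callback and its message are hypothetical:

#include <linux/notifier.h>
#include <xen/xen-ops.h>

static int my_resume_cb(struct notifier_block *nb, unsigned long action, void *data)
{
	pr_info("xen: resumed, reinitializing driver state\n");
	return NOTIFY_OK;
}

static struct notifier_block my_resume_nb = {
	.notifier_call = my_resume_cb,
};

/* at init:     xen_resume_notifier_register(&my_resume_nb);
 * at teardown: xen_resume_notifier_unregister(&my_resume_nb); */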
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index 569c07f2e344..0324c6d340c1 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -207,7 +207,6 @@ int xenbus_unmap_ring(struct xenbus_device *dev,
207 grant_handle_t handle, void *vaddr); 207 grant_handle_t handle, void *vaddr);
208 208
209int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port); 209int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
210int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
211int xenbus_free_evtchn(struct xenbus_device *dev, int port); 210int xenbus_free_evtchn(struct xenbus_device *dev, int port);
212 211
213enum xenbus_state xenbus_read_driver_state(const char *path); 212enum xenbus_state xenbus_read_driver_state(const char *path);
diff --git a/init/Kconfig b/init/Kconfig
index 8114a06117e3..8851c6417880 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -864,6 +864,7 @@ config NUMA_BALANCING
864 864
865menuconfig CGROUPS 865menuconfig CGROUPS
866 boolean "Control Group support" 866 boolean "Control Group support"
867 select KERNFS
867 help 868 help
868 This option adds support for grouping sets of processes together, for 869 This option adds support for grouping sets of processes together, for
869 use with process control subsystems such as Cpusets, CFS, memory 870 use with process control subsystems such as Cpusets, CFS, memory
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0c753ddd223b..fede3d3f28ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -40,23 +40,20 @@
40#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/backing-dev.h>
44#include <linux/slab.h> 43#include <linux/slab.h>
45#include <linux/magic.h>
46#include <linux/spinlock.h> 44#include <linux/spinlock.h>
45#include <linux/rwsem.h>
47#include <linux/string.h> 46#include <linux/string.h>
48#include <linux/sort.h> 47#include <linux/sort.h>
49#include <linux/kmod.h> 48#include <linux/kmod.h>
50#include <linux/module.h>
51#include <linux/delayacct.h> 49#include <linux/delayacct.h>
52#include <linux/cgroupstats.h> 50#include <linux/cgroupstats.h>
53#include <linux/hashtable.h> 51#include <linux/hashtable.h>
54#include <linux/namei.h>
55#include <linux/pid_namespace.h> 52#include <linux/pid_namespace.h>
56#include <linux/idr.h> 53#include <linux/idr.h>
57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 54#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58#include <linux/flex_array.h> /* used in cgroup_attach_task */
59#include <linux/kthread.h> 55#include <linux/kthread.h>
56#include <linux/delay.h>
60 57
61#include <linux/atomic.h> 58#include <linux/atomic.h>
62 59
@@ -68,43 +65,49 @@
68 */ 65 */
69#define CGROUP_PIDLIST_DESTROY_DELAY HZ 66#define CGROUP_PIDLIST_DESTROY_DELAY HZ
70 67
68#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
69 MAX_CFTYPE_NAME + 2)
70
71/*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */
78static DEFINE_MUTEX(cgroup_tree_mutex);
79
71/* 80/*
72 * cgroup_mutex is the master lock. Any modification to cgroup or its 81 * cgroup_mutex is the master lock. Any modification to cgroup or its
73 * hierarchy must be performed while holding it. 82 * hierarchy must be performed while holding it.
74 * 83 *
75 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify 84 * css_set_rwsem protects task->cgroups pointer, the list of css_set
76 * cgroupfs_root of any cgroup hierarchy - subsys list, flags, 85 * objects, and the chain of tasks off each css_set.
77 * release_agent_path and so on. Modifying requires both cgroup_mutex and
78 * cgroup_root_mutex. Readers can acquire either of the two. This is to
79 * break the following locking order cycle.
80 *
81 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
82 * B. namespace_sem -> cgroup_mutex
83 * 86 *
84 * B happens only through cgroup_show_options() and using cgroup_root_mutex 87 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
85 * breaks it. 88 * cgroup.h can use them for lockdep annotations.
86 */ 89 */
87#ifdef CONFIG_PROVE_RCU 90#ifdef CONFIG_PROVE_RCU
88DEFINE_MUTEX(cgroup_mutex); 91DEFINE_MUTEX(cgroup_mutex);
89EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ 92DECLARE_RWSEM(css_set_rwsem);
93EXPORT_SYMBOL_GPL(cgroup_mutex);
94EXPORT_SYMBOL_GPL(css_set_rwsem);
90#else 95#else
91static DEFINE_MUTEX(cgroup_mutex); 96static DEFINE_MUTEX(cgroup_mutex);
97static DECLARE_RWSEM(css_set_rwsem);
92#endif 98#endif
93 99
94static DEFINE_MUTEX(cgroup_root_mutex); 100/*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */
104static DEFINE_SPINLOCK(release_agent_path_lock);
95 105
96#define cgroup_assert_mutex_or_rcu_locked() \ 106#define cgroup_assert_mutexes_or_rcu_locked() \
97 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \
98 lockdep_is_held(&cgroup_mutex), \ 109 lockdep_is_held(&cgroup_mutex), \
99 "cgroup_mutex or RCU read lock required"); 110 "cgroup_[tree_]mutex or RCU read lock required");
100
101#ifdef CONFIG_LOCKDEP
102#define cgroup_assert_mutex_or_root_locked() \
103 WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
104 !lockdep_is_held(&cgroup_root_mutex)))
105#else
106#define cgroup_assert_mutex_or_root_locked() do { } while (0)
107#endif
108 111
109/* 112/*
110 * cgroup destruction makes heavy use of work items and there can be a lot 113 * cgroup destruction makes heavy use of work items and there can be a lot
@@ -120,42 +123,41 @@ static struct workqueue_struct *cgroup_destroy_wq;
120 */ 123 */
121static struct workqueue_struct *cgroup_pidlist_destroy_wq; 124static struct workqueue_struct *cgroup_pidlist_destroy_wq;
122 125
123/* 126/* generate an array of cgroup subsystem pointers */
124 * Generate an array of cgroup subsystem pointers. At boot time, this is 127#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
125 * populated with the built in subsystems, and modular subsystems are 128static struct cgroup_subsys *cgroup_subsys[] = {
126 * registered after that. The mutable section of this array is protected by 129#include <linux/cgroup_subsys.h>
127 * cgroup_mutex. 130};
128 */ 131#undef SUBSYS
129#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, 132
130#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) 133/* array of cgroup subsystem names */
131static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { 134#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
135static const char *cgroup_subsys_name[] = {
132#include <linux/cgroup_subsys.h> 136#include <linux/cgroup_subsys.h>
133}; 137};
138#undef SUBSYS
134 139
135/* 140/*
136 * The dummy hierarchy, reserved for the subsystems that are otherwise 141 * The default hierarchy, reserved for the subsystems that are otherwise
137 * unattached - it never has more than a single cgroup, and all tasks are 142 * unattached - it never has more than a single cgroup, and all tasks are
138 * part of that cgroup. 143 * part of that cgroup.
139 */ 144 */
140static struct cgroupfs_root cgroup_dummy_root; 145struct cgroup_root cgrp_dfl_root;
141 146
142/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ 147/*
143static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; 148 * The default hierarchy always exists but is hidden until mounted for the
149 * first time. This is for backward compatibility.
150 */
151static bool cgrp_dfl_root_visible;
144 152
145/* The list of hierarchy roots */ 153/* The list of hierarchy roots */
146 154
147static LIST_HEAD(cgroup_roots); 155static LIST_HEAD(cgroup_roots);
148static int cgroup_root_count; 156static int cgroup_root_count;
149 157
150/* 158/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
151 * Hierarchy ID allocation and mapping. It follows the same exclusion
152 * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
153 * writes, either for reads.
154 */
155static DEFINE_IDR(cgroup_hierarchy_idr); 159static DEFINE_IDR(cgroup_hierarchy_idr);
156 160
157static struct cgroup_name root_cgroup_name = { .name = "/" };
158
159/* 161/*
160 * Assign a monotonically increasing serial number to cgroups. It 162 * Assign a monotonically increasing serial number to cgroups. It
161 * guarantees cgroups with bigger numbers are newer than those with smaller 163 * guarantees cgroups with bigger numbers are newer than those with smaller
@@ -175,11 +177,13 @@ static int need_forkexit_callback __read_mostly;
175 177
176static struct cftype cgroup_base_files[]; 178static struct cftype cgroup_base_files[];
177 179
180static void cgroup_put(struct cgroup *cgrp);
181static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask);
178static void cgroup_destroy_css_killed(struct cgroup *cgrp); 183static void cgroup_destroy_css_killed(struct cgroup *cgrp);
179static int cgroup_destroy_locked(struct cgroup *cgrp); 184static int cgroup_destroy_locked(struct cgroup *cgrp);
180static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 185static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
181 bool is_add); 186 bool is_add);
182static int cgroup_file_release(struct inode *inode, struct file *file);
183static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 187static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
184 188
185/** 189/**
@@ -197,8 +201,9 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
197 struct cgroup_subsys *ss) 201 struct cgroup_subsys *ss)
198{ 202{
199 if (ss) 203 if (ss)
200 return rcu_dereference_check(cgrp->subsys[ss->subsys_id], 204 return rcu_dereference_check(cgrp->subsys[ss->id],
201 lockdep_is_held(&cgroup_mutex)); 205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex));
202 else 207 else
203 return &cgrp->dummy_css; 208 return &cgrp->dummy_css;
204} 209}
@@ -209,6 +214,27 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
209 return test_bit(CGRP_DEAD, &cgrp->flags); 214 return test_bit(CGRP_DEAD, &cgrp->flags);
210} 215}
211 216
217struct cgroup_subsys_state *seq_css(struct seq_file *seq)
218{
219 struct kernfs_open_file *of = seq->private;
220 struct cgroup *cgrp = of->kn->parent->priv;
221 struct cftype *cft = seq_cft(seq);
222
223 /*
224 * This is open and unprotected implementation of cgroup_css().
225 * seq_css() is only called from a kernfs file operation which has
226 * an active reference on the file. Because all the subsystem
227 * files are drained before a css is disassociated with a cgroup,
228 * the matching css from the cgroup's subsys table is guaranteed to
229 * be and stay valid until the enclosing operation is complete.
230 */
231 if (cft->ss)
232 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
233 else
234 return &cgrp->dummy_css;
235}
236EXPORT_SYMBOL_GPL(seq_css);
237
212/** 238/**
213 * cgroup_is_descendant - test ancestry 239 * cgroup_is_descendant - test ancestry
214 * @cgrp: the cgroup to be tested 240 * @cgrp: the cgroup to be tested
@@ -227,7 +253,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
227 } 253 }
228 return false; 254 return false;
229} 255}
230EXPORT_SYMBOL_GPL(cgroup_is_descendant);
231 256
232static int cgroup_is_releasable(const struct cgroup *cgrp) 257static int cgroup_is_releasable(const struct cgroup *cgrp)
233{ 258{
@@ -254,54 +279,23 @@ static int notify_on_release(const struct cgroup *cgrp)
254 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 279 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
255 if (!((css) = rcu_dereference_check( \ 280 if (!((css) = rcu_dereference_check( \
256 (cgrp)->subsys[(ssid)], \ 281 (cgrp)->subsys[(ssid)], \
282 lockdep_is_held(&cgroup_tree_mutex) || \
257 lockdep_is_held(&cgroup_mutex)))) { } \ 283 lockdep_is_held(&cgroup_mutex)))) { } \
258 else 284 else
259 285
260/** 286/**
261 * for_each_subsys - iterate all loaded cgroup subsystems 287 * for_each_subsys - iterate all enabled cgroup subsystems
262 * @ss: the iteration cursor 288 * @ss: the iteration cursor
263 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 289 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
264 *
265 * Iterates through all loaded subsystems. Should be called under
266 * cgroup_mutex or cgroup_root_mutex.
267 */ 290 */
268#define for_each_subsys(ss, ssid) \ 291#define for_each_subsys(ss, ssid) \
269 for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ 292 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
270 (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 293 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
271 if (!((ss) = cgroup_subsys[(ssid)])) { } \
272 else
273 294
274/** 295/* iterate across the hierarchies */
275 * for_each_builtin_subsys - iterate all built-in cgroup subsystems 296#define for_each_root(root) \
276 * @ss: the iteration cursor
277 * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
278 *
279 * Built-in subsystems are always present and iteration itself doesn't
280 * require any synchronization.
281 */
282#define for_each_builtin_subsys(ss, i) \
283 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
284 (((ss) = cgroup_subsys[i]) || true); (i)++)
285
286/* iterate across the active hierarchies */
287#define for_each_active_root(root) \
288 list_for_each_entry((root), &cgroup_roots, root_list) 297 list_for_each_entry((root), &cgroup_roots, root_list)
289 298
290static inline struct cgroup *__d_cgrp(struct dentry *dentry)
291{
292 return dentry->d_fsdata;
293}
294
295static inline struct cfent *__d_cfe(struct dentry *dentry)
296{
297 return dentry->d_fsdata;
298}
299
300static inline struct cftype *__d_cft(struct dentry *dentry)
301{
302 return __d_cfe(dentry)->type;
303}
304
305/** 299/**
306 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 300 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
307 * @cgrp: the cgroup to be checked for liveness 301 * @cgrp: the cgroup to be checked for liveness
@@ -347,23 +341,23 @@ struct cgrp_cset_link {
347 struct list_head cgrp_link; 341 struct list_head cgrp_link;
348}; 342};
349 343
350/* The default css_set - used by init and its children prior to any 344/*
345 * The default css_set - used by init and its children prior to any
351 * hierarchies being mounted. It contains a pointer to the root state 346 * hierarchies being mounted. It contains a pointer to the root state
352 * for each subsystem. Also used to anchor the list of css_sets. Not 347 * for each subsystem. Also used to anchor the list of css_sets. Not
353 * reference-counted, to improve performance when child cgroups 348 * reference-counted, to improve performance when child cgroups
354 * haven't been created. 349 * haven't been created.
355 */ 350 */
351static struct css_set init_css_set = {
352 .refcount = ATOMIC_INIT(1),
353 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
354 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
355 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
356 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
357 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
358};
356 359
357static struct css_set init_css_set; 360static int css_set_count = 1; /* 1 for init_css_set */
358static struct cgrp_cset_link init_cgrp_cset_link;
359
360/*
361 * css_set_lock protects the list of css_set objects, and the chain of
362 * tasks off each css_set. Nests outside task->alloc_lock due to
363 * css_task_iter_start().
364 */
365static DEFINE_RWLOCK(css_set_lock);
366static int css_set_count;
367 361
368/* 362/*
369 * hash table for cgroup groups. This improves the performance to find 363 * hash table for cgroup groups. This improves the performance to find
@@ -386,30 +380,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
386 return key; 380 return key;
387} 381}
388 382
389/* 383static void put_css_set_locked(struct css_set *cset, bool taskexit)
390 * We don't maintain the lists running through each css_set to its task
391 * until after the first call to css_task_iter_start(). This reduces the
392 * fork()/exit() overhead for people who have cgroups compiled into their
393 * kernel but not actually in use.
394 */
395static int use_task_css_set_links __read_mostly;
396
397static void __put_css_set(struct css_set *cset, int taskexit)
398{ 384{
399 struct cgrp_cset_link *link, *tmp_link; 385 struct cgrp_cset_link *link, *tmp_link;
400 386
401 /* 387 lockdep_assert_held(&css_set_rwsem);
402 * Ensure that the refcount doesn't hit zero while any readers 388
403 * can see it. Similar to atomic_dec_and_lock(), but for an 389 if (!atomic_dec_and_test(&cset->refcount))
404 * rwlock
405 */
406 if (atomic_add_unless(&cset->refcount, -1, 1))
407 return;
408 write_lock(&css_set_lock);
409 if (!atomic_dec_and_test(&cset->refcount)) {
410 write_unlock(&css_set_lock);
411 return; 390 return;
412 }
413 391
414 /* This css_set is dead. unlink it and release cgroup refcounts */ 392 /* This css_set is dead. unlink it and release cgroup refcounts */
415 hash_del(&cset->hlist); 393 hash_del(&cset->hlist);
@@ -421,7 +399,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
421 list_del(&link->cset_link); 399 list_del(&link->cset_link);
422 list_del(&link->cgrp_link); 400 list_del(&link->cgrp_link);
423 401
424 /* @cgrp can't go away while we're holding css_set_lock */ 402 /* @cgrp can't go away while we're holding css_set_rwsem */
425 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 403 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
426 if (taskexit) 404 if (taskexit)
427 set_bit(CGRP_RELEASABLE, &cgrp->flags); 405 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -431,10 +409,24 @@ static void __put_css_set(struct css_set *cset, int taskexit)
431 kfree(link); 409 kfree(link);
432 } 410 }
433 411
434 write_unlock(&css_set_lock);
435 kfree_rcu(cset, rcu_head); 412 kfree_rcu(cset, rcu_head);
436} 413}
437 414
415static void put_css_set(struct css_set *cset, bool taskexit)
416{
417 /*
418 * Ensure that the refcount doesn't hit zero while any readers
419 * can see it. Similar to atomic_dec_and_lock(), but for an
420 * rwlock
421 */
422 if (atomic_add_unless(&cset->refcount, -1, 1))
423 return;
424
425 down_write(&css_set_rwsem);
426 put_css_set_locked(cset, taskexit);
427 up_write(&css_set_rwsem);
428}
429
438/* 430/*
439 * refcounted get/put for css_set objects 431 * refcounted get/put for css_set objects
440 */ 432 */
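put_css_set() above keeps the old fast path: atomic_add_unless() drops the reference without the (now) rwsem unless this might be the last reference, and only then falls back to the locked put_css_set_locked(). The generic shape of that pattern, with hypothetical example_* names:

#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/slab.h>

struct example_obj {
	atomic_t refcount;
};

static DECLARE_RWSEM(example_rwsem);

static void example_destroy(struct example_obj *obj)
{
	kfree(obj);			/* hypothetical teardown */
}

static void example_put(struct example_obj *obj)
{
	/* fast path: definitely not the last reference, no lock needed */
	if (atomic_add_unless(&obj->refcount, -1, 1))
		return;

	/* slow path: may be the last reference, retry under the lock */
	down_write(&example_rwsem);
	if (atomic_dec_and_test(&obj->refcount))
		example_destroy(obj);
	up_write(&example_rwsem);
}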
@@ -443,16 +435,6 @@ static inline void get_css_set(struct css_set *cset)
443 atomic_inc(&cset->refcount); 435 atomic_inc(&cset->refcount);
444} 436}
445 437
446static inline void put_css_set(struct css_set *cset)
447{
448 __put_css_set(cset, 0);
449}
450
451static inline void put_css_set_taskexit(struct css_set *cset)
452{
453 __put_css_set(cset, 1);
454}
455
456/** 438/**
457 * compare_css_sets - helper function for find_existing_css_set(). 439 * compare_css_sets - helper function for find_existing_css_set().
458 * @cset: candidate css_set being tested 440 * @cset: candidate css_set being tested
@@ -535,7 +517,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
535 struct cgroup *cgrp, 517 struct cgroup *cgrp,
536 struct cgroup_subsys_state *template[]) 518 struct cgroup_subsys_state *template[])
537{ 519{
538 struct cgroupfs_root *root = cgrp->root; 520 struct cgroup_root *root = cgrp->root;
539 struct cgroup_subsys *ss; 521 struct cgroup_subsys *ss;
540 struct css_set *cset; 522 struct css_set *cset;
541 unsigned long key; 523 unsigned long key;
@@ -547,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
547 * won't change, so no need for locking. 529 * won't change, so no need for locking.
548 */ 530 */
549 for_each_subsys(ss, i) { 531 for_each_subsys(ss, i) {
550 if (root->subsys_mask & (1UL << i)) { 532 if (root->cgrp.subsys_mask & (1UL << i)) {
551 /* Subsystem is in this hierarchy. So we want 533 /* Subsystem is in this hierarchy. So we want
552 * the subsystem state from the new 534 * the subsystem state from the new
553 * cgroup */ 535 * cgroup */
@@ -652,11 +634,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
652 634
653 /* First see if we already have a cgroup group that matches 635 /* First see if we already have a cgroup group that matches
654 * the desired set */ 636 * the desired set */
655 read_lock(&css_set_lock); 637 down_read(&css_set_rwsem);
656 cset = find_existing_css_set(old_cset, cgrp, template); 638 cset = find_existing_css_set(old_cset, cgrp, template);
657 if (cset) 639 if (cset)
658 get_css_set(cset); 640 get_css_set(cset);
659 read_unlock(&css_set_lock); 641 up_read(&css_set_rwsem);
660 642
661 if (cset) 643 if (cset)
662 return cset; 644 return cset;
@@ -674,13 +656,16 @@ static struct css_set *find_css_set(struct css_set *old_cset,
674 atomic_set(&cset->refcount, 1); 656 atomic_set(&cset->refcount, 1);
675 INIT_LIST_HEAD(&cset->cgrp_links); 657 INIT_LIST_HEAD(&cset->cgrp_links);
676 INIT_LIST_HEAD(&cset->tasks); 658 INIT_LIST_HEAD(&cset->tasks);
659 INIT_LIST_HEAD(&cset->mg_tasks);
660 INIT_LIST_HEAD(&cset->mg_preload_node);
661 INIT_LIST_HEAD(&cset->mg_node);
677 INIT_HLIST_NODE(&cset->hlist); 662 INIT_HLIST_NODE(&cset->hlist);
678 663
679 /* Copy the set of subsystem state objects generated in 664 /* Copy the set of subsystem state objects generated in
680 * find_existing_css_set() */ 665 * find_existing_css_set() */
681 memcpy(cset->subsys, template, sizeof(cset->subsys)); 666 memcpy(cset->subsys, template, sizeof(cset->subsys));
682 667
683 write_lock(&css_set_lock); 668 down_write(&css_set_rwsem);
684 /* Add reference counts and links from the new css_set. */ 669 /* Add reference counts and links from the new css_set. */
685 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 670 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
686 struct cgroup *c = link->cgrp; 671 struct cgroup *c = link->cgrp;
@@ -698,31 +683,105 @@ static struct css_set *find_css_set(struct css_set *old_cset,
698 key = css_set_hash(cset->subsys); 683 key = css_set_hash(cset->subsys);
699 hash_add(css_set_table, &cset->hlist, key); 684 hash_add(css_set_table, &cset->hlist, key);
700 685
701 write_unlock(&css_set_lock); 686 up_write(&css_set_rwsem);
702 687
703 return cset; 688 return cset;
704} 689}
705 690
706/* 691static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
707 * Return the cgroup for "task" from the given hierarchy. Must be
708 * called with cgroup_mutex held.
709 */
710static struct cgroup *task_cgroup_from_root(struct task_struct *task,
711 struct cgroupfs_root *root)
712{ 692{
713 struct css_set *cset; 693 struct cgroup *root_cgrp = kf_root->kn->priv;
714 struct cgroup *res = NULL; 694
695 return root_cgrp->root;
696}
697
698static int cgroup_init_root_id(struct cgroup_root *root)
699{
700 int id;
701
702 lockdep_assert_held(&cgroup_mutex);
703
704 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
705 if (id < 0)
706 return id;
707
708 root->hierarchy_id = id;
709 return 0;
710}
711
712static void cgroup_exit_root_id(struct cgroup_root *root)
713{
714 lockdep_assert_held(&cgroup_mutex);
715
716 if (root->hierarchy_id) {
717 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
718 root->hierarchy_id = 0;
719 }
720}
721
722static void cgroup_free_root(struct cgroup_root *root)
723{
724 if (root) {
725 /* hierarchy ID should already have been released */
726 WARN_ON_ONCE(root->hierarchy_id);
727
728 idr_destroy(&root->cgroup_idr);
729 kfree(root);
730 }
731}
732
733static void cgroup_destroy_root(struct cgroup_root *root)
734{
735 struct cgroup *cgrp = &root->cgrp;
736 struct cgrp_cset_link *link, *tmp_link;
737
738 mutex_lock(&cgroup_tree_mutex);
739 mutex_lock(&cgroup_mutex);
740
741 BUG_ON(atomic_read(&root->nr_cgrps));
742 BUG_ON(!list_empty(&cgrp->children));
743
744 /* Rebind all subsystems back to the default hierarchy */
745 rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);
715 746
716 BUG_ON(!mutex_is_locked(&cgroup_mutex));
717 read_lock(&css_set_lock);
718 /* 747 /*
719 * No need to lock the task - since we hold cgroup_mutex the 748 * Release all the links from cset_links to this hierarchy's
720 * task can't change groups, so the only thing that can happen 749 * root cgroup
721 * is that it exits and its css is set back to init_css_set.
722 */ 750 */
723 cset = task_css_set(task); 751 down_write(&css_set_rwsem);
752
753 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
754 list_del(&link->cset_link);
755 list_del(&link->cgrp_link);
756 kfree(link);
757 }
758 up_write(&css_set_rwsem);
759
760 if (!list_empty(&root->root_list)) {
761 list_del(&root->root_list);
762 cgroup_root_count--;
763 }
764
765 cgroup_exit_root_id(root);
766
767 mutex_unlock(&cgroup_mutex);
768 mutex_unlock(&cgroup_tree_mutex);
769
770 kernfs_destroy_root(root->kf_root);
771 cgroup_free_root(root);
772}
773
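cgroup_destroy_root() above also shows the lock nesting the new comments establish: cgroup_tree_mutex outermost, then cgroup_mutex, then css_set_rwsem for css_set membership. Reduced to a skeleton, illustration only, as if written inside kernel/cgroup.c:

static void example_locking_order(void)
{
	mutex_lock(&cgroup_tree_mutex);		/* outermost: topology and cftypes */
	mutex_lock(&cgroup_mutex);		/* master cgroup lock */
	down_write(&css_set_rwsem);		/* css_set links and task lists */

	/* ... unlink css_sets, migrate tasks, ... */

	up_write(&css_set_rwsem);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgroup_tree_mutex);
}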
774/* look up cgroup associated with given css_set on the specified hierarchy */
775static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
776 struct cgroup_root *root)
777{
778 struct cgroup *res = NULL;
779
780 lockdep_assert_held(&cgroup_mutex);
781 lockdep_assert_held(&css_set_rwsem);
782
724 if (cset == &init_css_set) { 783 if (cset == &init_css_set) {
725 res = &root->top_cgroup; 784 res = &root->cgrp;
726 } else { 785 } else {
727 struct cgrp_cset_link *link; 786 struct cgrp_cset_link *link;
728 787
@@ -735,16 +794,27 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
735 } 794 }
736 } 795 }
737 } 796 }
738 read_unlock(&css_set_lock); 797
739 BUG_ON(!res); 798 BUG_ON(!res);
740 return res; 799 return res;
741} 800}
742 801
743/* 802/*
744 * There is one global cgroup mutex. We also require taking 803 * Return the cgroup for "task" from the given hierarchy. Must be
745 * task_lock() when dereferencing a task's cgroup subsys pointers. 804 * called with cgroup_mutex and css_set_rwsem held.
746 * See "The task_lock() exception", at the end of this comment. 805 */
747 * 806static struct cgroup *task_cgroup_from_root(struct task_struct *task,
807 struct cgroup_root *root)
808{
809 /*
810 * No need to lock the task - since we hold cgroup_mutex the
811 * task can't change groups, so the only thing that can happen
812 * is that it exits and its css is set back to init_css_set.
813 */
814 return cset_cgroup_from_root(task_css_set(task), root);
815}
816
817/*
748 * A task must hold cgroup_mutex to modify cgroups. 818 * A task must hold cgroup_mutex to modify cgroups.
749 * 819 *
750 * Any task can increment and decrement the count field without lock. 820 * Any task can increment and decrement the count field without lock.
@@ -770,98 +840,79 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
770 * A cgroup can only be deleted if both its 'count' of using tasks 840 * A cgroup can only be deleted if both its 'count' of using tasks
771 * is zero, and its list of 'children' cgroups is empty. Since all 841 * is zero, and its list of 'children' cgroups is empty. Since all
772 * tasks in the system use _some_ cgroup, and since there is always at 842 * tasks in the system use _some_ cgroup, and since there is always at
773 * least one task in the system (init, pid == 1), therefore, top_cgroup 843 * least one task in the system (init, pid == 1), therefore, root cgroup
774 * always has either children cgroups and/or using tasks. So we don't 844 * always has either children cgroups and/or using tasks. So we don't
775 * need a special hack to ensure that top_cgroup cannot be deleted. 845 * need a special hack to ensure that root cgroup cannot be deleted.
776 *
777 * The task_lock() exception
778 *
779 * The need for this exception arises from the action of
780 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
781 * another. It does so using cgroup_mutex, however there are
782 * several performance critical places that need to reference
783 * task->cgroup without the expense of grabbing a system global
784 * mutex. Therefore except as noted below, when dereferencing or, as
785 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
786 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
787 * the task_struct routinely used for such matters.
788 * 846 *
789 * P.S. One more locking exception. RCU is used to guard the 847 * P.S. One more locking exception. RCU is used to guard the
790 * update of a tasks cgroup pointer by cgroup_attach_task() 848 * update of a tasks cgroup pointer by cgroup_attach_task()
791 */ 849 */
792 850
793/*
794 * A couple of forward declarations required, due to cyclic reference loop:
795 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
796 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
797 * -> cgroup_mkdir.
798 */
799
800static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
801static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
802static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 851static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
803static const struct inode_operations cgroup_dir_inode_operations; 852static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
804static const struct file_operations proc_cgroupstats_operations; 853static const struct file_operations proc_cgroupstats_operations;
805 854
806static struct backing_dev_info cgroup_backing_dev_info = { 855static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
807 .name = "cgroup", 856 char *buf)
808 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
809};
810
811static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
812{ 857{
813 struct inode *inode = new_inode(sb); 858 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
814 859 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
815 if (inode) { 860 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
816 inode->i_ino = get_next_ino(); 861 cft->ss->name, cft->name);
817 inode->i_mode = mode; 862 else
818 inode->i_uid = current_fsuid(); 863 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
819 inode->i_gid = current_fsgid(); 864 return buf;
820 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
821 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
822 }
823 return inode;
824} 865}
825 866
826static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) 867/**
868 * cgroup_file_mode - deduce file mode of a control file
869 * @cft: the control file in question
870 *
871 * returns cft->mode if ->mode is not 0
872 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
873 * returns S_IRUGO if it has only a read handler
874 * returns S_IWUSR if it has only a write handler
875 */
876static umode_t cgroup_file_mode(const struct cftype *cft)
827{ 877{
828 struct cgroup_name *name; 878 umode_t mode = 0;
829 879
830 name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); 880 if (cft->mode)
831 if (!name) 881 return cft->mode;
832 return NULL; 882
833 strcpy(name->name, dentry->d_name.name); 883 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
834 return name; 884 mode |= S_IRUGO;
885
886 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
887 cft->trigger)
888 mode |= S_IWUSR;
889
890 return mode;
835} 891}
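To make the mode rules above concrete, here is a hedged sketch of a controller file table and the modes cgroup_file_mode() would derive for it (the file names and handler functions are invented; only the field-to-mode mapping comes from the helper above):

/* illustration only -- the handlers are hypothetical placeholders */
static u64 demo_cur_read(struct cgroup_subsys_state *css, struct cftype *cft);
static int demo_limit_write(struct cgroup_subsys_state *css,
			    struct cftype *cft, u64 val);

static struct cftype demo_files[] = {
	{
		.name = "current",		/* read handler only           */
		.read_u64 = demo_cur_read,	/* -> S_IRUGO (0444)           */
	},
	{
		.name = "limit",		/* read and write handlers     */
		.read_u64 = demo_cur_read,
		.write_u64 = demo_limit_write,	/* -> S_IRUGO | S_IWUSR (0644) */
	},
	{ }	/* terminator */
};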
836 892
837static void cgroup_free_fn(struct work_struct *work) 893static void cgroup_free_fn(struct work_struct *work)
838{ 894{
839 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 895 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
840 896
841 mutex_lock(&cgroup_mutex); 897 atomic_dec(&cgrp->root->nr_cgrps);
842 cgrp->root->number_of_cgroups--;
843 mutex_unlock(&cgroup_mutex);
844
845 /*
846 * We get a ref to the parent's dentry, and put the ref when
847 * this cgroup is being freed, so it's guaranteed that the
848 * parent won't be destroyed before its children.
849 */
850 dput(cgrp->parent->dentry);
851
852 /*
853 * Drop the active superblock reference that we took when we
854 * created the cgroup. This will free cgrp->root, if we are
855 * holding the last reference to @sb.
856 */
857 deactivate_super(cgrp->root->sb);
858
859 cgroup_pidlist_destroy_all(cgrp); 898 cgroup_pidlist_destroy_all(cgrp);
860 899
861 simple_xattrs_free(&cgrp->xattrs); 900 if (cgrp->parent) {
862 901 /*
863 kfree(rcu_dereference_raw(cgrp->name)); 902 * We get a ref to the parent, and put the ref when this
864 kfree(cgrp); 903 * cgroup is being freed, so it's guaranteed that the
904 * parent won't be destroyed before its children.
905 */
906 cgroup_put(cgrp->parent);
907 kernfs_put(cgrp->kn);
908 kfree(cgrp);
909 } else {
910 /*
911 * This is root cgroup's refcnt reaching zero, which
912 * indicates that the root should be released.
913 */
914 cgroup_destroy_root(cgrp->root);
915 }
865} 916}
866 917
867static void cgroup_free_rcu(struct rcu_head *head) 918static void cgroup_free_rcu(struct rcu_head *head)
@@ -872,73 +923,40 @@ static void cgroup_free_rcu(struct rcu_head *head)
872 queue_work(cgroup_destroy_wq, &cgrp->destroy_work); 923 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
873} 924}
874 925
875static void cgroup_diput(struct dentry *dentry, struct inode *inode) 926static void cgroup_get(struct cgroup *cgrp)
876{
877 /* is dentry a directory ? if so, kfree() associated cgroup */
878 if (S_ISDIR(inode->i_mode)) {
879 struct cgroup *cgrp = dentry->d_fsdata;
880
881 BUG_ON(!(cgroup_is_dead(cgrp)));
882
883 /*
884 * XXX: cgrp->id is only used to look up css's. As cgroup
885 * and css's lifetimes will be decoupled, it should be made
886 * per-subsystem and moved to css->id so that lookups are
887 * successful until the target css is released.
888 */
889 mutex_lock(&cgroup_mutex);
890 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
891 mutex_unlock(&cgroup_mutex);
892 cgrp->id = -1;
893
894 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
895 } else {
896 struct cfent *cfe = __d_cfe(dentry);
897 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
898
899 WARN_ONCE(!list_empty(&cfe->node) &&
900 cgrp != &cgrp->root->top_cgroup,
901 "cfe still linked for %s\n", cfe->type->name);
902 simple_xattrs_free(&cfe->xattrs);
903 kfree(cfe);
904 }
905 iput(inode);
906}
907
908static void remove_dir(struct dentry *d)
909{ 927{
910 struct dentry *parent = dget(d->d_parent); 928 WARN_ON_ONCE(cgroup_is_dead(cgrp));
911 929 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
912 d_delete(d); 930 atomic_inc(&cgrp->refcnt);
913 simple_rmdir(parent->d_inode, d);
914 dput(parent);
915} 931}
916 932
917static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 933static void cgroup_put(struct cgroup *cgrp)
918{ 934{
919 struct cfent *cfe; 935 if (!atomic_dec_and_test(&cgrp->refcnt))
920 936 return;
921 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 937 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
922 lockdep_assert_held(&cgroup_mutex); 938 return;
923 939
924 /* 940 /*
925 * If we're doing cleanup due to failure of cgroup_create(), 941 * XXX: cgrp->id is only used to look up css's. As cgroup and
926 * the corresponding @cfe may not exist. 942 * css's lifetimes will be decoupled, it should be made
943 * per-subsystem and moved to css->id so that lookups are
944 * successful until the target css is released.
927 */ 945 */
928 list_for_each_entry(cfe, &cgrp->files, node) { 946 mutex_lock(&cgroup_mutex);
929 struct dentry *d = cfe->dentry; 947 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
948 mutex_unlock(&cgroup_mutex);
949 cgrp->id = -1;
930 950
931 if (cft && cfe->type != cft) 951 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
932 continue; 952}
933 953
934 dget(d); 954static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
935 d_delete(d); 955{
936 simple_unlink(cgrp->dentry->d_inode, d); 956 char name[CGROUP_FILE_NAME_MAX];
937 list_del_init(&cfe->node);
938 dput(d);
939 957
940 break; 958 lockdep_assert_held(&cgroup_tree_mutex);
941 } 959 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
942} 960}
943 961
944/** 962/**
@@ -952,144 +970,106 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
952 int i; 970 int i;
953 971
954 for_each_subsys(ss, i) { 972 for_each_subsys(ss, i) {
955 struct cftype_set *set; 973 struct cftype *cfts;
956 974
957 if (!test_bit(i, &subsys_mask)) 975 if (!test_bit(i, &subsys_mask))
958 continue; 976 continue;
959 list_for_each_entry(set, &ss->cftsets, node) 977 list_for_each_entry(cfts, &ss->cfts, node)
960 cgroup_addrm_files(cgrp, set->cfts, false); 978 cgroup_addrm_files(cgrp, cfts, false);
961 } 979 }
962} 980}
963 981
964/* 982static int rebind_subsystems(struct cgroup_root *dst_root,
965 * NOTE : the dentry must have been dget()'ed 983 unsigned long ss_mask)
966 */
967static void cgroup_d_remove_dir(struct dentry *dentry)
968{
969 struct dentry *parent;
970
971 parent = dentry->d_parent;
972 spin_lock(&parent->d_lock);
973 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
974 list_del_init(&dentry->d_u.d_child);
975 spin_unlock(&dentry->d_lock);
976 spin_unlock(&parent->d_lock);
977 remove_dir(dentry);
978}
979
980/*
981 * Call with cgroup_mutex held. Drops reference counts on modules, including
982 * any duplicate ones that parse_cgroupfs_options took. If this function
983 * returns an error, no reference counts are touched.
984 */
985static int rebind_subsystems(struct cgroupfs_root *root,
986 unsigned long added_mask, unsigned removed_mask)
987{ 984{
988 struct cgroup *cgrp = &root->top_cgroup;
989 struct cgroup_subsys *ss; 985 struct cgroup_subsys *ss;
990 unsigned long pinned = 0; 986 int ssid, ret;
991 int i, ret;
992 987
993 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 988 lockdep_assert_held(&cgroup_tree_mutex);
994 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 989 lockdep_assert_held(&cgroup_mutex);
995 990
996 /* Check that any added subsystems are currently free */ 991 for_each_subsys(ss, ssid) {
997 for_each_subsys(ss, i) { 992 if (!(ss_mask & (1 << ssid)))
998 if (!(added_mask & (1 << i)))
999 continue; 993 continue;
1000 994
1001 /* is the subsystem mounted elsewhere? */ 995 /* if @ss is on the dummy_root, we can always move it */
1002 if (ss->root != &cgroup_dummy_root) { 996 if (ss->root == &cgrp_dfl_root)
1003 ret = -EBUSY; 997 continue;
1004 goto out_put;
1005 }
1006 998
1007 /* pin the module */ 999 /* if @ss has non-root cgroups attached to it, can't move */
1008 if (!try_module_get(ss->module)) { 1000 if (!list_empty(&ss->root->cgrp.children))
1009 ret = -ENOENT; 1001 return -EBUSY;
1010 goto out_put;
1011 }
1012 pinned |= 1 << i;
1013 }
1014 1002
1015 /* subsys could be missing if unloaded between parsing and here */ 1003 /* can't move between two non-dummy roots either */
1016 if (added_mask != pinned) { 1004 if (dst_root != &cgrp_dfl_root)
1017 ret = -ENOENT; 1005 return -EBUSY;
1018 goto out_put;
1019 } 1006 }
1020 1007
1021 ret = cgroup_populate_dir(cgrp, added_mask); 1008 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
1022 if (ret) 1009 if (ret) {
1023 goto out_put; 1010 if (dst_root != &cgrp_dfl_root)
1011 return ret;
1012
1013 /*
1014 * Rebinding back to the default root is not allowed to
1015 * fail. Using both default and non-default roots should
1016 * be rare. Moving subsystems back and forth even more so.
1017 * Just warn about it and continue.
1018 */
1019 if (cgrp_dfl_root_visible) {
1020 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
1021 ret, ss_mask);
1022 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
1023 }
1024 }
1024 1025
1025 /* 1026 /*
1026 * Nothing can fail from this point on. Remove files for the 1027 * Nothing can fail from this point on. Remove files for the
1027 * removed subsystems and rebind each subsystem. 1028 * removed subsystems and rebind each subsystem.
1028 */ 1029 */
1029 cgroup_clear_dir(cgrp, removed_mask); 1030 mutex_unlock(&cgroup_mutex);
1030 1031 for_each_subsys(ss, ssid)
1031 for_each_subsys(ss, i) { 1032 if (ss_mask & (1 << ssid))
1032 unsigned long bit = 1UL << i; 1033 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1033 1034 mutex_lock(&cgroup_mutex);
1034 if (bit & added_mask) {
1035 /* We're binding this subsystem to this hierarchy */
1036 BUG_ON(cgroup_css(cgrp, ss));
1037 BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1038 BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1039 1035
1040 rcu_assign_pointer(cgrp->subsys[i], 1036 for_each_subsys(ss, ssid) {
1041 cgroup_css(cgroup_dummy_top, ss)); 1037 struct cgroup_root *src_root;
1042 cgroup_css(cgrp, ss)->cgroup = cgrp; 1038 struct cgroup_subsys_state *css;
1043 1039
1044 ss->root = root; 1040 if (!(ss_mask & (1 << ssid)))
1045 if (ss->bind) 1041 continue;
1046 ss->bind(cgroup_css(cgrp, ss));
1047 1042
1048 /* refcount was already taken, and we're keeping it */ 1043 src_root = ss->root;
1049 root->subsys_mask |= bit; 1044 css = cgroup_css(&src_root->cgrp, ss);
1050 } else if (bit & removed_mask) {
1051 /* We're removing this subsystem */
1052 BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1053 BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1054 1045
1055 if (ss->bind) 1046 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1056 ss->bind(cgroup_css(cgroup_dummy_top, ss));
1057 1047
1058 cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; 1048 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1059 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1049 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1050 ss->root = dst_root;
1051 css->cgroup = &dst_root->cgrp;
1060 1052
1061 cgroup_subsys[i]->root = &cgroup_dummy_root; 1053 src_root->cgrp.subsys_mask &= ~(1 << ssid);
1054 dst_root->cgrp.subsys_mask |= 1 << ssid;
1062 1055
1063 /* subsystem is now free - drop reference on module */ 1056 if (ss->bind)
1064 module_put(ss->module); 1057 ss->bind(css);
1065 root->subsys_mask &= ~bit;
1066 }
1067 } 1058 }
1068 1059
1069 /* 1060 kernfs_activate(dst_root->cgrp.kn);
1070 * Mark @root has finished binding subsystems. @root->subsys_mask
1071 * now matches the bound subsystems.
1072 */
1073 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1074
1075 return 0; 1061 return 0;
1076
1077out_put:
1078 for_each_subsys(ss, i)
1079 if (pinned & (1 << i))
1080 module_put(ss->module);
1081 return ret;
1082} 1062}
1083 1063
1084static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1064static int cgroup_show_options(struct seq_file *seq,
1065 struct kernfs_root *kf_root)
1085{ 1066{
1086 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1067 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1087 struct cgroup_subsys *ss; 1068 struct cgroup_subsys *ss;
1088 int ssid; 1069 int ssid;
1089 1070
1090 mutex_lock(&cgroup_root_mutex);
1091 for_each_subsys(ss, ssid) 1071 for_each_subsys(ss, ssid)
1092 if (root->subsys_mask & (1 << ssid)) 1072 if (root->cgrp.subsys_mask & (1 << ssid))
1093 seq_printf(seq, ",%s", ss->name); 1073 seq_printf(seq, ",%s", ss->name);
1094 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1074 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1095 seq_puts(seq, ",sane_behavior"); 1075 seq_puts(seq, ",sane_behavior");
@@ -1097,13 +1077,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1097 seq_puts(seq, ",noprefix"); 1077 seq_puts(seq, ",noprefix");
1098 if (root->flags & CGRP_ROOT_XATTR) 1078 if (root->flags & CGRP_ROOT_XATTR)
1099 seq_puts(seq, ",xattr"); 1079 seq_puts(seq, ",xattr");
1080
1081 spin_lock(&release_agent_path_lock);
1100 if (strlen(root->release_agent_path)) 1082 if (strlen(root->release_agent_path))
1101 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1083 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1102 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) 1084 spin_unlock(&release_agent_path_lock);
1085
1086 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1103 seq_puts(seq, ",clone_children"); 1087 seq_puts(seq, ",clone_children");
1104 if (strlen(root->name)) 1088 if (strlen(root->name))
1105 seq_printf(seq, ",name=%s", root->name); 1089 seq_printf(seq, ",name=%s", root->name);
1106 mutex_unlock(&cgroup_root_mutex);
1107 return 0; 1090 return 0;
1108} 1091}
1109 1092
@@ -1115,9 +1098,6 @@ struct cgroup_sb_opts {
1115 char *name; 1098 char *name;
1116 /* User explicitly requested empty subsystem */ 1099 /* User explicitly requested empty subsystem */
1117 bool none; 1100 bool none;
1118
1119 struct cgroupfs_root *new_root;
1120
1121}; 1101};
1122 1102
1123/* 1103/*
@@ -1137,7 +1117,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1137 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1117 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1138 1118
1139#ifdef CONFIG_CPUSETS 1119#ifdef CONFIG_CPUSETS
1140 mask = ~(1UL << cpuset_subsys_id); 1120 mask = ~(1UL << cpuset_cgrp_id);
1141#endif 1121#endif
1142 1122
1143 memset(opts, 0, sizeof(*opts)); 1123 memset(opts, 0, sizeof(*opts));
@@ -1227,30 +1207,34 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1227 return -ENOENT; 1207 return -ENOENT;
1228 } 1208 }
1229 1209
1230 /*
1231 * If the 'all' option was specified select all the subsystems,
1232 * otherwise if 'none', 'name=' and a subsystem name options
1233 * were not specified, let's default to 'all'
1234 */
1235 if (all_ss || (!one_ss && !opts->none && !opts->name))
1236 for_each_subsys(ss, i)
1237 if (!ss->disabled)
1238 set_bit(i, &opts->subsys_mask);
1239
1240 /* Consistency checks */ 1210 /* Consistency checks */
1241 1211
1242 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1212 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1243 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1213 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1244 1214
1245 if (opts->flags & CGRP_ROOT_NOPREFIX) { 1215 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1246 pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); 1216 opts->cpuset_clone_children || opts->release_agent ||
1217 opts->name) {
1218 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1247 return -EINVAL; 1219 return -EINVAL;
1248 } 1220 }
1221 } else {
1222 /*
1223 * If the 'all' option was specified select all the
1224 * subsystems, otherwise if 'none', 'name=' and a subsystem
1225 * name options were not specified, let's default to 'all'
1226 */
1227 if (all_ss || (!one_ss && !opts->none && !opts->name))
1228 for_each_subsys(ss, i)
1229 if (!ss->disabled)
1230 set_bit(i, &opts->subsys_mask);
1249 1231
1250 if (opts->cpuset_clone_children) { 1232 /*
1251 pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); 1233 * We either have to specify by name or by subsystems. (So
1234 * all empty hierarchies must have a name).
1235 */
1236 if (!opts->subsys_mask && !opts->name)
1252 return -EINVAL; 1237 return -EINVAL;
1253 }
1254 } 1238 }
1255 1239
1256 /* 1240 /*
@@ -1266,21 +1250,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1266 if (opts->subsys_mask && opts->none) 1250 if (opts->subsys_mask && opts->none)
1267 return -EINVAL; 1251 return -EINVAL;
1268 1252
1269 /*
1270 * We either have to specify by name or by subsystems. (So all
1271 * empty hierarchies must have a name).
1272 */
1273 if (!opts->subsys_mask && !opts->name)
1274 return -EINVAL;
1275
1276 return 0; 1253 return 0;
1277} 1254}
1278 1255
1279static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1256static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1280{ 1257{
1281 int ret = 0; 1258 int ret = 0;
1282 struct cgroupfs_root *root = sb->s_fs_info; 1259 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1283 struct cgroup *cgrp = &root->top_cgroup;
1284 struct cgroup_sb_opts opts; 1260 struct cgroup_sb_opts opts;
1285 unsigned long added_mask, removed_mask; 1261 unsigned long added_mask, removed_mask;
1286 1262
@@ -1289,21 +1265,20 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1289 return -EINVAL; 1265 return -EINVAL;
1290 } 1266 }
1291 1267
1292 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1268 mutex_lock(&cgroup_tree_mutex);
1293 mutex_lock(&cgroup_mutex); 1269 mutex_lock(&cgroup_mutex);
1294 mutex_lock(&cgroup_root_mutex);
1295 1270
1296 /* See what subsystems are wanted */ 1271 /* See what subsystems are wanted */
1297 ret = parse_cgroupfs_options(data, &opts); 1272 ret = parse_cgroupfs_options(data, &opts);
1298 if (ret) 1273 if (ret)
1299 goto out_unlock; 1274 goto out_unlock;
1300 1275
1301 if (opts.subsys_mask != root->subsys_mask || opts.release_agent) 1276 if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)
1302 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1277 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1303 task_tgid_nr(current), current->comm); 1278 task_tgid_nr(current), current->comm);
1304 1279
1305 added_mask = opts.subsys_mask & ~root->subsys_mask; 1280 added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;
1306 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1281 removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;
1307 1282
1308 /* Don't allow flags or name to change at remount */ 1283 /* Don't allow flags or name to change at remount */
1309 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1284 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
@@ -1316,422 +1291,331 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1316 } 1291 }
1317 1292
1318 /* remounting is not allowed for populated hierarchies */ 1293 /* remounting is not allowed for populated hierarchies */
1319 if (root->number_of_cgroups > 1) { 1294 if (!list_empty(&root->cgrp.children)) {
1320 ret = -EBUSY; 1295 ret = -EBUSY;
1321 goto out_unlock; 1296 goto out_unlock;
1322 } 1297 }
1323 1298
1324 ret = rebind_subsystems(root, added_mask, removed_mask); 1299 ret = rebind_subsystems(root, added_mask);
1325 if (ret) 1300 if (ret)
1326 goto out_unlock; 1301 goto out_unlock;
1327 1302
1328 if (opts.release_agent) 1303 rebind_subsystems(&cgrp_dfl_root, removed_mask);
1304
1305 if (opts.release_agent) {
1306 spin_lock(&release_agent_path_lock);
1329 strcpy(root->release_agent_path, opts.release_agent); 1307 strcpy(root->release_agent_path, opts.release_agent);
1308 spin_unlock(&release_agent_path_lock);
1309 }
1330 out_unlock: 1310 out_unlock:
1331 kfree(opts.release_agent); 1311 kfree(opts.release_agent);
1332 kfree(opts.name); 1312 kfree(opts.name);
1333 mutex_unlock(&cgroup_root_mutex);
1334 mutex_unlock(&cgroup_mutex); 1313 mutex_unlock(&cgroup_mutex);
1335 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1314 mutex_unlock(&cgroup_tree_mutex);
1336 return ret; 1315 return ret;
1337} 1316}
1338 1317
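A worked example of the added_mask/removed_mask arithmetic in cgroup_remount() above, as a standalone sketch (the controller bit positions are made up): a hierarchy currently bound to cpu and cpuacct, remounted with "cpu,freezer", must bind freezer onto this root and hand cpuacct back to the default root.

#include <stdio.h>

enum { CPU = 1 << 0, CPUACCT = 1 << 1, FREEZER = 1 << 2 }; /* hypothetical IDs */

int main(void)
{
	unsigned long cur  = CPU | CPUACCT;	/* root->cgrp.subsys_mask */
	unsigned long want = CPU | FREEZER;	/* opts.subsys_mask       */

	printf("added_mask   = %#lx (freezer: rebind onto this root)\n",
	       want & ~cur);
	printf("removed_mask = %#lx (cpuacct: rebind back to cgrp_dfl_root)\n",
	       cur & ~want);
	return 0;
}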
1339static const struct super_operations cgroup_ops = { 1318/*
1340 .statfs = simple_statfs, 1319 * To reduce the fork() overhead for systems that are not actually using
1341 .drop_inode = generic_delete_inode, 1320 * their cgroups capability, we don't maintain the lists running through
1342 .show_options = cgroup_show_options, 1321 * each css_set to its tasks until we see the list actually used - in other
1343 .remount_fs = cgroup_remount, 1322 * words after the first mount.
1344}; 1323 */
1324static bool use_task_css_set_links __read_mostly;
1325
1326static void cgroup_enable_task_cg_lists(void)
1327{
1328 struct task_struct *p, *g;
1329
1330 down_write(&css_set_rwsem);
1331
1332 if (use_task_css_set_links)
1333 goto out_unlock;
1334
1335 use_task_css_set_links = true;
1336
1337 /*
1338 * We need tasklist_lock because RCU is not safe against
1339 * while_each_thread(). Besides, a forking task that has passed
1340 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1341 * is not guaranteed to have its child immediately visible in the
1342 * tasklist if we walk through it with RCU.
1343 */
1344 read_lock(&tasklist_lock);
1345 do_each_thread(g, p) {
1346 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1347 task_css_set(p) != &init_css_set);
1348
1349 /*
1350 * We should check if the process is exiting, otherwise
1351 * it will race with cgroup_exit() in that the list
1352 * entry won't be deleted though the process has exited.
1353 * Do it while holding siglock so that we don't end up
1354 * racing against cgroup_exit().
1355 */
1356 spin_lock_irq(&p->sighand->siglock);
1357 if (!(p->flags & PF_EXITING)) {
1358 struct css_set *cset = task_css_set(p);
1359
1360 list_add(&p->cg_list, &cset->tasks);
1361 get_css_set(cset);
1362 }
1363 spin_unlock_irq(&p->sighand->siglock);
1364 } while_each_thread(g, p);
1365 read_unlock(&tasklist_lock);
1366out_unlock:
1367 up_write(&css_set_rwsem);
1368}
1345 1369
1346static void init_cgroup_housekeeping(struct cgroup *cgrp) 1370static void init_cgroup_housekeeping(struct cgroup *cgrp)
1347{ 1371{
1372 atomic_set(&cgrp->refcnt, 1);
1348 INIT_LIST_HEAD(&cgrp->sibling); 1373 INIT_LIST_HEAD(&cgrp->sibling);
1349 INIT_LIST_HEAD(&cgrp->children); 1374 INIT_LIST_HEAD(&cgrp->children);
1350 INIT_LIST_HEAD(&cgrp->files);
1351 INIT_LIST_HEAD(&cgrp->cset_links); 1375 INIT_LIST_HEAD(&cgrp->cset_links);
1352 INIT_LIST_HEAD(&cgrp->release_list); 1376 INIT_LIST_HEAD(&cgrp->release_list);
1353 INIT_LIST_HEAD(&cgrp->pidlists); 1377 INIT_LIST_HEAD(&cgrp->pidlists);
1354 mutex_init(&cgrp->pidlist_mutex); 1378 mutex_init(&cgrp->pidlist_mutex);
1355 cgrp->dummy_css.cgroup = cgrp; 1379 cgrp->dummy_css.cgroup = cgrp;
1356 simple_xattrs_init(&cgrp->xattrs);
1357} 1380}
1358 1381
1359static void init_cgroup_root(struct cgroupfs_root *root) 1382static void init_cgroup_root(struct cgroup_root *root,
1383 struct cgroup_sb_opts *opts)
1360{ 1384{
1361 struct cgroup *cgrp = &root->top_cgroup; 1385 struct cgroup *cgrp = &root->cgrp;
1362 1386
1363 INIT_LIST_HEAD(&root->root_list); 1387 INIT_LIST_HEAD(&root->root_list);
1364 root->number_of_cgroups = 1; 1388 atomic_set(&root->nr_cgrps, 1);
1365 cgrp->root = root; 1389 cgrp->root = root;
1366 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1367 init_cgroup_housekeeping(cgrp); 1390 init_cgroup_housekeeping(cgrp);
1368 idr_init(&root->cgroup_idr); 1391 idr_init(&root->cgroup_idr);
1369}
1370
1371static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1372{
1373 int id;
1374
1375 lockdep_assert_held(&cgroup_mutex);
1376 lockdep_assert_held(&cgroup_root_mutex);
1377
1378 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1379 GFP_KERNEL);
1380 if (id < 0)
1381 return id;
1382
1383 root->hierarchy_id = id;
1384 return 0;
1385}
1386
1387static void cgroup_exit_root_id(struct cgroupfs_root *root)
1388{
1389 lockdep_assert_held(&cgroup_mutex);
1390 lockdep_assert_held(&cgroup_root_mutex);
1391
1392 if (root->hierarchy_id) {
1393 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1394 root->hierarchy_id = 0;
1395 }
1396}
1397
1398static int cgroup_test_super(struct super_block *sb, void *data)
1399{
1400 struct cgroup_sb_opts *opts = data;
1401 struct cgroupfs_root *root = sb->s_fs_info;
1402
1403 /* If we asked for a name then it must match */
1404 if (opts->name && strcmp(opts->name, root->name))
1405 return 0;
1406
1407 /*
1408 * If we asked for subsystems (or explicitly for no
1409 * subsystems) then they must match
1410 */
1411 if ((opts->subsys_mask || opts->none)
1412 && (opts->subsys_mask != root->subsys_mask))
1413 return 0;
1414
1415 return 1;
1416}
1417
1418static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1419{
1420 struct cgroupfs_root *root;
1421 1392
1422 if (!opts->subsys_mask && !opts->none)
1423 return NULL;
1424
1425 root = kzalloc(sizeof(*root), GFP_KERNEL);
1426 if (!root)
1427 return ERR_PTR(-ENOMEM);
1428
1429 init_cgroup_root(root);
1430
1431 /*
1432 * We need to set @root->subsys_mask now so that @root can be
1433 * matched by cgroup_test_super() before it finishes
1434 * initialization; otherwise, competing mounts with the same
1435 * options may try to bind the same subsystems instead of waiting
1436 * for the first one leading to unexpected mount errors.
1437 * SUBSYS_BOUND will be set once actual binding is complete.
1438 */
1439 root->subsys_mask = opts->subsys_mask;
1440 root->flags = opts->flags; 1393 root->flags = opts->flags;
1441 if (opts->release_agent) 1394 if (opts->release_agent)
1442 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1443 if (opts->name) 1396 if (opts->name)
1444 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1445 if (opts->cpuset_clone_children) 1398 if (opts->cpuset_clone_children)
1446 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); 1399 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1447 return root;
1448} 1400}
1449 1401
1450static void cgroup_free_root(struct cgroupfs_root *root) 1402static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1451{ 1403{
1452 if (root) { 1404 LIST_HEAD(tmp_links);
1453 /* hierarchy ID should already have been released */ 1405 struct cgroup *root_cgrp = &root->cgrp;
1454 WARN_ON_ONCE(root->hierarchy_id); 1406 struct css_set *cset;
1455 1407 int i, ret;
1456 idr_destroy(&root->cgroup_idr);
1457 kfree(root);
1458 }
1459}
1460 1408
1461static int cgroup_set_super(struct super_block *sb, void *data) 1409 lockdep_assert_held(&cgroup_tree_mutex);
1462{ 1410 lockdep_assert_held(&cgroup_mutex);
1463 int ret;
1464 struct cgroup_sb_opts *opts = data;
1465 1411
1466 /* If we don't have a new root, we can't set up a new sb */ 1412 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1467 if (!opts->new_root) 1413 if (ret < 0)
1468 return -EINVAL; 1414 goto out;
1415 root_cgrp->id = ret;
1469 1416
1470 BUG_ON(!opts->subsys_mask && !opts->none); 1417 /*
1418 * We're accessing css_set_count without locking css_set_rwsem here,
1419 * but that's OK - it can only be increased by someone holding
1420 * cgroup_lock, and that's us. The worst that can happen is that we
1421 * have some link structures left over
1422 */
1423 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1424 if (ret)
1425 goto out;
1471 1426
1472 ret = set_anon_super(sb, NULL); 1427 ret = cgroup_init_root_id(root);
1473 if (ret) 1428 if (ret)
1474 return ret; 1429 goto out;
1475 1430
1476 sb->s_fs_info = opts->new_root; 1431 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1477 opts->new_root->sb = sb; 1432 KERNFS_ROOT_CREATE_DEACTIVATED,
1433 root_cgrp);
1434 if (IS_ERR(root->kf_root)) {
1435 ret = PTR_ERR(root->kf_root);
1436 goto exit_root_id;
1437 }
1438 root_cgrp->kn = root->kf_root->kn;
1478 1439
1479 sb->s_blocksize = PAGE_CACHE_SIZE; 1440 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1480 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1441 if (ret)
1481 sb->s_magic = CGROUP_SUPER_MAGIC; 1442 goto destroy_root;
1482 sb->s_op = &cgroup_ops;
1483 1443
1484 return 0; 1444 ret = rebind_subsystems(root, ss_mask);
1485} 1445 if (ret)
1446 goto destroy_root;
1486 1447
1487static int cgroup_get_rootdir(struct super_block *sb) 1448 /*
1488{ 1449 * There must be no failure case after here, since rebinding takes
1489 static const struct dentry_operations cgroup_dops = { 1450 * care of subsystems' refcounts, which are explicitly dropped in
1490 .d_iput = cgroup_diput, 1451 * the failure exit path.
1491 .d_delete = always_delete_dentry, 1452 */
1492 }; 1453 list_add(&root->root_list, &cgroup_roots);
1454 cgroup_root_count++;
1493 1455
1494 struct inode *inode = 1456 /*
1495 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1457 * Link the root cgroup in this hierarchy into all the css_set
1458 * objects.
1459 */
1460 down_write(&css_set_rwsem);
1461 hash_for_each(css_set_table, i, cset, hlist)
1462 link_css_set(&tmp_links, cset, root_cgrp);
1463 up_write(&css_set_rwsem);
1496 1464
1497 if (!inode) 1465 BUG_ON(!list_empty(&root_cgrp->children));
1498 return -ENOMEM; 1466 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1499 1467
1500 inode->i_fop = &simple_dir_operations; 1468 kernfs_activate(root_cgrp->kn);
1501 inode->i_op = &cgroup_dir_inode_operations; 1469 ret = 0;
1502 /* directories start off with i_nlink == 2 (for "." entry) */ 1470 goto out;
1503 inc_nlink(inode); 1471
1504 sb->s_root = d_make_root(inode); 1472destroy_root:
1505 if (!sb->s_root) 1473 kernfs_destroy_root(root->kf_root);
1506 return -ENOMEM; 1474 root->kf_root = NULL;
1507 /* for everything else we want ->d_op set */ 1475exit_root_id:
1508 sb->s_d_op = &cgroup_dops; 1476 cgroup_exit_root_id(root);
1509 return 0; 1477out:
1478 free_cgrp_cset_links(&tmp_links);
1479 return ret;
1510} 1480}
1511 1481
1512static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1482static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1513 int flags, const char *unused_dev_name, 1483 int flags, const char *unused_dev_name,
1514 void *data) 1484 void *data)
1515{ 1485{
1486 struct cgroup_root *root;
1516 struct cgroup_sb_opts opts; 1487 struct cgroup_sb_opts opts;
1517 struct cgroupfs_root *root; 1488 struct dentry *dentry;
1518 int ret = 0; 1489 int ret;
1519 struct super_block *sb;
1520 struct cgroupfs_root *new_root;
1521 struct list_head tmp_links;
1522 struct inode *inode;
1523 const struct cred *cred;
1524 1490
1525 /* First find the desired set of subsystems */ 1491 /*
1492 * The first time anyone tries to mount a cgroup, enable the list
1493 * linking each css_set to its tasks and fix up all existing tasks.
1494 */
1495 if (!use_task_css_set_links)
1496 cgroup_enable_task_cg_lists();
1497retry:
1498 mutex_lock(&cgroup_tree_mutex);
1526 mutex_lock(&cgroup_mutex); 1499 mutex_lock(&cgroup_mutex);
1500
1501 /* First find the desired set of subsystems */
1527 ret = parse_cgroupfs_options(data, &opts); 1502 ret = parse_cgroupfs_options(data, &opts);
1528 mutex_unlock(&cgroup_mutex);
1529 if (ret) 1503 if (ret)
1530 goto out_err; 1504 goto out_unlock;
1531
1532 /*
1533 * Allocate a new cgroup root. We may not need it if we're
1534 * reusing an existing hierarchy.
1535 */
1536 new_root = cgroup_root_from_opts(&opts);
1537 if (IS_ERR(new_root)) {
1538 ret = PTR_ERR(new_root);
1539 goto out_err;
1540 }
1541 opts.new_root = new_root;
1542 1505
1543 /* Locate an existing or new sb for this hierarchy */ 1506 /* look for a matching existing root */
1544 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1507 if (!opts.subsys_mask && !opts.none && !opts.name) {
1545 if (IS_ERR(sb)) { 1508 cgrp_dfl_root_visible = true;
1546 ret = PTR_ERR(sb); 1509 root = &cgrp_dfl_root;
1547 cgroup_free_root(opts.new_root); 1510 cgroup_get(&root->cgrp);
1548 goto out_err; 1511 ret = 0;
1512 goto out_unlock;
1549 } 1513 }
1550 1514
1551 root = sb->s_fs_info; 1515 for_each_root(root) {
1552 BUG_ON(!root); 1516 bool name_match = false;
1553 if (root == opts.new_root) {
1554 /* We used the new root structure, so this is a new hierarchy */
1555 struct cgroup *root_cgrp = &root->top_cgroup;
1556 struct cgroupfs_root *existing_root;
1557 int i;
1558 struct css_set *cset;
1559
1560 BUG_ON(sb->s_root != NULL);
1561
1562 ret = cgroup_get_rootdir(sb);
1563 if (ret)
1564 goto drop_new_super;
1565 inode = sb->s_root->d_inode;
1566
1567 mutex_lock(&inode->i_mutex);
1568 mutex_lock(&cgroup_mutex);
1569 mutex_lock(&cgroup_root_mutex);
1570
1571 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1572 if (ret < 0)
1573 goto unlock_drop;
1574 root_cgrp->id = ret;
1575
1576 /* Check for name clashes with existing mounts */
1577 ret = -EBUSY;
1578 if (strlen(root->name))
1579 for_each_active_root(existing_root)
1580 if (!strcmp(existing_root->name, root->name))
1581 goto unlock_drop;
1582
1583 /*
1584 * We're accessing css_set_count without locking
1585 * css_set_lock here, but that's OK - it can only be
1586 * increased by someone holding cgroup_lock, and
1587 * that's us. The worst that can happen is that we
1588 * have some link structures left over
1589 */
1590 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1591 if (ret)
1592 goto unlock_drop;
1593 1517
1594 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ 1518 if (root == &cgrp_dfl_root)
1595 ret = cgroup_init_root_id(root, 2, 0); 1519 continue;
1596 if (ret)
1597 goto unlock_drop;
1598
1599 sb->s_root->d_fsdata = root_cgrp;
1600 root_cgrp->dentry = sb->s_root;
1601
1602 /*
1603 * We're inside get_sb() and will call lookup_one_len() to
1604 * create the root files, which doesn't work if SELinux is
1605 * in use. The following cred dancing somehow works around
1606 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1607 * populating new cgroupfs mount") for more details.
1608 */
1609 cred = override_creds(&init_cred);
1610
1611 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1612 if (ret)
1613 goto rm_base_files;
1614
1615 ret = rebind_subsystems(root, root->subsys_mask, 0);
1616 if (ret)
1617 goto rm_base_files;
1618
1619 revert_creds(cred);
1620 1520
1621 /* 1521 /*
1622 * There must be no failure case after here, since rebinding 1522 * If we asked for a name then it must match. Also, if
1623 * takes care of subsystems' refcounts, which are explicitly 1523 * name matches but subsys_mask doesn't, we should fail.
1624 * dropped in the failure exit path. 1524 * Remember whether name matched.
1625 */ 1525 */
1526 if (opts.name) {
1527 if (strcmp(opts.name, root->name))
1528 continue;
1529 name_match = true;
1530 }
1626 1531
1627 list_add(&root->root_list, &cgroup_roots);
1628 cgroup_root_count++;
1629
1630 /* Link the top cgroup in this hierarchy into all
1631 * the css_set objects */
1632 write_lock(&css_set_lock);
1633 hash_for_each(css_set_table, i, cset, hlist)
1634 link_css_set(&tmp_links, cset, root_cgrp);
1635 write_unlock(&css_set_lock);
1636
1637 free_cgrp_cset_links(&tmp_links);
1638
1639 BUG_ON(!list_empty(&root_cgrp->children));
1640 BUG_ON(root->number_of_cgroups != 1);
1641
1642 mutex_unlock(&cgroup_root_mutex);
1643 mutex_unlock(&cgroup_mutex);
1644 mutex_unlock(&inode->i_mutex);
1645 } else {
1646 /* 1532 /*
1647 * We re-used an existing hierarchy - the new root (if 1533 * If we asked for subsystems (or explicitly for no
1648 * any) is not needed 1534 * subsystems) then they must match.
1649 */ 1535 */
1650 cgroup_free_root(opts.new_root); 1536 if ((opts.subsys_mask || opts.none) &&
1537 (opts.subsys_mask != root->cgrp.subsys_mask)) {
1538 if (!name_match)
1539 continue;
1540 ret = -EBUSY;
1541 goto out_unlock;
1542 }
1651 1543
1652 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1544 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1653 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1545 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1654 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1546 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1655 ret = -EINVAL; 1547 ret = -EINVAL;
1656 goto drop_new_super; 1548 goto out_unlock;
1657 } else { 1549 } else {
1658 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1550 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1659 } 1551 }
1660 } 1552 }
1661 }
1662
1663 kfree(opts.release_agent);
1664 kfree(opts.name);
1665 return dget(sb->s_root);
1666
1667 rm_base_files:
1668 free_cgrp_cset_links(&tmp_links);
1669 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1670 revert_creds(cred);
1671 unlock_drop:
1672 cgroup_exit_root_id(root);
1673 mutex_unlock(&cgroup_root_mutex);
1674 mutex_unlock(&cgroup_mutex);
1675 mutex_unlock(&inode->i_mutex);
1676 drop_new_super:
1677 deactivate_locked_super(sb);
1678 out_err:
1679 kfree(opts.release_agent);
1680 kfree(opts.name);
1681 return ERR_PTR(ret);
1682}
1683
1684static void cgroup_kill_sb(struct super_block *sb)
1685{
1686 struct cgroupfs_root *root = sb->s_fs_info;
1687 struct cgroup *cgrp = &root->top_cgroup;
1688 struct cgrp_cset_link *link, *tmp_link;
1689 int ret;
1690
1691 BUG_ON(!root);
1692
1693 BUG_ON(root->number_of_cgroups != 1);
1694 BUG_ON(!list_empty(&cgrp->children));
1695 1553
1696 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1554 /*
1697 mutex_lock(&cgroup_mutex); 1555 * A root's lifetime is governed by its root cgroup. Zero
1698 mutex_lock(&cgroup_root_mutex); 1556 * ref indicates that the root is being destroyed. Wait for
1557 * destruction to complete so that the subsystems are free.
1558 * We can use wait_queue for the wait but this path is
1559 * super cold. Let's just sleep for a bit and retry.
1560 */
1561 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
1562 mutex_unlock(&cgroup_mutex);
1563 mutex_unlock(&cgroup_tree_mutex);
1564 kfree(opts.release_agent);
1565 kfree(opts.name);
1566 msleep(10);
1567 goto retry;
1568 }
1699 1569
1700 /* Rebind all subsystems back to the default hierarchy */ 1570 ret = 0;
1701 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { 1571 goto out_unlock;
1702 ret = rebind_subsystems(root, 0, root->subsys_mask);
1703 /* Shouldn't be able to fail ... */
1704 BUG_ON(ret);
1705 } 1572 }
1706 1573
1707 /* 1574 /*
1708 * Release all the links from cset_links to this hierarchy's 1575 * No such thing, create a new one. name= matching without subsys
1709 * root cgroup 1576 * specification is allowed for already existing hierarchies but we
1577 * can't create new one without subsys specification.
1710 */ 1578 */
1711 write_lock(&css_set_lock); 1579 if (!opts.subsys_mask && !opts.none) {
1712 1580 ret = -EINVAL;
1713 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 1581 goto out_unlock;
1714 list_del(&link->cset_link);
1715 list_del(&link->cgrp_link);
1716 kfree(link);
1717 } 1582 }
1718 write_unlock(&css_set_lock);
1719 1583
1720 if (!list_empty(&root->root_list)) { 1584 root = kzalloc(sizeof(*root), GFP_KERNEL);
1721 list_del(&root->root_list); 1585 if (!root) {
1722 cgroup_root_count--; 1586 ret = -ENOMEM;
1587 goto out_unlock;
1723 } 1588 }
1724 1589
1725 cgroup_exit_root_id(root); 1590 init_cgroup_root(root, &opts);
1726 1591
1727 mutex_unlock(&cgroup_root_mutex); 1592 ret = cgroup_setup_root(root, opts.subsys_mask);
1593 if (ret)
1594 cgroup_free_root(root);
1595
1596out_unlock:
1728 mutex_unlock(&cgroup_mutex); 1597 mutex_unlock(&cgroup_mutex);
1729 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1598 mutex_unlock(&cgroup_tree_mutex);
1730 1599
1731 simple_xattrs_free(&cgrp->xattrs); 1600 kfree(opts.release_agent);
1601 kfree(opts.name);
1732 1602
1733 kill_litter_super(sb); 1603 if (ret)
1734 cgroup_free_root(root); 1604 return ERR_PTR(ret);
1605
1606 dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL);
1607 if (IS_ERR(dentry))
1608 cgroup_put(&root->cgrp);
1609 return dentry;
1610}
1611
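The atomic_inc_not_zero() loop in cgroup_mount() above is the usual "tryget or back off" idiom; restated in isolation (hypothetical object, not part of the patch), it shows why sleeping and redoing the lookup is preferable to waiting on a half-destroyed root:

/* illustration only: take a reference unless release is already in flight */
static bool demo_tryget(atomic_t *refcnt)
{
	/*
	 * Fails exactly when the count has reached zero, i.e. the object is
	 * being torn down; the caller (as cgroup_mount() does above) drops
	 * its locks, sleeps briefly and retries the lookup instead of trying
	 * to resurrect the dying object.
	 */
	return atomic_inc_not_zero(refcnt);
}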
1612static void cgroup_kill_sb(struct super_block *sb)
1613{
1614 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1615 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1616
1617 cgroup_put(&root->cgrp);
1618 kernfs_kill_sb(sb);
1735} 1619}
1736 1620
1737static struct file_system_type cgroup_fs_type = { 1621static struct file_system_type cgroup_fs_type = {
@@ -1743,57 +1627,6 @@ static struct file_system_type cgroup_fs_type = {
1743static struct kobject *cgroup_kobj; 1627static struct kobject *cgroup_kobj;
1744 1628
1745/** 1629/**
1746 * cgroup_path - generate the path of a cgroup
1747 * @cgrp: the cgroup in question
1748 * @buf: the buffer to write the path into
1749 * @buflen: the length of the buffer
1750 *
1751 * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
1752 *
1753 * We can't generate cgroup path using dentry->d_name, as accessing
1754 * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
1755 * inode's i_mutex, while on the other hand cgroup_path() can be called
1756 * with some irq-safe spinlocks held.
1757 */
1758int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1759{
1760 int ret = -ENAMETOOLONG;
1761 char *start;
1762
1763 if (!cgrp->parent) {
1764 if (strlcpy(buf, "/", buflen) >= buflen)
1765 return -ENAMETOOLONG;
1766 return 0;
1767 }
1768
1769 start = buf + buflen - 1;
1770 *start = '\0';
1771
1772 rcu_read_lock();
1773 do {
1774 const char *name = cgroup_name(cgrp);
1775 int len;
1776
1777 len = strlen(name);
1778 if ((start -= len) < buf)
1779 goto out;
1780 memcpy(start, name, len);
1781
1782 if (--start < buf)
1783 goto out;
1784 *start = '/';
1785
1786 cgrp = cgrp->parent;
1787 } while (cgrp->parent);
1788 ret = 0;
1789 memmove(buf, start, buf + buflen - start);
1790out:
1791 rcu_read_unlock();
1792 return ret;
1793}
1794EXPORT_SYMBOL_GPL(cgroup_path);
1795
1796/**
1797 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy 1630 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1798 * @task: target task 1631 * @task: target task
1799 * @buf: the buffer to write the path into 1632 * @buf: the buffer to write the path into
@@ -1804,49 +1637,55 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1804 * function grabs cgroup_mutex and shouldn't be used inside locks used by 1637 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1805 * cgroup controller callbacks. 1638 * cgroup controller callbacks.
1806 * 1639 *
1807 * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. 1640 * Return value is the same as kernfs_path().
1808 */ 1641 */
1809int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) 1642char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1810{ 1643{
1811 struct cgroupfs_root *root; 1644 struct cgroup_root *root;
1812 struct cgroup *cgrp; 1645 struct cgroup *cgrp;
1813 int hierarchy_id = 1, ret = 0; 1646 int hierarchy_id = 1;
1814 1647 char *path = NULL;
1815 if (buflen < 2)
1816 return -ENAMETOOLONG;
1817 1648
1818 mutex_lock(&cgroup_mutex); 1649 mutex_lock(&cgroup_mutex);
1650 down_read(&css_set_rwsem);
1819 1651
1820 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 1652 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1821 1653
1822 if (root) { 1654 if (root) {
1823 cgrp = task_cgroup_from_root(task, root); 1655 cgrp = task_cgroup_from_root(task, root);
1824 ret = cgroup_path(cgrp, buf, buflen); 1656 path = cgroup_path(cgrp, buf, buflen);
1825 } else { 1657 } else {
1826 /* if no hierarchy exists, everyone is in "/" */ 1658 /* if no hierarchy exists, everyone is in "/" */
1827 memcpy(buf, "/", 2); 1659 if (strlcpy(buf, "/", buflen) < buflen)
1660 path = buf;
1828 } 1661 }
1829 1662
1663 up_read(&css_set_rwsem);
1830 mutex_unlock(&cgroup_mutex); 1664 mutex_unlock(&cgroup_mutex);
1831 return ret; 1665 return path;
1832} 1666}
1833EXPORT_SYMBOL_GPL(task_cgroup_path); 1667EXPORT_SYMBOL_GPL(task_cgroup_path);
1834 1668
1835/* 1669/* used to track tasks and other necessary states during migration */
1836 * Control Group taskset
1837 */
1838struct task_and_cgroup {
1839 struct task_struct *task;
1840 struct cgroup *cgrp;
1841 struct css_set *cset;
1842};
1843
1844struct cgroup_taskset { 1670struct cgroup_taskset {
1845 struct task_and_cgroup single; 1671 /* the src and dst cset list running through cset->mg_node */
1846 struct flex_array *tc_array; 1672 struct list_head src_csets;
1847 int tc_array_len; 1673 struct list_head dst_csets;
1848 int idx; 1674
1849 struct cgroup *cur_cgrp; 1675 /*
1676 * Fields for cgroup_taskset_*() iteration.
1677 *
1678 * Before migration is committed, the target migration tasks are on
1679 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
1680 * the csets on ->dst_csets. ->csets point to either ->src_csets
1681 * or ->dst_csets depending on whether migration is committed.
1682 *
1683 * ->cur_csets and ->cur_task point to the current task position
1684 * during iteration.
1685 */
1686 struct list_head *csets;
1687 struct css_set *cur_cset;
1688 struct task_struct *cur_task;
1850}; 1689};
1851 1690
1852/** 1691/**
@@ -1857,15 +1696,11 @@ struct cgroup_taskset {
1857 */ 1696 */
1858struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) 1697struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1859{ 1698{
1860 if (tset->tc_array) { 1699 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1861 tset->idx = 0; 1700 tset->cur_task = NULL;
1862 return cgroup_taskset_next(tset); 1701
1863 } else { 1702 return cgroup_taskset_next(tset);
1864 tset->cur_cgrp = tset->single.cgrp;
1865 return tset->single.task;
1866 }
1867} 1703}
1868EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1869 1704
1870/** 1705/**
1871 * cgroup_taskset_next - iterate to the next task in taskset 1706 * cgroup_taskset_next - iterate to the next task in taskset
@@ -1876,48 +1711,36 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1876 */ 1711 */
1877struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) 1712struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1878{ 1713{
1879 struct task_and_cgroup *tc; 1714 struct css_set *cset = tset->cur_cset;
1715 struct task_struct *task = tset->cur_task;
1880 1716
1881 if (!tset->tc_array || tset->idx >= tset->tc_array_len) 1717 while (&cset->mg_node != tset->csets) {
1882 return NULL; 1718 if (!task)
1719 task = list_first_entry(&cset->mg_tasks,
1720 struct task_struct, cg_list);
1721 else
1722 task = list_next_entry(task, cg_list);
1883 1723
1884 tc = flex_array_get(tset->tc_array, tset->idx++); 1724 if (&task->cg_list != &cset->mg_tasks) {
1885 tset->cur_cgrp = tc->cgrp; 1725 tset->cur_cset = cset;
1886 return tc->task; 1726 tset->cur_task = task;
1887} 1727 return task;
1888EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1728 }
1889 1729
1890/** 1730 cset = list_next_entry(cset, mg_node);
1891 * cgroup_taskset_cur_css - return the matching css for the current task 1731 task = NULL;
1892 * @tset: taskset of interest 1732 }
1893 * @subsys_id: the ID of the target subsystem
1894 *
1895 * Return the css for the current (last returned) task of @tset for
1896 * subsystem specified by @subsys_id. This function must be preceded by
1897 * either cgroup_taskset_first() or cgroup_taskset_next().
1898 */
1899struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1900 int subsys_id)
1901{
1902 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1903}
1904EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1905 1733
1906/** 1734 return NULL;
1907 * cgroup_taskset_size - return the number of tasks in taskset
1908 * @tset: taskset of interest
1909 */
1910int cgroup_taskset_size(struct cgroup_taskset *tset)
1911{
1912 return tset->tc_array ? tset->tc_array_len : 1;
1913} 1735}
1914EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1915 1736
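A controller callback walks a taskset with the two iterators defined above; a hedged sketch of what a hypothetical ->attach() method looks like under the new css_set-based iteration (the per-task body is a placeholder):

/* illustration only: visit every task being migrated in this taskset */
static void demo_attach(struct cgroup_subsys_state *css,
			struct cgroup_taskset *tset)
{
	struct task_struct *task;

	for (task = cgroup_taskset_first(tset); task;
	     task = cgroup_taskset_next(tset))
		/* per-task work on behalf of @css goes here */
		;
}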
1916 1737/**
1917/*
1918 * cgroup_task_migrate - move a task from one cgroup to another. 1738 * cgroup_task_migrate - move a task from one cgroup to another.
1739 * @old_cgrp: the cgroup @tsk is being migrated from
1740 * @tsk: the task being migrated
1741 * @new_cset: the new css_set @tsk is being attached to
1919 * 1742 *
1920 * Must be called with cgroup_mutex and threadgroup locked. 1743 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
1921 */ 1744 */
1922static void cgroup_task_migrate(struct cgroup *old_cgrp, 1745static void cgroup_task_migrate(struct cgroup *old_cgrp,
1923 struct task_struct *tsk, 1746 struct task_struct *tsk,
@@ -1925,6 +1748,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1925{ 1748{
1926 struct css_set *old_cset; 1749 struct css_set *old_cset;
1927 1750
1751 lockdep_assert_held(&cgroup_mutex);
1752 lockdep_assert_held(&css_set_rwsem);
1753
1928 /* 1754 /*
1929 * We are synchronized through threadgroup_lock() against PF_EXITING 1755 * We are synchronized through threadgroup_lock() against PF_EXITING
1930 * setting such that we can't race against cgroup_exit() changing the 1756 * setting such that we can't race against cgroup_exit() changing the
@@ -1933,15 +1759,16 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1933 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1759 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1934 old_cset = task_css_set(tsk); 1760 old_cset = task_css_set(tsk);
1935 1761
1936 task_lock(tsk); 1762 get_css_set(new_cset);
1937 rcu_assign_pointer(tsk->cgroups, new_cset); 1763 rcu_assign_pointer(tsk->cgroups, new_cset);
1938 task_unlock(tsk);
1939 1764
1940 /* Update the css_set linked lists if we're using them */ 1765 /*
1941 write_lock(&css_set_lock); 1766 * Use move_tail so that cgroup_taskset_first() still returns the
1942 if (!list_empty(&tsk->cg_list)) 1767 * leader after migration. This works because cgroup_migrate()
1943 list_move(&tsk->cg_list, &new_cset->tasks); 1768 * ensures that the dst_cset of the leader is the first on the
1944 write_unlock(&css_set_lock); 1769 * tset's dst_csets list.
1770 */
1771 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1945 1772
1946 /* 1773 /*
1947 * We just gained a reference on old_cset by taking it from the 1774 * We just gained a reference on old_cset by taking it from the
@@ -1949,100 +1776,199 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1949 * we're safe to drop it here; it will be freed under RCU. 1776 * we're safe to drop it here; it will be freed under RCU.
1950 */ 1777 */
1951 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 1778 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1952 put_css_set(old_cset); 1779 put_css_set_locked(old_cset, false);
1953} 1780}
1954 1781
1955/** 1782/**
1956 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup 1783 * cgroup_migrate_finish - cleanup after attach
1957 * @cgrp: the cgroup to attach to 1784 * @preloaded_csets: list of preloaded css_sets
1958 * @tsk: the task or the leader of the threadgroup to be attached
1959 * @threadgroup: attach the whole threadgroup?
1960 * 1785 *
1961 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1786 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
1962 * task_lock of @tsk or each thread in the threadgroup individually in turn. 1787 * those functions for details.
1963 */ 1788 */
1964static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, 1789static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1965 bool threadgroup)
1966{ 1790{
1967 int retval, i, group_size; 1791 struct css_set *cset, *tmp_cset;
1968 struct cgroupfs_root *root = cgrp->root;
1969 struct cgroup_subsys_state *css, *failed_css = NULL;
1970 /* threadgroup list cursor and array */
1971 struct task_struct *leader = tsk;
1972 struct task_and_cgroup *tc;
1973 struct flex_array *group;
1974 struct cgroup_taskset tset = { };
1975 1792
1976 /* 1793 lockdep_assert_held(&cgroup_mutex);
1977 * step 0: in order to do expensive, possibly blocking operations for 1794
1978 * every thread, we cannot iterate the thread group list, since it needs 1795 down_write(&css_set_rwsem);
1979 * rcu or tasklist locked. instead, build an array of all threads in the 1796 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1980 * group - group_rwsem prevents new threads from appearing, and if 1797 cset->mg_src_cgrp = NULL;
1981 * threads exit, this will just be an over-estimate. 1798 cset->mg_dst_cset = NULL;
1982 */ 1799 list_del_init(&cset->mg_preload_node);
1983 if (threadgroup) 1800 put_css_set_locked(cset, false);
1984 group_size = get_nr_threads(tsk); 1801 }
1985 else 1802 up_write(&css_set_rwsem);
1986 group_size = 1; 1803}
1987 /* flex_array supports very large thread-groups better than kmalloc. */ 1804
1988 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 1805/**
1989 if (!group) 1806 * cgroup_migrate_add_src - add a migration source css_set
1990 return -ENOMEM; 1807 * @src_cset: the source css_set to add
1991 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 1808 * @dst_cgrp: the destination cgroup
1992 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); 1809 * @preloaded_csets: list of preloaded css_sets
1993 if (retval) 1810 *
1994 goto out_free_group_list; 1811 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
1812 * @src_cset and add it to @preloaded_csets, which should later be cleaned
1813 * up by cgroup_migrate_finish().
1814 *
1815 * This function may be called without holding threadgroup_lock even if the
1816 * target is a process. Threads may be created and destroyed but as long
1817 * as cgroup_mutex is not dropped, no new css_set can be put into play and
1818 * the preloaded css_sets are guaranteed to cover all migrations.
1819 */
1820static void cgroup_migrate_add_src(struct css_set *src_cset,
1821 struct cgroup *dst_cgrp,
1822 struct list_head *preloaded_csets)
1823{
1824 struct cgroup *src_cgrp;
1825
1826 lockdep_assert_held(&cgroup_mutex);
1827 lockdep_assert_held(&css_set_rwsem);
1828
1829 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1830
1831 /* nothing to do if this cset already belongs to the cgroup */
1832 if (src_cgrp == dst_cgrp)
1833 return;
1834
1835 if (!list_empty(&src_cset->mg_preload_node))
1836 return;
1837
1838 WARN_ON(src_cset->mg_src_cgrp);
1839 WARN_ON(!list_empty(&src_cset->mg_tasks));
1840 WARN_ON(!list_empty(&src_cset->mg_node));
1841
1842 src_cset->mg_src_cgrp = src_cgrp;
1843 get_css_set(src_cset);
1844 list_add(&src_cset->mg_preload_node, preloaded_csets);
1845}
1846
1847/**
1848 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1849 * @dst_cgrp: the destination cgroup
1850 * @preloaded_csets: list of preloaded source css_sets
1851 *
1852 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1853 * have been preloaded to @preloaded_csets. This function looks up and
1854 * pins all destination css_sets, links each to its source, and puts them on
1855 * @preloaded_csets.
1856 *
1857 * This function must be called after cgroup_migrate_add_src() has been
1858 * called on each migration source css_set. After migration is performed
1859 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
1860 * @preloaded_csets.
1861 */
1862static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1863 struct list_head *preloaded_csets)
1864{
1865 LIST_HEAD(csets);
1866 struct css_set *src_cset;
1867
1868 lockdep_assert_held(&cgroup_mutex);
1869
1870 /* look up the dst cset for each src cset and link it to src */
1871 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
1872 struct css_set *dst_cset;
1873
1874 dst_cset = find_css_set(src_cset, dst_cgrp);
1875 if (!dst_cset)
1876 goto err;
1877
1878 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
1879 src_cset->mg_dst_cset = dst_cset;
1880
1881 if (list_empty(&dst_cset->mg_preload_node))
1882 list_add(&dst_cset->mg_preload_node, &csets);
1883 else
1884 put_css_set(dst_cset, false);
1885 }
1886
1887 list_splice(&csets, preloaded_csets);
1888 return 0;
1889err:
1890 cgroup_migrate_finish(&csets);
1891 return -ENOMEM;
1892}
1893
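Taken together, the helpers above plus cgroup_migrate() and cgroup_migrate_finish() below form a fixed protocol: preload sources, prepare destinations, migrate, clean up. A compressed sketch of how a caller such as cgroup_attach_task() is expected to string them together (error handling trimmed; cgroup_mutex and, for whole processes, threadgroup_lock are assumed to be held by the caller as the kerneldoc requires):

/* illustrative caller -- roughly the shape cgroup_attach_task() takes */
static int demo_attach_task(struct cgroup *dst_cgrp,
			    struct task_struct *leader, bool threadgroup)
{
	LIST_HEAD(preloaded_csets);
	struct task_struct *task;
	int ret;

	/* 1. pin the source css_set of every task being moved */
	down_read(&css_set_rwsem);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
				       &preloaded_csets);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	up_read(&css_set_rwsem);

	/* 2. look up and pin the matching destination css_sets */
	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);

	/* 3. commit the move; past ->can_attach() this cannot fail */
	if (!ret)
		ret = cgroup_migrate(dst_cgrp, leader, threadgroup);

	/* 4. drop every reference taken in steps 1 and 2 */
	cgroup_migrate_finish(&preloaded_csets);
	return ret;
}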
1894/**
1895 * cgroup_migrate - migrate a process or task to a cgroup
1896 * @cgrp: the destination cgroup
1897 * @leader: the leader of the process or the task to migrate
1898 * @threadgroup: whether @leader points to the whole process or a single task
1899 *
1900 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
1901 * process, the caller must be holding threadgroup_lock of @leader. The
1902 * caller is also responsible for invoking cgroup_migrate_add_src() and
1903 * cgroup_migrate_prepare_dst() on the targets before invoking this
1904 * function and following up with cgroup_migrate_finish().
1905 *
1906 * As long as a controller's ->can_attach() doesn't fail, this function is
1907 * guaranteed to succeed. This means that, excluding ->can_attach()
1908 * failure, when migrating multiple targets, the success or failure can be
1909 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
1910 * actually starting to migrate.
1911 */
1912static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1913 bool threadgroup)
1914{
1915 struct cgroup_taskset tset = {
1916 .src_csets = LIST_HEAD_INIT(tset.src_csets),
1917 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
1918 .csets = &tset.src_csets,
1919 };
1920 struct cgroup_subsys_state *css, *failed_css = NULL;
1921 struct css_set *cset, *tmp_cset;
1922 struct task_struct *task, *tmp_task;
1923 int i, ret;
1995 1924
1996 i = 0;
1997 /* 1925 /*
1998 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1926 * Prevent freeing of tasks while we take a snapshot. Tasks that are
1999 * already PF_EXITING could be freed from underneath us unless we 1927 * already PF_EXITING could be freed from underneath us unless we
2000 * take an rcu_read_lock. 1928 * take an rcu_read_lock.
2001 */ 1929 */
1930 down_write(&css_set_rwsem);
2002 rcu_read_lock(); 1931 rcu_read_lock();
1932 task = leader;
2003 do { 1933 do {
2004 struct task_and_cgroup ent; 1934 /* @task either already exited or can't exit until the end */
1935 if (task->flags & PF_EXITING)
1936 goto next;
2005 1937
2006 /* @tsk either already exited or can't exit until the end */ 1938 /* leave @task alone if post_fork() hasn't linked it yet */
2007 if (tsk->flags & PF_EXITING) 1939 if (list_empty(&task->cg_list))
2008 goto next; 1940 goto next;
2009 1941
2010 /* as per above, nr_threads may decrease, but not increase. */ 1942 cset = task_css_set(task);
2011 BUG_ON(i >= group_size); 1943 if (!cset->mg_src_cgrp)
2012 ent.task = tsk;
2013 ent.cgrp = task_cgroup_from_root(tsk, root);
2014 /* nothing to do if this task is already in the cgroup */
2015 if (ent.cgrp == cgrp)
2016 goto next; 1944 goto next;
1945
2017 /* 1946 /*
2018 * saying GFP_ATOMIC has no effect here because we did prealloc 1947 * cgroup_taskset_first() must always return the leader.
2019 * earlier, but it's good form to communicate our expectations. 1948 * Take care to avoid disturbing the ordering.
2020 */ 1949 */
2021 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 1950 list_move_tail(&task->cg_list, &cset->mg_tasks);
2022 BUG_ON(retval != 0); 1951 if (list_empty(&cset->mg_node))
2023 i++; 1952 list_add_tail(&cset->mg_node, &tset.src_csets);
1953 if (list_empty(&cset->mg_dst_cset->mg_node))
1954 list_move_tail(&cset->mg_dst_cset->mg_node,
1955 &tset.dst_csets);
2024 next: 1956 next:
2025 if (!threadgroup) 1957 if (!threadgroup)
2026 break; 1958 break;
2027 } while_each_thread(leader, tsk); 1959 } while_each_thread(leader, task);
2028 rcu_read_unlock(); 1960 rcu_read_unlock();
2029 /* remember the number of threads in the array for later. */ 1961 up_write(&css_set_rwsem);
2030 group_size = i;
2031 tset.tc_array = group;
2032 tset.tc_array_len = group_size;
2033 1962
2034 /* methods shouldn't be called if no task is actually migrating */ 1963 /* methods shouldn't be called if no task is actually migrating */
2035 retval = 0; 1964 if (list_empty(&tset.src_csets))
2036 if (!group_size) 1965 return 0;
2037 goto out_free_group_list;
2038 1966
2039 /* 1967 /* check that we can legitimately attach to the cgroup */
2040 * step 1: check that we can legitimately attach to the cgroup.
2041 */
2042 for_each_css(css, i, cgrp) { 1968 for_each_css(css, i, cgrp) {
2043 if (css->ss->can_attach) { 1969 if (css->ss->can_attach) {
2044 retval = css->ss->can_attach(css, &tset); 1970 ret = css->ss->can_attach(css, &tset);
2045 if (retval) { 1971 if (ret) {
2046 failed_css = css; 1972 failed_css = css;
2047 goto out_cancel_attach; 1973 goto out_cancel_attach;
2048 } 1974 }
@@ -2050,70 +1976,91 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2050 } 1976 }
2051 1977
2052 /* 1978 /*
2053 * step 2: make sure css_sets exist for all threads to be migrated. 1979 * Now that we're guaranteed success, proceed to move all tasks to
2054 * we use find_css_set, which allocates a new one if necessary. 1980 * the new cgroup. There are no failure cases after here, so this
1981 * is the commit point.
2055 */ 1982 */
2056 for (i = 0; i < group_size; i++) { 1983 down_write(&css_set_rwsem);
2057 struct css_set *old_cset; 1984 list_for_each_entry(cset, &tset.src_csets, mg_node) {
2058 1985 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2059 tc = flex_array_get(group, i); 1986 cgroup_task_migrate(cset->mg_src_cgrp, task,
2060 old_cset = task_css_set(tc->task); 1987 cset->mg_dst_cset);
2061 tc->cset = find_css_set(old_cset, cgrp);
2062 if (!tc->cset) {
2063 retval = -ENOMEM;
2064 goto out_put_css_set_refs;
2065 }
2066 } 1988 }
1989 up_write(&css_set_rwsem);
2067 1990
2068 /* 1991 /*
2069 * step 3: now that we're guaranteed success wrt the css_sets, 1992 * Migration is committed, all target tasks are now on dst_csets.
2070 * proceed to move all tasks to the new cgroup. There are no 1993 * Nothing is sensitive to fork() after this point. Notify
2071 * failure cases after here, so this is the commit point. 1994 * controllers that migration is complete.
2072 */ 1995 */
2073 for (i = 0; i < group_size; i++) { 1996 tset.csets = &tset.dst_csets;
2074 tc = flex_array_get(group, i);
2075 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2076 }
2077 /* nothing is sensitive to fork() after this point. */
2078 1997
2079 /*
2080 * step 4: do subsystem attach callbacks.
2081 */
2082 for_each_css(css, i, cgrp) 1998 for_each_css(css, i, cgrp)
2083 if (css->ss->attach) 1999 if (css->ss->attach)
2084 css->ss->attach(css, &tset); 2000 css->ss->attach(css, &tset);
2085 2001
2086 /* 2002 ret = 0;
2087 * step 5: success! and cleanup 2003 goto out_release_tset;
2088 */ 2004
2089 retval = 0;
2090out_put_css_set_refs:
2091 if (retval) {
2092 for (i = 0; i < group_size; i++) {
2093 tc = flex_array_get(group, i);
2094 if (!tc->cset)
2095 break;
2096 put_css_set(tc->cset);
2097 }
2098 }
2099out_cancel_attach: 2005out_cancel_attach:
2100 if (retval) { 2006 for_each_css(css, i, cgrp) {
2101 for_each_css(css, i, cgrp) { 2007 if (css == failed_css)
2102 if (css == failed_css) 2008 break;
2103 break; 2009 if (css->ss->cancel_attach)
2104 if (css->ss->cancel_attach) 2010 css->ss->cancel_attach(css, &tset);
2105 css->ss->cancel_attach(css, &tset);
2106 }
2107 } 2011 }
2108out_free_group_list: 2012out_release_tset:
2109 flex_array_free(group); 2013 down_write(&css_set_rwsem);
2110 return retval; 2014 list_splice_init(&tset.dst_csets, &tset.src_csets);
2015 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2016 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2017 list_del_init(&cset->mg_node);
2018 }
2019 up_write(&css_set_rwsem);
2020 return ret;
2021}
2022
2023/**
2024 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2025 * @dst_cgrp: the cgroup to attach to
2026 * @leader: the task or the leader of the threadgroup to be attached
2027 * @threadgroup: attach the whole threadgroup?
2028 *
2029 * Call holding cgroup_mutex and threadgroup_lock of @leader.
2030 */
2031static int cgroup_attach_task(struct cgroup *dst_cgrp,
2032 struct task_struct *leader, bool threadgroup)
2033{
2034 LIST_HEAD(preloaded_csets);
2035 struct task_struct *task;
2036 int ret;
2037
2038 /* look up all src csets */
2039 down_read(&css_set_rwsem);
2040 rcu_read_lock();
2041 task = leader;
2042 do {
2043 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2044 &preloaded_csets);
2045 if (!threadgroup)
2046 break;
2047 } while_each_thread(leader, task);
2048 rcu_read_unlock();
2049 up_read(&css_set_rwsem);
2050
2051 /* prepare dst csets and commit */
2052 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2053 if (!ret)
2054 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2055
2056 cgroup_migrate_finish(&preloaded_csets);
2057 return ret;
2111} 2058}
2112 2059
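The ->can_attach()/->cancel_attach()/->attach() callbacks driven by cgroup_migrate() are the only place a controller can refuse a migration; once every ->can_attach() has returned 0 the move is committed. A hypothetical controller would hook in roughly as below. All demo_* names are illustrative, the rest of a real cgroup_subsys (css_alloc/css_free and friends) is omitted, and the callback signatures are simply the ones cgroup_migrate() invokes above.

static int demo_can_attach(struct cgroup_subsys_state *css,
			   struct cgroup_taskset *tset)
{
	/* last chance to veto: reserve resources, return -EBUSY/-EINVAL to refuse */
	return 0;
}

static void demo_cancel_attach(struct cgroup_subsys_state *css,
			       struct cgroup_taskset *tset)
{
	/* another css vetoed the migration; undo what demo_can_attach() reserved */
}

static void demo_attach(struct cgroup_subsys_state *css,
			struct cgroup_taskset *tset)
{
	/* the migration is committed and can no longer fail; charge the tasks */
}

struct cgroup_subsys demo_cgrp_subsys = {
	.can_attach	= demo_can_attach,
	.cancel_attach	= demo_cancel_attach,
	.attach		= demo_attach,
};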
2113/* 2060/*
2114 * Find the task_struct of the task to attach by vpid and pass it along to the 2061 * Find the task_struct of the task to attach by vpid and pass it along to the
2115 * function to attach either it or all tasks in its threadgroup. Will lock 2062 * function to attach either it or all tasks in its threadgroup. Will lock
2116 * cgroup_mutex and threadgroup; may take task_lock of task. 2063 * cgroup_mutex and threadgroup.
2117 */ 2064 */
2118static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2065static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2119{ 2066{
@@ -2198,12 +2145,19 @@ out_unlock_cgroup:
2198 */ 2145 */
2199int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 2146int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2200{ 2147{
2201 struct cgroupfs_root *root; 2148 struct cgroup_root *root;
2202 int retval = 0; 2149 int retval = 0;
2203 2150
2204 mutex_lock(&cgroup_mutex); 2151 mutex_lock(&cgroup_mutex);
2205 for_each_active_root(root) { 2152 for_each_root(root) {
2206 struct cgroup *from_cgrp = task_cgroup_from_root(from, root); 2153 struct cgroup *from_cgrp;
2154
2155 if (root == &cgrp_dfl_root)
2156 continue;
2157
2158 down_read(&css_set_rwsem);
2159 from_cgrp = task_cgroup_from_root(from, root);
2160 up_read(&css_set_rwsem);
2207 2161
2208 retval = cgroup_attach_task(from_cgrp, tsk, false); 2162 retval = cgroup_attach_task(from_cgrp, tsk, false);
2209 if (retval) 2163 if (retval)
@@ -2228,16 +2182,17 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css,
2228} 2182}
2229 2183
2230static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2184static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2231 struct cftype *cft, const char *buffer) 2185 struct cftype *cft, char *buffer)
2232{ 2186{
2233 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); 2187 struct cgroup_root *root = css->cgroup->root;
2234 if (strlen(buffer) >= PATH_MAX) 2188
2235 return -EINVAL; 2189 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
2236 if (!cgroup_lock_live_group(css->cgroup)) 2190 if (!cgroup_lock_live_group(css->cgroup))
2237 return -ENODEV; 2191 return -ENODEV;
2238 mutex_lock(&cgroup_root_mutex); 2192 spin_lock(&release_agent_path_lock);
2239 strcpy(css->cgroup->root->release_agent_path, buffer); 2193 strlcpy(root->release_agent_path, buffer,
2240 mutex_unlock(&cgroup_root_mutex); 2194 sizeof(root->release_agent_path));
2195 spin_unlock(&release_agent_path_lock);
2241 mutex_unlock(&cgroup_mutex); 2196 mutex_unlock(&cgroup_mutex);
2242 return 0; 2197 return 0;
2243} 2198}
@@ -2262,32 +2217,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2262 return 0; 2217 return 0;
2263} 2218}
2264 2219
2265/* A buffer size big enough for numbers or short strings */ 2220static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2266#define CGROUP_LOCAL_BUFFER_SIZE 64 2221 size_t nbytes, loff_t off)
2267
2268static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2269 size_t nbytes, loff_t *ppos)
2270{ 2222{
2271 struct cfent *cfe = __d_cfe(file->f_dentry); 2223 struct cgroup *cgrp = of->kn->parent->priv;
2272 struct cftype *cft = __d_cft(file->f_dentry); 2224 struct cftype *cft = of->kn->priv;
2273 struct cgroup_subsys_state *css = cfe->css; 2225 struct cgroup_subsys_state *css;
2274 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2275 char *buf;
2276 int ret; 2226 int ret;
2277 2227
2278 if (nbytes >= max_bytes) 2228 /*
2279 return -E2BIG; 2229 * kernfs guarantees that a file isn't deleted with operations in
2280 2230 * flight, which means that the matching css is and stays alive and
2281 buf = kmalloc(nbytes + 1, GFP_KERNEL); 2231 * doesn't need to be pinned. The RCU locking is not necessary
2282 if (!buf) 2232 * either. It's just for the convenience of using cgroup_css().
2283 return -ENOMEM; 2233 */
2284 2234 rcu_read_lock();
2285 if (copy_from_user(buf, userbuf, nbytes)) { 2235 css = cgroup_css(cgrp, cft->ss);
2286 ret = -EFAULT; 2236 rcu_read_unlock();
2287 goto out_free;
2288 }
2289
2290 buf[nbytes] = '\0';
2291 2237
2292 if (cft->write_string) { 2238 if (cft->write_string) {
2293 ret = cft->write_string(css, cft, strstrip(buf)); 2239 ret = cft->write_string(css, cft, strstrip(buf));
@@ -2306,53 +2252,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2306 } else { 2252 } else {
2307 ret = -EINVAL; 2253 ret = -EINVAL;
2308 } 2254 }
2309out_free: 2255
2310 kfree(buf);
2311 return ret ?: nbytes; 2256 return ret ?: nbytes;
2312} 2257}
2313 2258
2314/*
2315 * seqfile ops/methods for returning structured data. Currently just
2316 * supports string->u64 maps, but can be extended in future.
2317 */
2318
2319static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2259static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2320{ 2260{
2321 struct cftype *cft = seq_cft(seq); 2261 return seq_cft(seq)->seq_start(seq, ppos);
2322
2323 if (cft->seq_start) {
2324 return cft->seq_start(seq, ppos);
2325 } else {
2326 /*
2327 * The same behavior and code as single_open(). Returns
2328 * !NULL if pos is at the beginning; otherwise, NULL.
2329 */
2330 return NULL + !*ppos;
2331 }
2332} 2262}
2333 2263
2334static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2264static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2335{ 2265{
2336 struct cftype *cft = seq_cft(seq); 2266 return seq_cft(seq)->seq_next(seq, v, ppos);
2337
2338 if (cft->seq_next) {
2339 return cft->seq_next(seq, v, ppos);
2340 } else {
2341 /*
2342 * The same behavior and code as single_open(), always
2343 * terminate after the initial read.
2344 */
2345 ++*ppos;
2346 return NULL;
2347 }
2348} 2267}
2349 2268
2350static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2269static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2351{ 2270{
2352 struct cftype *cft = seq_cft(seq); 2271 seq_cft(seq)->seq_stop(seq, v);
2353
2354 if (cft->seq_stop)
2355 cft->seq_stop(seq, v);
2356} 2272}
2357 2273
2358static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2274static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -2372,96 +2288,35 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2372 return 0; 2288 return 0;
2373} 2289}
2374 2290
2375static struct seq_operations cgroup_seq_operations = { 2291static struct kernfs_ops cgroup_kf_single_ops = {
2376 .start = cgroup_seqfile_start, 2292 .atomic_write_len = PAGE_SIZE,
2377 .next = cgroup_seqfile_next, 2293 .write = cgroup_file_write,
2378 .stop = cgroup_seqfile_stop, 2294 .seq_show = cgroup_seqfile_show,
2379 .show = cgroup_seqfile_show,
2380}; 2295};
2381 2296
2382static int cgroup_file_open(struct inode *inode, struct file *file) 2297static struct kernfs_ops cgroup_kf_ops = {
2383{ 2298 .atomic_write_len = PAGE_SIZE,
2384 struct cfent *cfe = __d_cfe(file->f_dentry); 2299 .write = cgroup_file_write,
2385 struct cftype *cft = __d_cft(file->f_dentry); 2300 .seq_start = cgroup_seqfile_start,
2386 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2301 .seq_next = cgroup_seqfile_next,
2387 struct cgroup_subsys_state *css; 2302 .seq_stop = cgroup_seqfile_stop,
2388 struct cgroup_open_file *of; 2303 .seq_show = cgroup_seqfile_show,
2389 int err; 2304};
2390
2391 err = generic_file_open(inode, file);
2392 if (err)
2393 return err;
2394
2395 /*
2396 * If the file belongs to a subsystem, pin the css. Will be
2397 * unpinned either on open failure or release. This ensures that
2398 * @css stays alive for all file operations.
2399 */
2400 rcu_read_lock();
2401 css = cgroup_css(cgrp, cft->ss);
2402 if (cft->ss && !css_tryget(css))
2403 css = NULL;
2404 rcu_read_unlock();
2405
2406 if (!css)
2407 return -ENODEV;
2408
2409 /*
2410 * @cfe->css is used by read/write/close to determine the
2411 * associated css. @file->private_data would be a better place but
2412 * that's already used by seqfile. Multiple accessors may use it
2413 * simultaneously which is okay as the association never changes.
2414 */
2415 WARN_ON_ONCE(cfe->css && cfe->css != css);
2416 cfe->css = css;
2417
2418 of = __seq_open_private(file, &cgroup_seq_operations,
2419 sizeof(struct cgroup_open_file));
2420 if (of) {
2421 of->cfe = cfe;
2422 return 0;
2423 }
2424
2425 if (css->ss)
2426 css_put(css);
2427 return -ENOMEM;
2428}
2429
2430static int cgroup_file_release(struct inode *inode, struct file *file)
2431{
2432 struct cfent *cfe = __d_cfe(file->f_dentry);
2433 struct cgroup_subsys_state *css = cfe->css;
2434
2435 if (css->ss)
2436 css_put(css);
2437 return seq_release_private(inode, file);
2438}
2439 2305
2440/* 2306/*
2441 * cgroup_rename - Only allow simple rename of directories in place. 2307 * cgroup_rename - Only allow simple rename of directories in place.
2442 */ 2308 */
2443static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2309static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2444 struct inode *new_dir, struct dentry *new_dentry) 2310 const char *new_name_str)
2445{ 2311{
2312 struct cgroup *cgrp = kn->priv;
2446 int ret; 2313 int ret;
2447 struct cgroup_name *name, *old_name;
2448 struct cgroup *cgrp;
2449
2450 /*
2451 * It's convenient to use parent dir's i_mutex to protect
2452 * cgrp->name.
2453 */
2454 lockdep_assert_held(&old_dir->i_mutex);
2455 2314
2456 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2315 if (kernfs_type(kn) != KERNFS_DIR)
2457 return -ENOTDIR; 2316 return -ENOTDIR;
2458 if (new_dentry->d_inode) 2317 if (kn->parent != new_parent)
2459 return -EEXIST;
2460 if (old_dir != new_dir)
2461 return -EIO; 2318 return -EIO;
2462 2319
2463 cgrp = __d_cgrp(old_dentry);
2464
2465 /* 2320 /*
2466 * This isn't a proper migration and its usefulness is very 2321 * This isn't a proper migration and its usefulness is very
2467 * limited. Disallow if sane_behavior. 2322 * limited. Disallow if sane_behavior.
@@ -2469,218 +2324,40 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2469 if (cgroup_sane_behavior(cgrp)) 2324 if (cgroup_sane_behavior(cgrp))
2470 return -EPERM; 2325 return -EPERM;
2471 2326
2472 name = cgroup_alloc_name(new_dentry); 2327 /*
2473 if (!name) 2328 * We're gonna grab cgroup_tree_mutex which nests outside kernfs
2474 return -ENOMEM; 2329 * active_ref. kernfs_rename() doesn't require active_ref
2475 2330 * protection. Break them before grabbing cgroup_tree_mutex.
2476 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2331 */
2477 if (ret) { 2332 kernfs_break_active_protection(new_parent);
2478 kfree(name); 2333 kernfs_break_active_protection(kn);
2479 return ret;
2480 }
2481
2482 old_name = rcu_dereference_protected(cgrp->name, true);
2483 rcu_assign_pointer(cgrp->name, name);
2484
2485 kfree_rcu(old_name, rcu_head);
2486 return 0;
2487}
2488
2489static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2490{
2491 if (S_ISDIR(dentry->d_inode->i_mode))
2492 return &__d_cgrp(dentry)->xattrs;
2493 else
2494 return &__d_cfe(dentry)->xattrs;
2495}
2496
2497static inline int xattr_enabled(struct dentry *dentry)
2498{
2499 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2500 return root->flags & CGRP_ROOT_XATTR;
2501}
2502
2503static bool is_valid_xattr(const char *name)
2504{
2505 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2506 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2507 return true;
2508 return false;
2509}
2510
2511static int cgroup_setxattr(struct dentry *dentry, const char *name,
2512 const void *val, size_t size, int flags)
2513{
2514 if (!xattr_enabled(dentry))
2515 return -EOPNOTSUPP;
2516 if (!is_valid_xattr(name))
2517 return -EINVAL;
2518 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2519}
2520
2521static int cgroup_removexattr(struct dentry *dentry, const char *name)
2522{
2523 if (!xattr_enabled(dentry))
2524 return -EOPNOTSUPP;
2525 if (!is_valid_xattr(name))
2526 return -EINVAL;
2527 return simple_xattr_remove(__d_xattrs(dentry), name);
2528}
2529
2530static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2531 void *buf, size_t size)
2532{
2533 if (!xattr_enabled(dentry))
2534 return -EOPNOTSUPP;
2535 if (!is_valid_xattr(name))
2536 return -EINVAL;
2537 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2538}
2539
2540static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2541{
2542 if (!xattr_enabled(dentry))
2543 return -EOPNOTSUPP;
2544 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2545}
2546
2547static const struct file_operations cgroup_file_operations = {
2548 .read = seq_read,
2549 .write = cgroup_file_write,
2550 .llseek = generic_file_llseek,
2551 .open = cgroup_file_open,
2552 .release = cgroup_file_release,
2553};
2554
2555static const struct inode_operations cgroup_file_inode_operations = {
2556 .setxattr = cgroup_setxattr,
2557 .getxattr = cgroup_getxattr,
2558 .listxattr = cgroup_listxattr,
2559 .removexattr = cgroup_removexattr,
2560};
2561
2562static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = simple_lookup,
2564 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename,
2567 .setxattr = cgroup_setxattr,
2568 .getxattr = cgroup_getxattr,
2569 .listxattr = cgroup_listxattr,
2570 .removexattr = cgroup_removexattr,
2571};
2572
2573static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2574 struct super_block *sb)
2575{
2576 struct inode *inode;
2577
2578 if (!dentry)
2579 return -ENOENT;
2580 if (dentry->d_inode)
2581 return -EEXIST;
2582
2583 inode = cgroup_new_inode(mode, sb);
2584 if (!inode)
2585 return -ENOMEM;
2586
2587 if (S_ISDIR(mode)) {
2588 inode->i_op = &cgroup_dir_inode_operations;
2589 inode->i_fop = &simple_dir_operations;
2590
2591 /* start off with i_nlink == 2 (for "." entry) */
2592 inc_nlink(inode);
2593 inc_nlink(dentry->d_parent->d_inode);
2594
2595 /*
2596 * Control reaches here with cgroup_mutex held.
2597 * @inode->i_mutex should nest outside cgroup_mutex but we
2598 * want to populate it immediately without releasing
2599 * cgroup_mutex. As @inode isn't visible to anyone else
2600 * yet, trylock will always succeed without affecting
2601 * lockdep checks.
2602 */
2603 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2604 } else if (S_ISREG(mode)) {
2605 inode->i_size = 0;
2606 inode->i_fop = &cgroup_file_operations;
2607 inode->i_op = &cgroup_file_inode_operations;
2608 }
2609 d_instantiate(dentry, inode);
2610 dget(dentry); /* Extra count - pin the dentry in core */
2611 return 0;
2612}
2613
2614/**
2615 * cgroup_file_mode - deduce file mode of a control file
2616 * @cft: the control file in question
2617 *
2618 * returns cft->mode if ->mode is not 0
2619 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2620 * returns S_IRUGO if it has only a read handler
2621 * returns S_IWUSR if it has only a write handler
2622 */
2623static umode_t cgroup_file_mode(const struct cftype *cft)
2624{
2625 umode_t mode = 0;
2626 2334
2627 if (cft->mode) 2335 mutex_lock(&cgroup_tree_mutex);
2628 return cft->mode; 2336 mutex_lock(&cgroup_mutex);
2629 2337
2630 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 2338 ret = kernfs_rename(kn, new_parent, new_name_str);
2631 mode |= S_IRUGO;
2632 2339
2633 if (cft->write_u64 || cft->write_s64 || cft->write_string || 2340 mutex_unlock(&cgroup_mutex);
2634 cft->trigger) 2341 mutex_unlock(&cgroup_tree_mutex);
2635 mode |= S_IWUSR;
2636 2342
2637 return mode; 2343 kernfs_unbreak_active_protection(kn);
2344 kernfs_unbreak_active_protection(new_parent);
2345 return ret;
2638} 2346}
2639 2347
2640static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 2348static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2641{ 2349{
2642 struct dentry *dir = cgrp->dentry; 2350 char name[CGROUP_FILE_NAME_MAX];
2643 struct cgroup *parent = __d_cgrp(dir); 2351 struct kernfs_node *kn;
2644 struct dentry *dentry; 2352 struct lock_class_key *key = NULL;
2645 struct cfent *cfe;
2646 int error;
2647 umode_t mode;
2648 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2649
2650 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2651 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2652 strcpy(name, cft->ss->name);
2653 strcat(name, ".");
2654 }
2655 strcat(name, cft->name);
2656
2657 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2658
2659 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2660 if (!cfe)
2661 return -ENOMEM;
2662 2353
2663 dentry = lookup_one_len(name, dir, strlen(name)); 2354#ifdef CONFIG_DEBUG_LOCK_ALLOC
2664 if (IS_ERR(dentry)) { 2355 key = &cft->lockdep_key;
2665 error = PTR_ERR(dentry); 2356#endif
2666 goto out; 2357 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2667 } 2358 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2668 2359 NULL, false, key);
2669 cfe->type = (void *)cft; 2360 return PTR_ERR_OR_ZERO(kn);
2670 cfe->dentry = dentry;
2671 dentry->d_fsdata = cfe;
2672 simple_xattrs_init(&cfe->xattrs);
2673
2674 mode = cgroup_file_mode(cft);
2675 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2676 if (!error) {
2677 list_add_tail(&cfe->node, &parent->files);
2678 cfe = NULL;
2679 }
2680 dput(dentry);
2681out:
2682 kfree(cfe);
2683 return error;
2684} 2361}
2685 2362
2686/** 2363/**
@@ -2700,11 +2377,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2700 struct cftype *cft; 2377 struct cftype *cft;
2701 int ret; 2378 int ret;
2702 2379
2703 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 2380 lockdep_assert_held(&cgroup_tree_mutex);
2704 lockdep_assert_held(&cgroup_mutex);
2705 2381
2706 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2382 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2707 /* does cft->flags tell us to skip this file on @cgrp? */ 2383 /* does cft->flags tell us to skip this file on @cgrp? */
2384 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2385 continue;
2708 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2386 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2709 continue; 2387 continue;
2710 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2388 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
@@ -2726,44 +2404,19 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2726 return 0; 2404 return 0;
2727} 2405}
2728 2406
2729static void cgroup_cfts_prepare(void) 2407static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2730 __acquires(&cgroup_mutex)
2731{
2732 /*
2733 * Thanks to the entanglement with vfs inode locking, we can't walk
2734 * the existing cgroups under cgroup_mutex and create files.
2735 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2736 * lock before calling cgroup_addrm_files().
2737 */
2738 mutex_lock(&cgroup_mutex);
2739}
2740
2741static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2742 __releases(&cgroup_mutex)
2743{ 2408{
2744 LIST_HEAD(pending); 2409 LIST_HEAD(pending);
2745 struct cgroup_subsys *ss = cfts[0].ss; 2410 struct cgroup_subsys *ss = cfts[0].ss;
2746 struct cgroup *root = &ss->root->top_cgroup; 2411 struct cgroup *root = &ss->root->cgrp;
2747 struct super_block *sb = ss->root->sb;
2748 struct dentry *prev = NULL;
2749 struct inode *inode;
2750 struct cgroup_subsys_state *css; 2412 struct cgroup_subsys_state *css;
2751 u64 update_before;
2752 int ret = 0; 2413 int ret = 0;
2753 2414
2754 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2415 lockdep_assert_held(&cgroup_tree_mutex);
2755 if (!cfts || ss->root == &cgroup_dummy_root ||
2756 !atomic_inc_not_zero(&sb->s_active)) {
2757 mutex_unlock(&cgroup_mutex);
2758 return 0;
2759 }
2760 2416
2761 /* 2417 /* don't bother if @ss isn't attached */
2762 * All cgroups which are created after we drop cgroup_mutex will 2418 if (ss->root == &cgrp_dfl_root)
2763 * have the updated set of files, so we only need to update the 2419 return 0;
2764 * cgroups created before the current @cgroup_serial_nr_next.
2765 */
2766 update_before = cgroup_serial_nr_next;
2767 2420
2768 /* add/rm files for all cgroups created before */ 2421 /* add/rm files for all cgroups created before */
2769 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2422 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2772,62 +2425,75 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2772 if (cgroup_is_dead(cgrp)) 2425 if (cgroup_is_dead(cgrp))
2773 continue; 2426 continue;
2774 2427
2775 inode = cgrp->dentry->d_inode; 2428 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2776 dget(cgrp->dentry);
2777 dput(prev);
2778 prev = cgrp->dentry;
2779
2780 mutex_unlock(&cgroup_mutex);
2781 mutex_lock(&inode->i_mutex);
2782 mutex_lock(&cgroup_mutex);
2783 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2784 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2785 mutex_unlock(&inode->i_mutex);
2786 if (ret) 2429 if (ret)
2787 break; 2430 break;
2788 } 2431 }
2789 mutex_unlock(&cgroup_mutex); 2432
2790 dput(prev); 2433 if (is_add && !ret)
2791 deactivate_super(sb); 2434 kernfs_activate(root->kn);
2792 return ret; 2435 return ret;
2793} 2436}
2794 2437
2795/** 2438static void cgroup_exit_cftypes(struct cftype *cfts)
2796 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2797 * @ss: target cgroup subsystem
2798 * @cfts: zero-length name terminated array of cftypes
2799 *
2800 * Register @cfts to @ss. Files described by @cfts are created for all
2801 * existing cgroups to which @ss is attached and all future cgroups will
2802 * have them too. This function can be called anytime whether @ss is
2803 * attached or not.
2804 *
2805 * Returns 0 on successful registration, -errno on failure. Note that this
2806 * function currently returns 0 as long as @cfts registration is successful
2807 * even if some file creation attempts on existing cgroups fail.
2808 */
2809int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2810{ 2439{
2811 struct cftype_set *set;
2812 struct cftype *cft; 2440 struct cftype *cft;
2813 int ret;
2814 2441
2815 set = kzalloc(sizeof(*set), GFP_KERNEL); 2442 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2816 if (!set) 2443 /* free copy for custom atomic_write_len, see cgroup_init_cftypes() */
2817 return -ENOMEM; 2444 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2445 kfree(cft->kf_ops);
2446 cft->kf_ops = NULL;
2447 cft->ss = NULL;
2448 }
2449}
2818 2450
2819 for (cft = cfts; cft->name[0] != '\0'; cft++) 2451static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2452{
2453 struct cftype *cft;
2454
2455 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2456 struct kernfs_ops *kf_ops;
2457
2458 WARN_ON(cft->ss || cft->kf_ops);
2459
2460 if (cft->seq_start)
2461 kf_ops = &cgroup_kf_ops;
2462 else
2463 kf_ops = &cgroup_kf_single_ops;
2464
2465 /*
2466 * Ugh... if @cft wants a custom max_write_len, we need to
2467 * make a copy of kf_ops to set its atomic_write_len.
2468 */
2469 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
2470 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
2471 if (!kf_ops) {
2472 cgroup_exit_cftypes(cfts);
2473 return -ENOMEM;
2474 }
2475 kf_ops->atomic_write_len = cft->max_write_len;
2476 }
2477
2478 cft->kf_ops = kf_ops;
2820 cft->ss = ss; 2479 cft->ss = ss;
2480 }
2821 2481
2822 cgroup_cfts_prepare(); 2482 return 0;
2823 set->cfts = cfts; 2483}
2824 list_add_tail(&set->node, &ss->cftsets); 2484
2825 ret = cgroup_cfts_commit(cfts, true); 2485static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2826 if (ret) 2486{
2827 cgroup_rm_cftypes(cfts); 2487 lockdep_assert_held(&cgroup_tree_mutex);
2828 return ret; 2488
2489 if (!cfts || !cfts[0].ss)
2490 return -ENOENT;
2491
2492 list_del(&cfts->node);
2493 cgroup_apply_cftypes(cfts, false);
2494 cgroup_exit_cftypes(cfts);
2495 return 0;
2829} 2496}
2830EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2831 2497
2832/** 2498/**
2833 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2499 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
@@ -2842,24 +2508,48 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2842 */ 2508 */
2843int cgroup_rm_cftypes(struct cftype *cfts) 2509int cgroup_rm_cftypes(struct cftype *cfts)
2844{ 2510{
2845 struct cftype_set *set; 2511 int ret;
2846 2512
2847 if (!cfts || !cfts[0].ss) 2513 mutex_lock(&cgroup_tree_mutex);
2848 return -ENOENT; 2514 ret = cgroup_rm_cftypes_locked(cfts);
2515 mutex_unlock(&cgroup_tree_mutex);
2516 return ret;
2517}
2849 2518
2850 cgroup_cfts_prepare(); 2519/**
2520 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2521 * @ss: target cgroup subsystem
2522 * @cfts: zero-length name terminated array of cftypes
2523 *
2524 * Register @cfts to @ss. Files described by @cfts are created for all
2525 * existing cgroups to which @ss is attached and all future cgroups will
2526 * have them too. This function can be called anytime whether @ss is
2527 * attached or not.
2528 *
2529 * Returns 0 on successful registration, -errno on failure. Note that this
2530 * function currently returns 0 as long as @cfts registration is successful
2531 * even if some file creation attempts on existing cgroups fail.
2532 */
2533int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2534{
2535 int ret;
2851 2536
2852 list_for_each_entry(set, &cfts[0].ss->cftsets, node) { 2537 if (!cfts || cfts[0].name[0] == '\0')
2853 if (set->cfts == cfts) { 2538 return 0;
2854 list_del(&set->node); 2539
2855 kfree(set); 2540 ret = cgroup_init_cftypes(ss, cfts);
2856 cgroup_cfts_commit(cfts, false); 2541 if (ret)
2857 return 0; 2542 return ret;
2858 } 2543
2859 } 2544 mutex_lock(&cgroup_tree_mutex);
2860 2545
2861 cgroup_cfts_commit(NULL, false); 2546 list_add_tail(&cfts->node, &ss->cfts);
2862 return -ENOENT; 2547 ret = cgroup_apply_cftypes(cfts, true);
2548 if (ret)
2549 cgroup_rm_cftypes_locked(cfts);
2550
2551 mutex_unlock(&cgroup_tree_mutex);
2552 return ret;
2863} 2553}
2864 2554
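From a controller's side, file registration now reduces to a zero-terminated cftype array handed to cgroup_add_cftypes(); the matching kernfs_ops are picked by cgroup_init_cftypes() above. A hedged sketch with a made-up file and handlers, reusing the hypothetical demo_cgrp_subsys from the earlier sketch; the read_u64/write_u64 signatures follow the existing handlers in this file.

static u64 demo_limit_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return 0;	/* would report the controller's current limit */
}

static int demo_limit_write(struct cgroup_subsys_state *css,
			    struct cftype *cft, u64 val)
{
	return 0;	/* would validate and store @val */
}

static struct cftype demo_files[] = {
	{
		.name		= "limit",
		.read_u64	= demo_limit_read,
		.write_u64	= demo_limit_write,
	},
	{ }	/* terminate */
};

static int __init demo_files_init(void)
{
	/* creates "demo.limit" in every existing and future cgroup of the ss */
	return cgroup_add_cftypes(&demo_cgrp_subsys, demo_files);
}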
2865/** 2555/**
@@ -2868,57 +2558,18 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2868 * 2558 *
2869 * Return the number of tasks in the cgroup. 2559 * Return the number of tasks in the cgroup.
2870 */ 2560 */
2871int cgroup_task_count(const struct cgroup *cgrp) 2561static int cgroup_task_count(const struct cgroup *cgrp)
2872{ 2562{
2873 int count = 0; 2563 int count = 0;
2874 struct cgrp_cset_link *link; 2564 struct cgrp_cset_link *link;
2875 2565
2876 read_lock(&css_set_lock); 2566 down_read(&css_set_rwsem);
2877 list_for_each_entry(link, &cgrp->cset_links, cset_link) 2567 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2878 count += atomic_read(&link->cset->refcount); 2568 count += atomic_read(&link->cset->refcount);
2879 read_unlock(&css_set_lock); 2569 up_read(&css_set_rwsem);
2880 return count; 2570 return count;
2881} 2571}
2882 2572
2883/*
2884 * To reduce the fork() overhead for systems that are not actually using
2885 * their cgroups capability, we don't maintain the lists running through
2886 * each css_set to its tasks until we see the list actually used - in other
2887 * words after the first call to css_task_iter_start().
2888 */
2889static void cgroup_enable_task_cg_lists(void)
2890{
2891 struct task_struct *p, *g;
2892 write_lock(&css_set_lock);
2893 use_task_css_set_links = 1;
2894 /*
2895 * We need tasklist_lock because RCU is not safe against
2896 * while_each_thread(). Besides, a forking task that has passed
2897 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2898 * is not guaranteed to have its child immediately visible in the
2899 * tasklist if we walk through it with RCU.
2900 */
2901 read_lock(&tasklist_lock);
2902 do_each_thread(g, p) {
2903 task_lock(p);
2904 /*
2905 * We should check if the process is exiting, otherwise
2906 * it will race with cgroup_exit() in that the list
2907 * entry won't be deleted though the process has exited.
2908 * Do it while holding siglock so that we don't end up
2909 * racing against cgroup_exit().
2910 */
2911 spin_lock_irq(&p->sighand->siglock);
2912 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2913 list_add(&p->cg_list, &task_css_set(p)->tasks);
2914 spin_unlock_irq(&p->sighand->siglock);
2915
2916 task_unlock(p);
2917 } while_each_thread(g, p);
2918 read_unlock(&tasklist_lock);
2919 write_unlock(&css_set_lock);
2920}
2921
2922/** 2573/**
2923 * css_next_child - find the next child of a given css 2574 * css_next_child - find the next child of a given css
2924 * @pos_css: the current position (%NULL to initiate traversal) 2575 * @pos_css: the current position (%NULL to initiate traversal)
@@ -2937,7 +2588,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2937 struct cgroup *cgrp = parent_css->cgroup; 2588 struct cgroup *cgrp = parent_css->cgroup;
2938 struct cgroup *next; 2589 struct cgroup *next;
2939 2590
2940 cgroup_assert_mutex_or_rcu_locked(); 2591 cgroup_assert_mutexes_or_rcu_locked();
2941 2592
2942 /* 2593 /*
2943 * @pos could already have been removed. Once a cgroup is removed, 2594 * @pos could already have been removed. Once a cgroup is removed,
@@ -2973,7 +2624,6 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2973 2624
2974 return cgroup_css(next, parent_css->ss); 2625 return cgroup_css(next, parent_css->ss);
2975} 2626}
2976EXPORT_SYMBOL_GPL(css_next_child);
2977 2627
2978/** 2628/**
2979 * css_next_descendant_pre - find the next descendant for pre-order walk 2629 * css_next_descendant_pre - find the next descendant for pre-order walk
@@ -2995,7 +2645,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2995{ 2645{
2996 struct cgroup_subsys_state *next; 2646 struct cgroup_subsys_state *next;
2997 2647
2998 cgroup_assert_mutex_or_rcu_locked(); 2648 cgroup_assert_mutexes_or_rcu_locked();
2999 2649
3000 /* if first iteration, visit @root */ 2650 /* if first iteration, visit @root */
3001 if (!pos) 2651 if (!pos)
@@ -3016,7 +2666,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3016 2666
3017 return NULL; 2667 return NULL;
3018} 2668}
3019EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3020 2669
3021/** 2670/**
3022 * css_rightmost_descendant - return the rightmost descendant of a css 2671 * css_rightmost_descendant - return the rightmost descendant of a css
@@ -3036,7 +2685,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3036{ 2685{
3037 struct cgroup_subsys_state *last, *tmp; 2686 struct cgroup_subsys_state *last, *tmp;
3038 2687
3039 cgroup_assert_mutex_or_rcu_locked(); 2688 cgroup_assert_mutexes_or_rcu_locked();
3040 2689
3041 do { 2690 do {
3042 last = pos; 2691 last = pos;
@@ -3048,7 +2697,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3048 2697
3049 return last; 2698 return last;
3050} 2699}
3051EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3052 2700
3053static struct cgroup_subsys_state * 2701static struct cgroup_subsys_state *
3054css_leftmost_descendant(struct cgroup_subsys_state *pos) 2702css_leftmost_descendant(struct cgroup_subsys_state *pos)
@@ -3084,7 +2732,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3084{ 2732{
3085 struct cgroup_subsys_state *next; 2733 struct cgroup_subsys_state *next;
3086 2734
3087 cgroup_assert_mutex_or_rcu_locked(); 2735 cgroup_assert_mutexes_or_rcu_locked();
3088 2736
3089 /* if first iteration, visit leftmost descendant which may be @root */ 2737 /* if first iteration, visit leftmost descendant which may be @root */
3090 if (!pos) 2738 if (!pos)
@@ -3102,7 +2750,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3102 /* no sibling left, visit parent */ 2750 /* no sibling left, visit parent */
3103 return css_parent(pos); 2751 return css_parent(pos);
3104} 2752}
3105EXPORT_SYMBOL_GPL(css_next_descendant_post);
3106 2753
3107/** 2754/**
3108 * css_advance_task_iter - advance a task iterator to the next css_set 2755
@@ -3125,9 +2772,14 @@ static void css_advance_task_iter(struct css_task_iter *it)
3125 } 2772 }
3126 link = list_entry(l, struct cgrp_cset_link, cset_link); 2773 link = list_entry(l, struct cgrp_cset_link, cset_link);
3127 cset = link->cset; 2774 cset = link->cset;
3128 } while (list_empty(&cset->tasks)); 2775 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2776
3129 it->cset_link = l; 2777 it->cset_link = l;
3130 it->task = cset->tasks.next; 2778
2779 if (!list_empty(&cset->tasks))
2780 it->task = cset->tasks.next;
2781 else
2782 it->task = cset->mg_tasks.next;
3131} 2783}
3132 2784
3133/** 2785/**
@@ -3146,17 +2798,12 @@ static void css_advance_task_iter(struct css_task_iter *it)
3146 */ 2798 */
3147void css_task_iter_start(struct cgroup_subsys_state *css, 2799void css_task_iter_start(struct cgroup_subsys_state *css,
3148 struct css_task_iter *it) 2800 struct css_task_iter *it)
3149 __acquires(css_set_lock) 2801 __acquires(css_set_rwsem)
3150{ 2802{
3151 /* 2803 /* no one should try to iterate before mounting cgroups */
3152 * The first time anyone tries to iterate across a css, we need to 2804 WARN_ON_ONCE(!use_task_css_set_links);
3153 * enable the list linking each css_set to its tasks, and fix up
3154 * all existing tasks.
3155 */
3156 if (!use_task_css_set_links)
3157 cgroup_enable_task_cg_lists();
3158 2805
3159 read_lock(&css_set_lock); 2806 down_read(&css_set_rwsem);
3160 2807
3161 it->origin_css = css; 2808 it->origin_css = css;
3162 it->cset_link = &css->cgroup->cset_links; 2809 it->cset_link = &css->cgroup->cset_links;
@@ -3176,24 +2823,29 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3176{ 2823{
3177 struct task_struct *res; 2824 struct task_struct *res;
3178 struct list_head *l = it->task; 2825 struct list_head *l = it->task;
3179 struct cgrp_cset_link *link; 2826 struct cgrp_cset_link *link = list_entry(it->cset_link,
2827 struct cgrp_cset_link, cset_link);
3180 2828
3181 /* If the iterator cg is NULL, we have no tasks */ 2829 /* If the iterator cg is NULL, we have no tasks */
3182 if (!it->cset_link) 2830 if (!it->cset_link)
3183 return NULL; 2831 return NULL;
3184 res = list_entry(l, struct task_struct, cg_list); 2832 res = list_entry(l, struct task_struct, cg_list);
3185 /* Advance iterator to find next entry */ 2833
2834 /*
2835 * Advance iterator to find next entry. cset->tasks is consumed
2836 * first and then ->mg_tasks. After ->mg_tasks, we move on to the
2837 * next cset.
2838 */
3186 l = l->next; 2839 l = l->next;
3187 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 2840
3188 if (l == &link->cset->tasks) { 2841 if (l == &link->cset->tasks)
3189 /* 2842 l = link->cset->mg_tasks.next;
3190 * We reached the end of this task list - move on to the 2843
3191 * next cgrp_cset_link. 2844 if (l == &link->cset->mg_tasks)
3192 */
3193 css_advance_task_iter(it); 2845 css_advance_task_iter(it);
3194 } else { 2846 else
3195 it->task = l; 2847 it->task = l;
3196 } 2848
3197 return res; 2849 return res;
3198} 2850}
3199 2851
@@ -3204,191 +2856,62 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3204 * Finish task iteration started by css_task_iter_start(). 2856 * Finish task iteration started by css_task_iter_start().
3205 */ 2857 */
3206void css_task_iter_end(struct css_task_iter *it) 2858void css_task_iter_end(struct css_task_iter *it)
3207 __releases(css_set_lock) 2859 __releases(css_set_rwsem)
3208{
3209 read_unlock(&css_set_lock);
3210}
3211
3212static inline int started_after_time(struct task_struct *t1,
3213 struct timespec *time,
3214 struct task_struct *t2)
3215{
3216 int start_diff = timespec_compare(&t1->start_time, time);
3217 if (start_diff > 0) {
3218 return 1;
3219 } else if (start_diff < 0) {
3220 return 0;
3221 } else {
3222 /*
3223 * Arbitrarily, if two processes started at the same
3224 * time, we'll say that the lower pointer value
3225 * started first. Note that t2 may have exited by now
3226 * so this may not be a valid pointer any longer, but
3227 * that's fine - it still serves to distinguish
3228 * between two tasks started (effectively) simultaneously.
3229 */
3230 return t1 > t2;
3231 }
3232}
3233
3234/*
3235 * This function is a callback from heap_insert() and is used to order
3236 * the heap.
3237 * In this case we order the heap in descending task start time.
3238 */
3239static inline int started_after(void *p1, void *p2)
3240{ 2860{
3241 struct task_struct *t1 = p1; 2861 up_read(&css_set_rwsem);
3242 struct task_struct *t2 = p2;
3243 return started_after_time(t1, &t2->start_time, t2);
3244} 2862}
3245 2863
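With css_scan_tasks() removed below, css_task_iter_start()/css_task_iter_next()/css_task_iter_end() is the interface for walking the tasks of a css. A minimal illustrative helper (the name is made up); the iterator read-holds css_set_rwsem for the whole walk, so the loop body must not sleep.

static int demo_count_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int count = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		count++;		/* non-sleeping work only */
	css_task_iter_end(&it);

	return count;
}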
3246/** 2864/**
3247 * css_scan_tasks - iterate through all the tasks in a css 2865 * cgroup_transfer_tasks - move tasks from one cgroup to another
3248 * @css: the css to iterate tasks of 2866 * @to: cgroup to which the tasks will be moved
3249 * @test: optional test callback 2867 * @from: cgroup in which the tasks currently reside
3250 * @process: process callback
3251 * @data: data passed to @test and @process
3252 * @heap: optional pre-allocated heap used for task iteration
3253 *
3254 * Iterate through all the tasks in @css, calling @test for each, and if it
3255 * returns %true, call @process for it also.
3256 *
3257 * @test may be NULL, meaning always true (select all tasks), which
3258 * effectively duplicates css_task_iter_{start,next,end}() but does not
3259 * lock css_set_lock for the call to @process.
3260 *
3261 * It is guaranteed that @process will act on every task that is a member
3262 * of @css for the duration of this call. This function may or may not
3263 * call @process for tasks that exit or move to a different css during the
3264 * call, or are forked or move into the css during the call.
3265 *
3266 * Note that @test may be called with locks held, and may in some
3267 * situations be called multiple times for the same task, so it should be
3268 * cheap.
3269 * 2868 *
3270 * If @heap is non-NULL, a heap has been pre-allocated and will be used for 2869 * Locking rules between cgroup_post_fork() and the migration path
3271 * heap operations (and its "gt" member will be overwritten), else a 2870 * guarantee that, if a task is forking while being migrated, the new child
3272 * temporary heap will be used (allocation of which may cause this function 2871 * is either visible in the source cgroup after the
3273 * to fail). 2872 * parent's migration is complete or put into the target cgroup. No task
2873 * can slip out of migration through forking.
3274 */ 2874 */
3275int css_scan_tasks(struct cgroup_subsys_state *css, 2875int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3276 bool (*test)(struct task_struct *, void *),
3277 void (*process)(struct task_struct *, void *),
3278 void *data, struct ptr_heap *heap)
3279{ 2876{
3280 int retval, i; 2877 LIST_HEAD(preloaded_csets);
2878 struct cgrp_cset_link *link;
3281 struct css_task_iter it; 2879 struct css_task_iter it;
3282 struct task_struct *p, *dropped; 2880 struct task_struct *task;
3283 /* Never dereference latest_task, since it's not refcounted */ 2881 int ret;
3284 struct task_struct *latest_task = NULL;
3285 struct ptr_heap tmp_heap;
3286 struct timespec latest_time = { 0, 0 };
3287
3288 if (heap) {
3289 /* The caller supplied our heap and pre-allocated its memory */
3290 heap->gt = &started_after;
3291 } else {
3292 /* We need to allocate our own heap memory */
3293 heap = &tmp_heap;
3294 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
3295 if (retval)
3296 /* cannot allocate the heap */
3297 return retval;
3298 }
3299 2882
3300 again: 2883 mutex_lock(&cgroup_mutex);
3301 /*
3302 * Scan tasks in the css, using the @test callback to determine
3303 * which are of interest, and invoking @process callback on the
3304 * ones which need an update. Since we don't want to hold any
3305 * locks during the task updates, gather tasks to be processed in a
3306 * heap structure. The heap is sorted by descending task start
3307 * time. If the statically-sized heap fills up, we overflow tasks
3308 * that started later, and in future iterations only consider tasks
3309 * that started after the latest task in the previous pass. This
3310 * guarantees forward progress and that we don't miss any tasks.
3311 */
3312 heap->size = 0;
3313 css_task_iter_start(css, &it);
3314 while ((p = css_task_iter_next(&it))) {
3315 /*
3316 * Only affect tasks that qualify per the caller's callback,
3317 * if he provided one
3318 */
3319 if (test && !test(p, data))
3320 continue;
3321 /*
3322 * Only process tasks that started after the last task
3323 * we processed
3324 */
3325 if (!started_after_time(p, &latest_time, latest_task))
3326 continue;
3327 dropped = heap_insert(heap, p);
3328 if (dropped == NULL) {
3329 /*
3330 * The new task was inserted; the heap wasn't
3331 * previously full
3332 */
3333 get_task_struct(p);
3334 } else if (dropped != p) {
3335 /*
3336 * The new task was inserted, and pushed out a
3337 * different task
3338 */
3339 get_task_struct(p);
3340 put_task_struct(dropped);
3341 }
3342 /*
3343 * Else the new task was newer than anything already in
3344 * the heap and wasn't inserted
3345 */
3346 }
3347 css_task_iter_end(&it);
3348 2884
3349 if (heap->size) { 2885 /* all tasks in @from are being moved, all csets are source */
3350 for (i = 0; i < heap->size; i++) { 2886 down_read(&css_set_rwsem);
3351 struct task_struct *q = heap->ptrs[i]; 2887 list_for_each_entry(link, &from->cset_links, cset_link)
3352 if (i == 0) { 2888 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3353 latest_time = q->start_time; 2889 up_read(&css_set_rwsem);
3354 latest_task = q;
3355 }
3356 /* Process the task per the caller's callback */
3357 process(q, data);
3358 put_task_struct(q);
3359 }
3360 /*
3361 * If we had to process any tasks at all, scan again
3362 * in case some of them were in the middle of forking
3363 * children that didn't get processed.
3364 * Not the most efficient way to do it, but it avoids
3365 * having to take callback_mutex in the fork path
3366 */
3367 goto again;
3368 }
3369 if (heap == &tmp_heap)
3370 heap_free(&tmp_heap);
3371 return 0;
3372}
3373 2890
3374static void cgroup_transfer_one_task(struct task_struct *task, void *data) 2891 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3375{ 2892 if (ret)
3376 struct cgroup *new_cgroup = data; 2893 goto out_err;
3377 2894
3378 mutex_lock(&cgroup_mutex); 2895 /*
3379 cgroup_attach_task(new_cgroup, task, false); 2896 * Migrate tasks one-by-one until @from is empty. This fails iff
2897 * ->can_attach() fails.
2898 */
2899 do {
2900 css_task_iter_start(&from->dummy_css, &it);
2901 task = css_task_iter_next(&it);
2902 if (task)
2903 get_task_struct(task);
2904 css_task_iter_end(&it);
2905
2906 if (task) {
2907 ret = cgroup_migrate(to, task, false);
2908 put_task_struct(task);
2909 }
2910 } while (task && !ret);
2911out_err:
2912 cgroup_migrate_finish(&preloaded_csets);
3380 mutex_unlock(&cgroup_mutex); 2913 mutex_unlock(&cgroup_mutex);
3381} 2914 return ret;
3382
3383/**
3384 * cgroup_transfer_tasks - move tasks from one cgroup to another
3385 * @to: cgroup to which the tasks will be moved
3386 * @from: cgroup in which the tasks currently reside
3387 */
3388int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3389{
3390 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3391 to, NULL);
3392} 2915}
3393 2916
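A hedged usage sketch for the rewritten helper above; the function and cgroup pointers are hypothetical, mirroring the kind of draining cpuset's hotplug path relies on.

static void demo_drain_cgroup(struct cgroup *child_cgrp,
			      struct cgroup *parent_cgrp)
{
	int ret;

	/* move every task out of @child_cgrp into its parent; this can
	 * only fail if some controller's ->can_attach() refuses a task */
	ret = cgroup_transfer_tasks(parent_cgrp, child_cgrp);
	if (ret)
		pr_warn("cgroup: failed to drain tasks (%d)\n", ret);
}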
3394/* 2917/*
@@ -3687,21 +3210,31 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3687 */ 3210 */
3688int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3211int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3689{ 3212{
3690 int ret = -EINVAL; 3213 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3691 struct cgroup *cgrp; 3214 struct cgroup *cgrp;
3692 struct css_task_iter it; 3215 struct css_task_iter it;
3693 struct task_struct *tsk; 3216 struct task_struct *tsk;
3694 3217
3218 /* it should be a kernfs_node belonging to cgroupfs and a directory */
3219 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3220 kernfs_type(kn) != KERNFS_DIR)
3221 return -EINVAL;
3222
3223 mutex_lock(&cgroup_mutex);
3224
3695 /* 3225 /*
3696 * Validate dentry by checking the superblock operations, 3226 * We aren't being called from kernfs and there's no guarantee on
3697 * and make sure it's a directory. 3227 * @kn->priv's validity. For this and css_tryget_from_dir(),
3228 * @kn->priv is RCU safe. Let's do the RCU dancing.
3698 */ 3229 */
3699 if (dentry->d_sb->s_op != &cgroup_ops || 3230 rcu_read_lock();
3700 !S_ISDIR(dentry->d_inode->i_mode)) 3231 cgrp = rcu_dereference(kn->priv);
3701 goto err; 3232 if (!cgrp || cgroup_is_dead(cgrp)) {
3702 3233 rcu_read_unlock();
3703 ret = 0; 3234 mutex_unlock(&cgroup_mutex);
3704 cgrp = dentry->d_fsdata; 3235 return -ENOENT;
3236 }
3237 rcu_read_unlock();
3705 3238
3706 css_task_iter_start(&cgrp->dummy_css, &it); 3239 css_task_iter_start(&cgrp->dummy_css, &it);
3707 while ((tsk = css_task_iter_next(&it))) { 3240 while ((tsk = css_task_iter_next(&it))) {
@@ -3726,8 +3259,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3726 } 3259 }
3727 css_task_iter_end(&it); 3260 css_task_iter_end(&it);
3728 3261
3729err: 3262 mutex_unlock(&cgroup_mutex);
3730 return ret; 3263 return 0;
3731} 3264}
3732 3265
3733 3266
@@ -3745,7 +3278,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3745 * after a seek to the start). Use a binary-search to find the 3278 * after a seek to the start). Use a binary-search to find the
3746 * next pid to display, if any 3279 * next pid to display, if any
3747 */ 3280 */
3748 struct cgroup_open_file *of = s->private; 3281 struct kernfs_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup; 3282 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l; 3283 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private; 3284 enum cgroup_filetype type = seq_cft(s)->private;
@@ -3800,7 +3333,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3800 3333
3801static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3334static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3802{ 3335{
3803 struct cgroup_open_file *of = s->private; 3336 struct kernfs_open_file *of = s->private;
3804 struct cgroup_pidlist *l = of->priv; 3337 struct cgroup_pidlist *l = of->priv;
3805 3338
3806 if (l) 3339 if (l)
@@ -3811,7 +3344,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3811 3344
3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3345static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3813{ 3346{
3814 struct cgroup_open_file *of = s->private; 3347 struct kernfs_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv; 3348 struct cgroup_pidlist *l = of->priv;
3816 pid_t *p = v; 3349 pid_t *p = v;
3817 pid_t *end = l->list + l->length; 3350 pid_t *end = l->list + l->length;
@@ -3861,23 +3394,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 return 0; 3394 return 0;
3862} 3395}
3863 3396
3864/*
3865 * When dput() is called asynchronously, if umount has been done and
3866 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3867 * there's a small window that vfs will see the root dentry with non-zero
3868 * refcnt and trigger BUG().
3869 *
3870 * That's why we hold a reference before dput() and drop it right after.
3871 */
3872static void cgroup_dput(struct cgroup *cgrp)
3873{
3874 struct super_block *sb = cgrp->root->sb;
3875
3876 atomic_inc(&sb->s_active);
3877 dput(cgrp->dentry);
3878 deactivate_super(sb);
3879}
3880
3881static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3397static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3882 struct cftype *cft) 3398 struct cftype *cft)
3883{ 3399{
@@ -3944,7 +3460,7 @@ static struct cftype cgroup_base_files[] = {
3944 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3460 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3945 .seq_show = cgroup_release_agent_show, 3461 .seq_show = cgroup_release_agent_show,
3946 .write_string = cgroup_release_agent_write, 3462 .write_string = cgroup_release_agent_write,
3947 .max_write_len = PATH_MAX, 3463 .max_write_len = PATH_MAX - 1,
3948 }, 3464 },
3949 { } /* terminate */ 3465 { } /* terminate */
3950}; 3466};
@@ -3963,13 +3479,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3963 3479
3964 /* process cftsets of each subsystem */ 3480 /* process cftsets of each subsystem */
3965 for_each_subsys(ss, i) { 3481 for_each_subsys(ss, i) {
3966 struct cftype_set *set; 3482 struct cftype *cfts;
3967 3483
3968 if (!test_bit(i, &subsys_mask)) 3484 if (!test_bit(i, &subsys_mask))
3969 continue; 3485 continue;
3970 3486
3971 list_for_each_entry(set, &ss->cftsets, node) { 3487 list_for_each_entry(cfts, &ss->cfts, node) {
3972 ret = cgroup_addrm_files(cgrp, set->cfts, true); 3488 ret = cgroup_addrm_files(cgrp, cfts, true);
3973 if (ret < 0) 3489 if (ret < 0)
3974 goto err; 3490 goto err;
3975 } 3491 }
@@ -4012,7 +3528,7 @@ static void css_free_work_fn(struct work_struct *work)
4012 css_put(css->parent); 3528 css_put(css->parent);
4013 3529
4014 css->ss->css_free(css); 3530 css->ss->css_free(css);
4015 cgroup_dput(cgrp); 3531 cgroup_put(cgrp);
4016} 3532}
4017 3533
4018static void css_free_rcu_fn(struct rcu_head *rcu_head) 3534static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -4020,10 +3536,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
4020 struct cgroup_subsys_state *css = 3536 struct cgroup_subsys_state *css =
4021 container_of(rcu_head, struct cgroup_subsys_state, rcu_head); 3537 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4022 3538
4023 /*
4024 * css holds an extra ref to @cgrp->dentry which is put on the last
4025 * css_put(). dput() requires process context which we don't have.
4026 */
4027 INIT_WORK(&css->destroy_work, css_free_work_fn); 3539 INIT_WORK(&css->destroy_work, css_free_work_fn);
4028 queue_work(cgroup_destroy_wq, &css->destroy_work); 3540 queue_work(cgroup_destroy_wq, &css->destroy_work);
4029} 3541}
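
The comment removed from css_free_rcu_fn() pointed out that dput() needs process context, which an RCU callback does not have; the new code keeps the same shape and simply bounces the heavy teardown to cgroup_destroy_wq via INIT_WORK()/queue_work(). Below is a rough userspace analogue, with a single pthread standing in for the workqueue; all names are invented.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work {
	void (*fn)(struct work *w);
	struct work *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static struct work *head;
static int stopping;

/* "queue_work()": callable from a context that must not block for long */
static void queue_work(struct work *w)
{
	pthread_mutex_lock(&lock);
	w->next = head;
	head = w;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

/* the "workqueue": runs the deferred, possibly-sleeping part */
static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!stopping || head) {
		while (!head && !stopping)
			pthread_cond_wait(&cond, &lock);
		while (head) {
			struct work *w = head;

			head = w->next;
			pthread_mutex_unlock(&lock);
			w->fn(w);		/* "process context" work */
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void free_object(struct work *w)
{
	printf("heavy cleanup done in worker\n");
	free(w);
}

int main(void)
{
	pthread_t tid;
	struct work *w = malloc(sizeof(*w));

	if (!w)
		return 1;
	w->fn = free_object;
	pthread_create(&tid, NULL, worker, NULL);
	queue_work(w);			/* like css_free_rcu_fn() queueing css_free_work_fn() */

	pthread_mutex_lock(&lock);
	stopping = 1;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(tid, NULL);
	return 0;
}
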
@@ -4033,7 +3545,7 @@ static void css_release(struct percpu_ref *ref)
4033 struct cgroup_subsys_state *css = 3545 struct cgroup_subsys_state *css =
4034 container_of(ref, struct cgroup_subsys_state, refcnt); 3546 container_of(ref, struct cgroup_subsys_state, refcnt);
4035 3547
4036 rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); 3548 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
4037 call_rcu(&css->rcu_head, css_free_rcu_fn); 3549 call_rcu(&css->rcu_head, css_free_rcu_fn);
4038} 3550}
4039 3551
@@ -4058,6 +3570,7 @@ static int online_css(struct cgroup_subsys_state *css)
4058 struct cgroup_subsys *ss = css->ss; 3570 struct cgroup_subsys *ss = css->ss;
4059 int ret = 0; 3571 int ret = 0;
4060 3572
3573 lockdep_assert_held(&cgroup_tree_mutex);
4061 lockdep_assert_held(&cgroup_mutex); 3574 lockdep_assert_held(&cgroup_mutex);
4062 3575
4063 if (ss->css_online) 3576 if (ss->css_online)
@@ -4065,7 +3578,7 @@ static int online_css(struct cgroup_subsys_state *css)
4065 if (!ret) { 3578 if (!ret) {
4066 css->flags |= CSS_ONLINE; 3579 css->flags |= CSS_ONLINE;
4067 css->cgroup->nr_css++; 3580 css->cgroup->nr_css++;
4068 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); 3581 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4069 } 3582 }
4070 return ret; 3583 return ret;
4071} 3584}
@@ -4075,6 +3588,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4075{ 3588{
4076 struct cgroup_subsys *ss = css->ss; 3589 struct cgroup_subsys *ss = css->ss;
4077 3590
3591 lockdep_assert_held(&cgroup_tree_mutex);
4078 lockdep_assert_held(&cgroup_mutex); 3592 lockdep_assert_held(&cgroup_mutex);
4079 3593
4080 if (!(css->flags & CSS_ONLINE)) 3594 if (!(css->flags & CSS_ONLINE))
@@ -4085,7 +3599,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4085 3599
4086 css->flags &= ~CSS_ONLINE; 3600 css->flags &= ~CSS_ONLINE;
4087 css->cgroup->nr_css--; 3601 css->cgroup->nr_css--;
4088 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 3602 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
4089} 3603}
4090 3604
4091/** 3605/**
@@ -4103,7 +3617,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4103 struct cgroup_subsys_state *css; 3617 struct cgroup_subsys_state *css;
4104 int err; 3618 int err;
4105 3619
4106 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4107 lockdep_assert_held(&cgroup_mutex); 3620 lockdep_assert_held(&cgroup_mutex);
4108 3621
4109 css = ss->css_alloc(cgroup_css(parent, ss)); 3622 css = ss->css_alloc(cgroup_css(parent, ss));
@@ -4116,7 +3629,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4116 3629
4117 init_css(css, ss, cgrp); 3630 init_css(css, ss, cgrp);
4118 3631
4119 err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); 3632 err = cgroup_populate_dir(cgrp, 1 << ss->id);
4120 if (err) 3633 if (err)
4121 goto err_free_percpu_ref; 3634 goto err_free_percpu_ref;
4122 3635
@@ -4124,9 +3637,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4124 if (err) 3637 if (err)
4125 goto err_clear_dir; 3638 goto err_clear_dir;
4126 3639
4127 dget(cgrp->dentry); 3640 cgroup_get(cgrp);
4128 css_get(css->parent); 3641 css_get(css->parent);
4129 3642
3643 cgrp->subsys_mask |= 1 << ss->id;
3644
4130 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 3645 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4131 parent->parent) { 3646 parent->parent) {
4132 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 3647 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -4139,7 +3654,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4139 return 0; 3654 return 0;
4140 3655
4141err_clear_dir: 3656err_clear_dir:
4142 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); 3657 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4143err_free_percpu_ref: 3658err_free_percpu_ref:
4144 percpu_ref_cancel_init(&css->refcnt); 3659 percpu_ref_cancel_init(&css->refcnt);
4145err_free_css: 3660err_free_css:
@@ -4147,35 +3662,34 @@ err_free_css:
4147 return err; 3662 return err;
4148} 3663}
4149 3664
4150/* 3665/**
4151 * cgroup_create - create a cgroup 3666 * cgroup_create - create a cgroup
4152 * @parent: cgroup that will be parent of the new cgroup 3667 * @parent: cgroup that will be parent of the new cgroup
4153 * @dentry: dentry of the new cgroup 3668 * @name: name of the new cgroup
4154 * @mode: mode to set on new inode 3669 * @mode: mode to set on new cgroup
4155 *
4156 * Must be called with the mutex on the parent inode held
4157 */ 3670 */
4158static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3671static long cgroup_create(struct cgroup *parent, const char *name,
4159 umode_t mode) 3672 umode_t mode)
4160{ 3673{
4161 struct cgroup *cgrp; 3674 struct cgroup *cgrp;
4162 struct cgroup_name *name; 3675 struct cgroup_root *root = parent->root;
4163 struct cgroupfs_root *root = parent->root;
4164 int ssid, err; 3676 int ssid, err;
4165 struct cgroup_subsys *ss; 3677 struct cgroup_subsys *ss;
4166 struct super_block *sb = root->sb; 3678 struct kernfs_node *kn;
3679
3680 /*
3681 * XXX: The default hierarchy isn't fully implemented yet. Block
3682 * !root cgroup creation on it for now.
3683 */
3684 if (root == &cgrp_dfl_root)
3685 return -EINVAL;
4167 3686
4168 /* allocate the cgroup and its ID, 0 is reserved for the root */ 3687 /* allocate the cgroup and its ID, 0 is reserved for the root */
4169 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3688 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4170 if (!cgrp) 3689 if (!cgrp)
4171 return -ENOMEM; 3690 return -ENOMEM;
4172 3691
4173 name = cgroup_alloc_name(dentry); 3692 mutex_lock(&cgroup_tree_mutex);
4174 if (!name) {
4175 err = -ENOMEM;
4176 goto err_free_cgrp;
4177 }
4178 rcu_assign_pointer(cgrp->name, name);
4179 3693
4180 /* 3694 /*
4181 * Only live parents can have children. Note that the liveliness 3695 * Only live parents can have children. Note that the liveliness
@@ -4186,7 +3700,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4186 */ 3700 */
4187 if (!cgroup_lock_live_group(parent)) { 3701 if (!cgroup_lock_live_group(parent)) {
4188 err = -ENODEV; 3702 err = -ENODEV;
4189 goto err_free_name; 3703 goto err_unlock_tree;
4190 } 3704 }
4191 3705
4192 /* 3706 /*
@@ -4199,18 +3713,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4199 goto err_unlock; 3713 goto err_unlock;
4200 } 3714 }
4201 3715
4202 /* Grab a reference on the superblock so the hierarchy doesn't
4203 * get deleted on unmount if there are child cgroups. This
4204 * can be done outside cgroup_mutex, since the sb can't
4205 * disappear while someone has an open control file on the
4206 * fs */
4207 atomic_inc(&sb->s_active);
4208
4209 init_cgroup_housekeeping(cgrp); 3716 init_cgroup_housekeeping(cgrp);
4210 3717
4211 dentry->d_fsdata = cgrp;
4212 cgrp->dentry = dentry;
4213
4214 cgrp->parent = parent; 3718 cgrp->parent = parent;
4215 cgrp->dummy_css.parent = &parent->dummy_css; 3719 cgrp->dummy_css.parent = &parent->dummy_css;
4216 cgrp->root = parent->root; 3720 cgrp->root = parent->root;
@@ -4221,24 +3725,26 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4221 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 3725 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4222 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3726 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4223 3727
3728 /* create the directory */
3729 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3730 if (IS_ERR(kn)) {
3731 err = PTR_ERR(kn);
3732 goto err_free_id;
3733 }
3734 cgrp->kn = kn;
3735
4224 /* 3736 /*
4225 * Create directory. cgroup_create_file() returns with the new 3737 * This extra ref will be put in cgroup_free_fn() and guarantees
4226 * directory locked on success so that it can be populated without 3738 * that @cgrp->kn is always accessible.
4227 * dropping cgroup_mutex.
4228 */ 3739 */
4229 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 3740 kernfs_get(kn);
4230 if (err < 0)
4231 goto err_free_id;
4232 lockdep_assert_held(&dentry->d_inode->i_mutex);
4233 3741
4234 cgrp->serial_nr = cgroup_serial_nr_next++; 3742 cgrp->serial_nr = cgroup_serial_nr_next++;
4235 3743
4236 /* allocation complete, commit to creation */ 3744 /* allocation complete, commit to creation */
4237 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 3745 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4238 root->number_of_cgroups++; 3746 atomic_inc(&root->nr_cgrps);
4239 3747 cgroup_get(parent);
4240 /* hold a ref to the parent's dentry */
4241 dget(parent->dentry);
4242 3748
4243 /* 3749 /*
4244 * @cgrp is now fully operational. If something fails after this 3750 * @cgrp is now fully operational. If something fails after this
@@ -4252,43 +3758,56 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4252 3758
4253 /* let's create and online css's */ 3759 /* let's create and online css's */
4254 for_each_subsys(ss, ssid) { 3760 for_each_subsys(ss, ssid) {
4255 if (root->subsys_mask & (1 << ssid)) { 3761 if (root->cgrp.subsys_mask & (1 << ssid)) {
4256 err = create_css(cgrp, ss); 3762 err = create_css(cgrp, ss);
4257 if (err) 3763 if (err)
4258 goto err_destroy; 3764 goto err_destroy;
4259 } 3765 }
4260 } 3766 }
4261 3767
3768 kernfs_activate(kn);
3769
4262 mutex_unlock(&cgroup_mutex); 3770 mutex_unlock(&cgroup_mutex);
4263 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 3771 mutex_unlock(&cgroup_tree_mutex);
4264 3772
4265 return 0; 3773 return 0;
4266 3774
4267err_free_id: 3775err_free_id:
4268 idr_remove(&root->cgroup_idr, cgrp->id); 3776 idr_remove(&root->cgroup_idr, cgrp->id);
4269 /* Release the reference count that we took on the superblock */
4270 deactivate_super(sb);
4271err_unlock: 3777err_unlock:
4272 mutex_unlock(&cgroup_mutex); 3778 mutex_unlock(&cgroup_mutex);
4273err_free_name: 3779err_unlock_tree:
4274 kfree(rcu_dereference_raw(cgrp->name)); 3780 mutex_unlock(&cgroup_tree_mutex);
4275err_free_cgrp:
4276 kfree(cgrp); 3781 kfree(cgrp);
4277 return err; 3782 return err;
4278 3783
4279err_destroy: 3784err_destroy:
4280 cgroup_destroy_locked(cgrp); 3785 cgroup_destroy_locked(cgrp);
4281 mutex_unlock(&cgroup_mutex); 3786 mutex_unlock(&cgroup_mutex);
4282 mutex_unlock(&dentry->d_inode->i_mutex); 3787 mutex_unlock(&cgroup_tree_mutex);
4283 return err; 3788 return err;
4284} 3789}
4285 3790
4286static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 3791static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3792 umode_t mode)
4287{ 3793{
4288 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3794 struct cgroup *parent = parent_kn->priv;
3795 int ret;
3796
3797 /*
3798 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3799 * kernfs active_ref and cgroup_create() already synchronizes
3800 * properly against removal through cgroup_lock_live_group().
3801 * Break it before calling cgroup_create().
3802 */
3803 cgroup_get(parent);
3804 kernfs_break_active_protection(parent_kn);
3805
3806 ret = cgroup_create(parent, name, mode);
4289 3807
4290 /* the vfs holds inode->i_mutex already */ 3808 kernfs_unbreak_active_protection(parent_kn);
4291 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 3809 cgroup_put(parent);
3810 return ret;
4292} 3811}
4293 3812
4294/* 3813/*
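
The error paths of cgroup_create() above follow the usual kernel unwinding idiom: each failure jumps to a label (err_free_id, err_unlock, err_unlock_tree) that undoes only what has already been set up, in reverse order of acquisition. A stripped-down, compilable illustration of the idiom with made-up resources:

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the real setup/teardown steps; all names are invented. */
static int  take_lock(void)    { printf("lock taken\n");  return 0; }
static void drop_lock(void)    { printf("lock dropped\n"); }
static int  alloc_id(int *id)  { *id = 42; printf("id %d allocated\n", *id); return 0; }
static void release_id(int id) { printf("id %d released\n", id); }
static int  create_entry(void) { return -1; /* pretend this step fails */ }

static int create_object(void)
{
	char *obj;
	int id, err;

	obj = calloc(1, 64);
	if (!obj)
		return -1;

	err = take_lock();
	if (err)
		goto err_free_obj;

	err = alloc_id(&id);
	if (err)
		goto err_unlock;

	err = create_entry();
	if (err)
		goto err_free_id;	/* undo in reverse order of setup */

	drop_lock();
	return 0;			/* on success the object stays live */

err_free_id:
	release_id(id);
err_unlock:
	drop_lock();
err_free_obj:
	free(obj);
	return err;
}

int main(void)
{
	printf("create_object() -> %d\n", create_object());
	return 0;
}
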
@@ -4301,6 +3820,7 @@ static void css_killed_work_fn(struct work_struct *work)
4301 container_of(work, struct cgroup_subsys_state, destroy_work); 3820 container_of(work, struct cgroup_subsys_state, destroy_work);
4302 struct cgroup *cgrp = css->cgroup; 3821 struct cgroup *cgrp = css->cgroup;
4303 3822
3823 mutex_lock(&cgroup_tree_mutex);
4304 mutex_lock(&cgroup_mutex); 3824 mutex_lock(&cgroup_mutex);
4305 3825
4306 /* 3826 /*
@@ -4318,6 +3838,7 @@ static void css_killed_work_fn(struct work_struct *work)
4318 cgroup_destroy_css_killed(cgrp); 3838 cgroup_destroy_css_killed(cgrp);
4319 3839
4320 mutex_unlock(&cgroup_mutex); 3840 mutex_unlock(&cgroup_mutex);
3841 mutex_unlock(&cgroup_tree_mutex);
4321 3842
4322 /* 3843 /*
4323 * Put the css refs from kill_css(). Each css holds an extra 3844 * Put the css refs from kill_css(). Each css holds an extra
@@ -4339,18 +3860,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
4339 queue_work(cgroup_destroy_wq, &css->destroy_work); 3860 queue_work(cgroup_destroy_wq, &css->destroy_work);
4340} 3861}
4341 3862
4342/** 3863static void __kill_css(struct cgroup_subsys_state *css)
4343 * kill_css - destroy a css
4344 * @css: css to destroy
4345 *
4346 * This function initiates destruction of @css by removing cgroup interface
4347 * files and putting its base reference. ->css_offline() will be invoked
4348 * asynchronously once css_tryget() is guaranteed to fail and when the
4349 * reference count reaches zero, @css will be released.
4350 */
4351static void kill_css(struct cgroup_subsys_state *css)
4352{ 3864{
4353 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); 3865 lockdep_assert_held(&cgroup_tree_mutex);
3866
3867 /*
3868 * This must happen before css is disassociated with its cgroup.
3869 * See seq_css() for details.
3870 */
3871 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4354 3872
4355 /* 3873 /*
4356 * Killing would put the base ref, but we need to keep it alive 3874 * Killing would put the base ref, but we need to keep it alive
@@ -4372,6 +3890,28 @@ static void kill_css(struct cgroup_subsys_state *css)
4372} 3890}
4373 3891
4374/** 3892/**
3893 * kill_css - destroy a css
3894 * @css: css to destroy
3895 *
3896 * This function initiates destruction of @css by removing cgroup interface
3897 * files and putting its base reference. ->css_offline() will be invoked
3898 * asynchronously once css_tryget() is guaranteed to fail and when the
3899 * reference count reaches zero, @css will be released.
3900 */
3901static void kill_css(struct cgroup_subsys_state *css)
3902{
3903 struct cgroup *cgrp = css->cgroup;
3904
3905 lockdep_assert_held(&cgroup_tree_mutex);
3906
3907 /* if already killed, noop */
3908 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3909 cgrp->subsys_mask &= ~(1 << css->ss->id);
3910 __kill_css(css);
3911 }
3912}
3913
3914/**
4375 * cgroup_destroy_locked - the first stage of cgroup destruction 3915 * cgroup_destroy_locked - the first stage of cgroup destruction
4376 * @cgrp: cgroup to be destroyed 3916 * @cgrp: cgroup to be destroyed
4377 * 3917 *
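
The reworked kill_css() above uses the cgroup's subsys_mask as a guard so that each css is killed at most once: the bit is tested and cleared before __kill_css() runs, under cgroup_tree_mutex. Here is a tiny sketch of that idempotent-teardown pattern; the names are invented, and serialization by a lock is assumed rather than shown.

#include <stdio.h>

#define SUBSYS_A 0
#define SUBSYS_B 1

struct group {
	unsigned long enabled_mask;	/* one bit per attached subsystem */
};

static void do_kill(struct group *g, int id)
{
	(void)g;
	printf("tearing down subsystem %d\n", id);
}

/* Safe to call repeatedly; only the first call for a given id does work. */
static void kill_once(struct group *g, int id)
{
	if (g->enabled_mask & (1UL << id)) {
		g->enabled_mask &= ~(1UL << id);
		do_kill(g, id);
	}
}

int main(void)
{
	struct group g = { .enabled_mask = (1UL << SUBSYS_A) | (1UL << SUBSYS_B) };

	kill_once(&g, SUBSYS_A);
	kill_once(&g, SUBSYS_A);	/* no-op the second time */
	kill_once(&g, SUBSYS_B);
	return 0;
}
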
@@ -4398,22 +3938,21 @@ static void kill_css(struct cgroup_subsys_state *css)
4398static int cgroup_destroy_locked(struct cgroup *cgrp) 3938static int cgroup_destroy_locked(struct cgroup *cgrp)
4399 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 3939 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4400{ 3940{
4401 struct dentry *d = cgrp->dentry;
4402 struct cgroup_subsys_state *css;
4403 struct cgroup *child; 3941 struct cgroup *child;
3942 struct cgroup_subsys_state *css;
4404 bool empty; 3943 bool empty;
4405 int ssid; 3944 int ssid;
4406 3945
4407 lockdep_assert_held(&d->d_inode->i_mutex); 3946 lockdep_assert_held(&cgroup_tree_mutex);
4408 lockdep_assert_held(&cgroup_mutex); 3947 lockdep_assert_held(&cgroup_mutex);
4409 3948
4410 /* 3949 /*
4411 * css_set_lock synchronizes access to ->cset_links and prevents 3950 * css_set_rwsem synchronizes access to ->cset_links and prevents
4412 * @cgrp from being removed while __put_css_set() is in progress. 3951 * @cgrp from being removed while put_css_set() is in progress.
4413 */ 3952 */
4414 read_lock(&css_set_lock); 3953 down_read(&css_set_rwsem);
4415 empty = list_empty(&cgrp->cset_links); 3954 empty = list_empty(&cgrp->cset_links);
4416 read_unlock(&css_set_lock); 3955 up_read(&css_set_rwsem);
4417 if (!empty) 3956 if (!empty)
4418 return -EBUSY; 3957 return -EBUSY;
4419 3958
@@ -4434,14 +3973,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4434 return -EBUSY; 3973 return -EBUSY;
4435 3974
4436 /* 3975 /*
4437 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4438 * will be invoked to perform the rest of destruction once the
4439 * percpu refs of all css's are confirmed to be killed.
4440 */
4441 for_each_css(css, ssid, cgrp)
4442 kill_css(css);
4443
4444 /*
4445 * Mark @cgrp dead. This prevents further task migration and child 3976 * Mark @cgrp dead. This prevents further task migration and child
4446 * creation by disabling cgroup_lock_live_group(). Note that 3977 * creation by disabling cgroup_lock_live_group(). Note that
4447 * CGRP_DEAD assertion is depended upon by css_next_child() to 3978 * CGRP_DEAD assertion is depended upon by css_next_child() to
@@ -4450,6 +3981,17 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4450 */ 3981 */
4451 set_bit(CGRP_DEAD, &cgrp->flags); 3982 set_bit(CGRP_DEAD, &cgrp->flags);
4452 3983
3984 /*
3985 * Initiate massacre of all css's. cgroup_destroy_css_killed()
3986 * will be invoked to perform the rest of destruction once the
3987 * percpu refs of all css's are confirmed to be killed. This
3988 * involves removing the subsystem's files, drop cgroup_mutex.
3989 */
3990 mutex_unlock(&cgroup_mutex);
3991 for_each_css(css, ssid, cgrp)
3992 kill_css(css);
3993 mutex_lock(&cgroup_mutex);
3994
4453 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 3995 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4454 raw_spin_lock(&release_list_lock); 3996 raw_spin_lock(&release_list_lock);
4455 if (!list_empty(&cgrp->release_list)) 3997 if (!list_empty(&cgrp->release_list))
@@ -4465,14 +4007,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4465 if (!cgrp->nr_css) 4007 if (!cgrp->nr_css)
4466 cgroup_destroy_css_killed(cgrp); 4008 cgroup_destroy_css_killed(cgrp);
4467 4009
4010 /* remove @cgrp directory along with the base files */
4011 mutex_unlock(&cgroup_mutex);
4012
4468 /* 4013 /*
4469 * Clear the base files and remove @cgrp directory. The removal 4014 * There are two control paths which try to determine cgroup from
4470 * puts the base ref but we aren't quite done with @cgrp yet, so 4015 * dentry without going through kernfs - cgroupstats_build() and
4471 * hold onto it. 4016 * css_tryget_from_dir(). Those are supported by RCU protecting
4017 * clearing of cgrp->kn->priv backpointer, which should happen
4018 * after all files under it have been removed.
4472 */ 4019 */
4473 cgroup_addrm_files(cgrp, cgroup_base_files, false); 4020 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4474 dget(d); 4021 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4475 cgroup_d_remove_dir(d); 4022
4023 mutex_lock(&cgroup_mutex);
4476 4024
4477 return 0; 4025 return 0;
4478}; 4026};
@@ -4489,72 +4037,82 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4489static void cgroup_destroy_css_killed(struct cgroup *cgrp) 4037static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4490{ 4038{
4491 struct cgroup *parent = cgrp->parent; 4039 struct cgroup *parent = cgrp->parent;
4492 struct dentry *d = cgrp->dentry;
4493 4040
4041 lockdep_assert_held(&cgroup_tree_mutex);
4494 lockdep_assert_held(&cgroup_mutex); 4042 lockdep_assert_held(&cgroup_mutex);
4495 4043
4496 /* delete this cgroup from parent->children */ 4044 /* delete this cgroup from parent->children */
4497 list_del_rcu(&cgrp->sibling); 4045 list_del_rcu(&cgrp->sibling);
4498 4046
4499 dput(d); 4047 cgroup_put(cgrp);
4500 4048
4501 set_bit(CGRP_RELEASABLE, &parent->flags); 4049 set_bit(CGRP_RELEASABLE, &parent->flags);
4502 check_for_release(parent); 4050 check_for_release(parent);
4503} 4051}
4504 4052
4505static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4053static int cgroup_rmdir(struct kernfs_node *kn)
4506{ 4054{
4507 int ret; 4055 struct cgroup *cgrp = kn->priv;
4508 4056 int ret = 0;
4509 mutex_lock(&cgroup_mutex);
4510 ret = cgroup_destroy_locked(dentry->d_fsdata);
4511 mutex_unlock(&cgroup_mutex);
4512 4057
4513 return ret; 4058 /*
4514} 4059 * This is self-destruction but @kn can't be removed while this
4060 * callback is in progress. Let's break active protection. Once
4061 * the protection is broken, @cgrp can be destroyed at any point.
4062 * Pin it so that it stays accessible.
4063 */
4064 cgroup_get(cgrp);
4065 kernfs_break_active_protection(kn);
4515 4066
4516static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4067 mutex_lock(&cgroup_tree_mutex);
4517{ 4068 mutex_lock(&cgroup_mutex);
4518 INIT_LIST_HEAD(&ss->cftsets);
4519 4069
4520 /* 4070 /*
4521 * base_cftset is embedded in subsys itself, no need to worry about 4071 * @cgrp might already have been destroyed while we're trying to
4522 * deregistration. 4072 * grab the mutexes.
4523 */ 4073 */
4524 if (ss->base_cftypes) { 4074 if (!cgroup_is_dead(cgrp))
4525 struct cftype *cft; 4075 ret = cgroup_destroy_locked(cgrp);
4526 4076
4527 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) 4077 mutex_unlock(&cgroup_mutex);
4528 cft->ss = ss; 4078 mutex_unlock(&cgroup_tree_mutex);
4529 4079
4530 ss->base_cftset.cfts = ss->base_cftypes; 4080 kernfs_unbreak_active_protection(kn);
4531 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4081 cgroup_put(cgrp);
4532 } 4082 return ret;
4533} 4083}
4534 4084
4085static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4086 .remount_fs = cgroup_remount,
4087 .show_options = cgroup_show_options,
4088 .mkdir = cgroup_mkdir,
4089 .rmdir = cgroup_rmdir,
4090 .rename = cgroup_rename,
4091};
4092
4535static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4093static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4536{ 4094{
4537 struct cgroup_subsys_state *css; 4095 struct cgroup_subsys_state *css;
4538 4096
4539 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4097 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4540 4098
4099 mutex_lock(&cgroup_tree_mutex);
4541 mutex_lock(&cgroup_mutex); 4100 mutex_lock(&cgroup_mutex);
4542 4101
4543 /* init base cftset */ 4102 INIT_LIST_HEAD(&ss->cfts);
4544 cgroup_init_cftsets(ss);
4545 4103
4546 /* Create the top cgroup state for this subsystem */ 4104 /* Create the root cgroup state for this subsystem */
4547 ss->root = &cgroup_dummy_root; 4105 ss->root = &cgrp_dfl_root;
4548 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); 4106 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4549 /* We don't handle early failures gracefully */ 4107 /* We don't handle early failures gracefully */
4550 BUG_ON(IS_ERR(css)); 4108 BUG_ON(IS_ERR(css));
4551 init_css(css, ss, cgroup_dummy_top); 4109 init_css(css, ss, &cgrp_dfl_root.cgrp);
4552 4110
4553 /* Update the init_css_set to contain a subsys 4111 /* Update the init_css_set to contain a subsys
4554 * pointer to this state - since the subsystem is 4112 * pointer to this state - since the subsystem is
4555 * newly registered, all tasks and hence the 4113 * newly registered, all tasks and hence the
4556 * init_css_set is in the subsystem's top cgroup. */ 4114 * init_css_set is in the subsystem's root cgroup. */
4557 init_css_set.subsys[ss->subsys_id] = css; 4115 init_css_set.subsys[ss->id] = css;
4558 4116
4559 need_forkexit_callback |= ss->fork || ss->exit; 4117 need_forkexit_callback |= ss->fork || ss->exit;
4560 4118
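
cgroup_rmdir() above has to remove the very object whose kernfs node invoked it, so it pins the cgroup, breaks kernfs active protection, retakes the mutexes and only calls cgroup_destroy_locked() if the cgroup has not already been marked dead. The userspace sketch below models only the pin-then-recheck half of that dance; the locking, types and names are invented and the kernfs active-reference machinery is not modeled.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	atomic_int refcnt;
	int dead;			/* protected by tree_lock */
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static void node_get(struct node *n) { atomic_fetch_add(&n->refcnt, 1); }

static void node_put(struct node *n)
{
	if (atomic_fetch_sub(&n->refcnt, 1) == 1) {
		printf("node freed\n");
		free(n);
	}
}

static void destroy_locked(struct node *n)
{
	n->dead = 1;
	printf("node destroyed\n");
	node_put(n);			/* drop the tree's base reference */
}

/* rmdir-style entry point: may race with another remover. */
static int remove_node(struct node *n)
{
	int ret = 0;

	node_get(n);			/* pin: keep @n around across the lock dance */
	pthread_mutex_lock(&tree_lock);
	if (!n->dead)			/* someone else may have destroyed it already */
		destroy_locked(n);
	else
		ret = -1;
	pthread_mutex_unlock(&tree_lock);
	node_put(n);			/* unpin */
	return ret;
}

int main(void)
{
	struct node *n = calloc(1, sizeof(*n));

	if (!n)
		return 1;
	atomic_store(&n->refcnt, 2);	/* one ref for the "tree", one for main() */
	printf("first remove  -> %d\n", remove_node(n));
	printf("second remove -> %d\n", remove_node(n));
	node_put(n);			/* drop main()'s reference, frees the node */
	return 0;
}
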
@@ -4565,185 +4123,11 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4565 4123
4566 BUG_ON(online_css(css)); 4124 BUG_ON(online_css(css));
4567 4125
4568 mutex_unlock(&cgroup_mutex); 4126 cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
4569
4570 /* this function shouldn't be used with modular subsystems, since they
4571 * need to register a subsys_id, among other things */
4572 BUG_ON(ss->module);
4573}
4574
4575/**
4576 * cgroup_load_subsys: load and register a modular subsystem at runtime
4577 * @ss: the subsystem to load
4578 *
4579 * This function should be called in a modular subsystem's initcall. If the
4580 * subsystem is built as a module, it will be assigned a new subsys_id and set
4581 * up for use. If the subsystem is built-in anyway, work is delegated to the
4582 * simpler cgroup_init_subsys.
4583 */
4584int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4585{
4586 struct cgroup_subsys_state *css;
4587 int i, ret;
4588 struct hlist_node *tmp;
4589 struct css_set *cset;
4590 unsigned long key;
4591
4592 /* check name and function validity */
4593 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4594 ss->css_alloc == NULL || ss->css_free == NULL)
4595 return -EINVAL;
4596
4597 /*
4598 * we don't support callbacks in modular subsystems. this check is
4599 * before the ss->module check for consistency; a subsystem that could
4600 * be a module should still have no callbacks even if the user isn't
4601 * compiling it as one.
4602 */
4603 if (ss->fork || ss->exit)
4604 return -EINVAL;
4605
4606 /*
4607 * an optionally modular subsystem is built-in: we want to do nothing,
4608 * since cgroup_init_subsys will have already taken care of it.
4609 */
4610 if (ss->module == NULL) {
4611 /* a sanity check */
4612 BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4613 return 0;
4614 }
4615
4616 /* init base cftset */
4617 cgroup_init_cftsets(ss);
4618
4619 mutex_lock(&cgroup_mutex);
4620 mutex_lock(&cgroup_root_mutex);
4621 cgroup_subsys[ss->subsys_id] = ss;
4622
4623 /*
4624 * no ss->css_alloc seems to need anything important in the ss
4625 * struct, so this can happen first (i.e. before the dummy root
4626 * attachment).
4627 */
4628 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4629 if (IS_ERR(css)) {
4630 /* failure case - need to deassign the cgroup_subsys[] slot. */
4631 cgroup_subsys[ss->subsys_id] = NULL;
4632 mutex_unlock(&cgroup_root_mutex);
4633 mutex_unlock(&cgroup_mutex);
4634 return PTR_ERR(css);
4635 }
4636
4637 ss->root = &cgroup_dummy_root;
4638
4639 /* our new subsystem will be attached to the dummy hierarchy. */
4640 init_css(css, ss, cgroup_dummy_top);
4641
4642 /*
4643 * Now we need to entangle the css into the existing css_sets. unlike
4644 * in cgroup_init_subsys, there are now multiple css_sets, so each one
4645 * will need a new pointer to it; done by iterating the css_set_table.
4646 * furthermore, modifying the existing css_sets will corrupt the hash
4647 * table state, so each changed css_set will need its hash recomputed.
4648 * this is all done under the css_set_lock.
4649 */
4650 write_lock(&css_set_lock);
4651 hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
4652 /* skip entries that we already rehashed */
4653 if (cset->subsys[ss->subsys_id])
4654 continue;
4655 /* remove existing entry */
4656 hash_del(&cset->hlist);
4657 /* set new value */
4658 cset->subsys[ss->subsys_id] = css;
4659 /* recompute hash and restore entry */
4660 key = css_set_hash(cset->subsys);
4661 hash_add(css_set_table, &cset->hlist, key);
4662 }
4663 write_unlock(&css_set_lock);
4664
4665 ret = online_css(css);
4666 if (ret) {
4667 ss->css_free(css);
4668 goto err_unload;
4669 }
4670
4671 /* success! */
4672 mutex_unlock(&cgroup_root_mutex);
4673 mutex_unlock(&cgroup_mutex);
4674 return 0;
4675
4676err_unload:
4677 mutex_unlock(&cgroup_root_mutex);
4678 mutex_unlock(&cgroup_mutex);
4679 /* @ss can't be mounted here as try_module_get() would fail */
4680 cgroup_unload_subsys(ss);
4681 return ret;
4682}
4683EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4684
4685/**
4686 * cgroup_unload_subsys: unload a modular subsystem
4687 * @ss: the subsystem to unload
4688 *
4689 * This function should be called in a modular subsystem's exitcall. When this
4690 * function is invoked, the refcount on the subsystem's module will be 0, so
4691 * the subsystem will not be attached to any hierarchy.
4692 */
4693void cgroup_unload_subsys(struct cgroup_subsys *ss)
4694{
4695 struct cgrp_cset_link *link;
4696 struct cgroup_subsys_state *css;
4697
4698 BUG_ON(ss->module == NULL);
4699
4700 /*
4701 * we shouldn't be called if the subsystem is in use, and the use of
4702 * try_module_get() in rebind_subsystems() should ensure that it
4703 * doesn't start being used while we're killing it off.
4704 */
4705 BUG_ON(ss->root != &cgroup_dummy_root);
4706
4707 mutex_lock(&cgroup_mutex);
4708 mutex_lock(&cgroup_root_mutex);
4709
4710 css = cgroup_css(cgroup_dummy_top, ss);
4711 if (css)
4712 offline_css(css);
4713 4127
4714 /* deassign the subsys_id */
4715 cgroup_subsys[ss->subsys_id] = NULL;
4716
4717 /*
4718 * disentangle the css from all css_sets attached to the dummy
4719 * top. as in loading, we need to pay our respects to the hashtable
4720 * gods.
4721 */
4722 write_lock(&css_set_lock);
4723 list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4724 struct css_set *cset = link->cset;
4725 unsigned long key;
4726
4727 hash_del(&cset->hlist);
4728 cset->subsys[ss->subsys_id] = NULL;
4729 key = css_set_hash(cset->subsys);
4730 hash_add(css_set_table, &cset->hlist, key);
4731 }
4732 write_unlock(&css_set_lock);
4733
4734 /*
4735 * remove subsystem's css from the cgroup_dummy_top and free it -
4736 * need to free before marking as null because ss->css_free needs
4737 * the cgrp->subsys pointer to find their state.
4738 */
4739 if (css)
4740 ss->css_free(css);
4741 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4742
4743 mutex_unlock(&cgroup_root_mutex);
4744 mutex_unlock(&cgroup_mutex); 4128 mutex_unlock(&cgroup_mutex);
4129 mutex_unlock(&cgroup_tree_mutex);
4745} 4130}
4746EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
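
The deleted cgroup_load_subsys() had to re-insert every css_set after giving it a new subsys pointer, because the hash key is computed over the subsys[] array: remove the entry while the old hash is still valid, update the key material, recompute the hash, re-add. Here is a compact userspace sketch of that rehash-on-key-change dance over a toy chained hash table; everything below is invented for illustration.

#include <stdio.h>

#define NBUCKETS 8

struct entry {
	int key[2];			/* the hash is computed over this */
	struct entry *next;
};

static struct entry *table[NBUCKETS];

static unsigned int hash_key(const int *key)
{
	return (unsigned int)(key[0] * 31 + key[1]) % NBUCKETS;
}

static void hash_add(struct entry *e)
{
	unsigned int b = hash_key(e->key);

	e->next = table[b];
	table[b] = e;
}

static void hash_del(struct entry *e)
{
	struct entry **pp = &table[hash_key(e->key)];

	while (*pp && *pp != e)
		pp = &(*pp)->next;
	if (*pp)
		*pp = e->next;
}

/* Changing key material invalidates the bucket, so delete/update/re-add. */
static void update_key(struct entry *e, int new_second)
{
	hash_del(e);			/* must happen while the old hash is valid */
	e->key[1] = new_second;
	hash_add(e);			/* re-insert under the recomputed hash */
}

int main(void)
{
	struct entry e = { .key = { 3, 0 } };

	hash_add(&e);
	update_key(&e, 7);
	printf("entry now lives in bucket %u\n", hash_key(e.key));
	return 0;
}
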
4747 4131
4748/** 4132/**
4749 * cgroup_init_early - cgroup initialization at system boot 4133 * cgroup_init_early - cgroup initialization at system boot
@@ -4753,34 +4137,24 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4753 */ 4137 */
4754int __init cgroup_init_early(void) 4138int __init cgroup_init_early(void)
4755{ 4139{
4140 static struct cgroup_sb_opts __initdata opts =
4141 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4756 struct cgroup_subsys *ss; 4142 struct cgroup_subsys *ss;
4757 int i; 4143 int i;
4758 4144
4759 atomic_set(&init_css_set.refcount, 1); 4145 init_cgroup_root(&cgrp_dfl_root, &opts);
4760 INIT_LIST_HEAD(&init_css_set.cgrp_links);
4761 INIT_LIST_HEAD(&init_css_set.tasks);
4762 INIT_HLIST_NODE(&init_css_set.hlist);
4763 css_set_count = 1;
4764 init_cgroup_root(&cgroup_dummy_root);
4765 cgroup_root_count = 1;
4766 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4146 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4767 4147
4768 init_cgrp_cset_link.cset = &init_css_set; 4148 for_each_subsys(ss, i) {
4769 init_cgrp_cset_link.cgrp = cgroup_dummy_top; 4149 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4770 list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); 4150 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4771 list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); 4151 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4772 4152 ss->id, ss->name);
4773 /* at bootup time, we don't worry about modular subsystems */ 4153 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4774 for_each_builtin_subsys(ss, i) { 4154 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4775 BUG_ON(!ss->name); 4155
4776 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4156 ss->id = i;
4777 BUG_ON(!ss->css_alloc); 4157 ss->name = cgroup_subsys_name[i];
4778 BUG_ON(!ss->css_free);
4779 if (ss->subsys_id != i) {
4780 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4781 ss->name, ss->subsys_id);
4782 BUG();
4783 }
4784 4158
4785 if (ss->early_init) 4159 if (ss->early_init)
4786 cgroup_init_subsys(ss); 4160 cgroup_init_subsys(ss);
@@ -4798,53 +4172,46 @@ int __init cgroup_init(void)
4798{ 4172{
4799 struct cgroup_subsys *ss; 4173 struct cgroup_subsys *ss;
4800 unsigned long key; 4174 unsigned long key;
4801 int i, err; 4175 int ssid, err;
4802 4176
4803 err = bdi_init(&cgroup_backing_dev_info); 4177 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4804 if (err)
4805 return err;
4806 4178
4807 for_each_builtin_subsys(ss, i) { 4179 mutex_lock(&cgroup_tree_mutex);
4808 if (!ss->early_init)
4809 cgroup_init_subsys(ss);
4810 }
4811
4812 /* allocate id for the dummy hierarchy */
4813 mutex_lock(&cgroup_mutex); 4180 mutex_lock(&cgroup_mutex);
4814 mutex_lock(&cgroup_root_mutex);
4815 4181
4816 /* Add init_css_set to the hash table */ 4182 /* Add init_css_set to the hash table */
4817 key = css_set_hash(init_css_set.subsys); 4183 key = css_set_hash(init_css_set.subsys);
4818 hash_add(css_set_table, &init_css_set.hlist, key); 4184 hash_add(css_set_table, &init_css_set.hlist, key);
4819 4185
4820 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 4186 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4821 4187
4822 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
4823 0, 1, GFP_KERNEL);
4824 BUG_ON(err < 0);
4825
4826 mutex_unlock(&cgroup_root_mutex);
4827 mutex_unlock(&cgroup_mutex); 4188 mutex_unlock(&cgroup_mutex);
4189 mutex_unlock(&cgroup_tree_mutex);
4828 4190
4829 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4191 for_each_subsys(ss, ssid) {
4830 if (!cgroup_kobj) { 4192 if (!ss->early_init)
4831 err = -ENOMEM; 4193 cgroup_init_subsys(ss);
4832 goto out; 4194
4195 /*
4196 * cftype registration needs kmalloc and can't be done
4197 * during early_init. Register base cftypes separately.
4198 */
4199 if (ss->base_cftypes)
4200 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4833 } 4201 }
4834 4202
4203 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4204 if (!cgroup_kobj)
4205 return -ENOMEM;
4206
4835 err = register_filesystem(&cgroup_fs_type); 4207 err = register_filesystem(&cgroup_fs_type);
4836 if (err < 0) { 4208 if (err < 0) {
4837 kobject_put(cgroup_kobj); 4209 kobject_put(cgroup_kobj);
4838 goto out; 4210 return err;
4839 } 4211 }
4840 4212
4841 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 4213 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4842 4214 return 0;
4843out:
4844 if (err)
4845 bdi_destroy(&cgroup_backing_dev_info);
4846
4847 return err;
4848} 4215}
4849 4216
4850static int __init cgroup_wq_init(void) 4217static int __init cgroup_wq_init(void)
@@ -4876,12 +4243,6 @@ core_initcall(cgroup_wq_init);
4876 * proc_cgroup_show() 4243 * proc_cgroup_show()
4877 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4244 * - Print task's cgroup paths into seq_file, one line for each hierarchy
4878 * - Used for /proc/<pid>/cgroup. 4245 * - Used for /proc/<pid>/cgroup.
4879 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
4880 * doesn't really matter if tsk->cgroup changes after we read it,
4881 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
4882 * anyway. No need to check that tsk->cgroup != NULL, thanks to
4883 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
4884 * cgroup to top_cgroup.
4885 */ 4246 */
4886 4247
4887/* TODO: Use a proper seq_file iterator */ 4248/* TODO: Use a proper seq_file iterator */
@@ -4889,12 +4250,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4889{ 4250{
4890 struct pid *pid; 4251 struct pid *pid;
4891 struct task_struct *tsk; 4252 struct task_struct *tsk;
4892 char *buf; 4253 char *buf, *path;
4893 int retval; 4254 int retval;
4894 struct cgroupfs_root *root; 4255 struct cgroup_root *root;
4895 4256
4896 retval = -ENOMEM; 4257 retval = -ENOMEM;
4897 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4258 buf = kmalloc(PATH_MAX, GFP_KERNEL);
4898 if (!buf) 4259 if (!buf)
4899 goto out; 4260 goto out;
4900 4261
@@ -4907,29 +4268,36 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4907 retval = 0; 4268 retval = 0;
4908 4269
4909 mutex_lock(&cgroup_mutex); 4270 mutex_lock(&cgroup_mutex);
4271 down_read(&css_set_rwsem);
4910 4272
4911 for_each_active_root(root) { 4273 for_each_root(root) {
4912 struct cgroup_subsys *ss; 4274 struct cgroup_subsys *ss;
4913 struct cgroup *cgrp; 4275 struct cgroup *cgrp;
4914 int ssid, count = 0; 4276 int ssid, count = 0;
4915 4277
4278 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4279 continue;
4280
4916 seq_printf(m, "%d:", root->hierarchy_id); 4281 seq_printf(m, "%d:", root->hierarchy_id);
4917 for_each_subsys(ss, ssid) 4282 for_each_subsys(ss, ssid)
4918 if (root->subsys_mask & (1 << ssid)) 4283 if (root->cgrp.subsys_mask & (1 << ssid))
4919 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4284 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4920 if (strlen(root->name)) 4285 if (strlen(root->name))
4921 seq_printf(m, "%sname=%s", count ? "," : "", 4286 seq_printf(m, "%sname=%s", count ? "," : "",
4922 root->name); 4287 root->name);
4923 seq_putc(m, ':'); 4288 seq_putc(m, ':');
4924 cgrp = task_cgroup_from_root(tsk, root); 4289 cgrp = task_cgroup_from_root(tsk, root);
4925 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 4290 path = cgroup_path(cgrp, buf, PATH_MAX);
4926 if (retval < 0) 4291 if (!path) {
4292 retval = -ENAMETOOLONG;
4927 goto out_unlock; 4293 goto out_unlock;
4928 seq_puts(m, buf); 4294 }
4295 seq_puts(m, path);
4929 seq_putc(m, '\n'); 4296 seq_putc(m, '\n');
4930 } 4297 }
4931 4298
4932out_unlock: 4299out_unlock:
4300 up_read(&css_set_rwsem);
4933 mutex_unlock(&cgroup_mutex); 4301 mutex_unlock(&cgroup_mutex);
4934 put_task_struct(tsk); 4302 put_task_struct(tsk);
4935out_free: 4303out_free:
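
proc_cgroup_show() now expects cgroup_path() to fill a PATH_MAX buffer and hand back a pointer to the assembled string, or NULL when it does not fit (mapped to -ENAMETOOLONG above). Helpers of this shape are typically built leaf-to-root into the tail of the buffer; the following is a hedged userspace sketch of that construction with invented types and names.

#include <stdio.h>
#include <string.h>

struct node {
	const char *name;
	struct node *parent;		/* NULL for the root */
};

/*
 * Fill @buf (of size @len) with the path of @n and return a pointer to the
 * start of the string inside @buf, or NULL if it does not fit.
 */
static char *node_path(struct node *n, char *buf, size_t len)
{
	char *p = buf + len;

	if (len < 2)
		return NULL;
	*--p = '\0';

	if (!n->parent) {		/* the root's path is just "/" */
		*--p = '/';
		return p;
	}

	while (n->parent) {
		size_t nlen = strlen(n->name);

		if ((size_t)(p - buf) < nlen + 1)
			return NULL;	/* would overflow: caller sees NULL */
		p -= nlen;
		memcpy(p, n->name, nlen);
		*--p = '/';
		n = n->parent;
	}
	return p;
}

int main(void)
{
	struct node root = { "", NULL };
	struct node a = { "user.slice", &root };
	struct node b = { "session-1.scope", &a };
	char buf[64];
	char *path = node_path(&b, buf, sizeof(buf));

	printf("%s\n", path ? path : "(too long)");
	return 0;
}
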
@@ -4955,7 +4323,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
4955 for_each_subsys(ss, i) 4323 for_each_subsys(ss, i)
4956 seq_printf(m, "%s\t%d\t%d\t%d\n", 4324 seq_printf(m, "%s\t%d\t%d\t%d\n",
4957 ss->name, ss->root->hierarchy_id, 4325 ss->name, ss->root->hierarchy_id,
4958 ss->root->number_of_cgroups, !ss->disabled); 4326 atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4959 4327
4960 mutex_unlock(&cgroup_mutex); 4328 mutex_unlock(&cgroup_mutex);
4961 return 0; 4329 return 0;
@@ -4974,27 +4342,16 @@ static const struct file_operations proc_cgroupstats_operations = {
4974}; 4342};
4975 4343
4976/** 4344/**
4977 * cgroup_fork - attach newly forked task to its parents cgroup. 4345 * cgroup_fork - initialize cgroup related fields during copy_process()
4978 * @child: pointer to task_struct of forking parent process. 4346 * @child: pointer to task_struct of forking parent process.
4979 * 4347 *
4980 * Description: A task inherits its parent's cgroup at fork(). 4348 * A task is associated with the init_css_set until cgroup_post_fork()
4981 * 4349 * attaches it to the parent's css_set. Empty cg_list indicates that
4982 * A pointer to the shared css_set was automatically copied in 4350 * @child isn't holding reference to its css_set.
4983 * fork.c by dup_task_struct(). However, we ignore that copy, since
4984 * it was not made under the protection of RCU or cgroup_mutex, so
4985 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
4986 * have already changed current->cgroups, allowing the previously
4987 * referenced cgroup group to be removed and freed.
4988 *
4989 * At the point that cgroup_fork() is called, 'current' is the parent
4990 * task, and the passed argument 'child' points to the child task.
4991 */ 4351 */
4992void cgroup_fork(struct task_struct *child) 4352void cgroup_fork(struct task_struct *child)
4993{ 4353{
4994 task_lock(current); 4354 RCU_INIT_POINTER(child->cgroups, &init_css_set);
4995 get_css_set(task_css_set(current));
4996 child->cgroups = current->cgroups;
4997 task_unlock(current);
4998 INIT_LIST_HEAD(&child->cg_list); 4355 INIT_LIST_HEAD(&child->cg_list);
4999} 4356}
5000 4357
@@ -5014,23 +4371,37 @@ void cgroup_post_fork(struct task_struct *child)
5014 int i; 4371 int i;
5015 4372
5016 /* 4373 /*
5017 * use_task_css_set_links is set to 1 before we walk the tasklist 4374 * This may race against cgroup_enable_task_cg_links(). As that
5018 * under the tasklist_lock and we read it here after we added the child 4375 * function sets use_task_css_set_links before grabbing
5019 * to the tasklist under the tasklist_lock as well. If the child wasn't 4376 * tasklist_lock and we just went through tasklist_lock to add
5020 * yet in the tasklist when we walked through it from 4377 * @child, it's guaranteed that either we see the set
5021 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value 4378 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5022 * should be visible now due to the paired locking and barriers implied 4379 * @child during its iteration.
5023 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock 4380 *
5024 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock 4381 * If we won the race, @child is associated with %current's
5025 * lock on fork. 4382 * css_set. Grabbing css_set_rwsem guarantees both that the
4383 * association is stable, and, on completion of the parent's
4384 * migration, @child is visible in the source of migration or
4385 * already in the destination cgroup. This guarantee is necessary
4386 * when implementing operations which need to migrate all tasks of
4387 * a cgroup to another.
4388 *
4389 * Note that if we lose to cgroup_enable_task_cg_links(), @child
4390 * will remain in init_css_set. This is safe because all tasks are
4391 * in the init_css_set before cg_links is enabled and there's no
4392 * operation which transfers all tasks out of init_css_set.
5026 */ 4393 */
5027 if (use_task_css_set_links) { 4394 if (use_task_css_set_links) {
5028 write_lock(&css_set_lock); 4395 struct css_set *cset;
5029 task_lock(child); 4396
5030 if (list_empty(&child->cg_list)) 4397 down_write(&css_set_rwsem);
5031 list_add(&child->cg_list, &task_css_set(child)->tasks); 4398 cset = task_css_set(current);
5032 task_unlock(child); 4399 if (list_empty(&child->cg_list)) {
5033 write_unlock(&css_set_lock); 4400 rcu_assign_pointer(child->cgroups, cset);
4401 list_add(&child->cg_list, &cset->tasks);
4402 get_css_set(cset);
4403 }
4404 up_write(&css_set_rwsem);
5034 } 4405 }
5035 4406
5036 /* 4407 /*
@@ -5039,15 +4410,7 @@ void cgroup_post_fork(struct task_struct *child)
5039 * and addition to css_set. 4410 * and addition to css_set.
5040 */ 4411 */
5041 if (need_forkexit_callback) { 4412 if (need_forkexit_callback) {
5042 /* 4413 for_each_subsys(ss, i)
5043 * fork/exit callbacks are supported only for builtin
5044 * subsystems, and the builtin section of the subsys
5045 * array is immutable, so we don't need to lock the
5046 * subsys array here. On the other hand, modular section
5047 * of the array can be freed at module unload, so we
5048 * can't touch that.
5049 */
5050 for_each_builtin_subsys(ss, i)
5051 if (ss->fork) 4414 if (ss->fork)
5052 ss->fork(child); 4415 ss->fork(child);
5053 } 4416 }
@@ -5056,7 +4419,6 @@ void cgroup_post_fork(struct task_struct *child)
5056/** 4419/**
5057 * cgroup_exit - detach cgroup from exiting task 4420 * cgroup_exit - detach cgroup from exiting task
5058 * @tsk: pointer to task_struct of exiting process 4421 * @tsk: pointer to task_struct of exiting process
5059 * @run_callback: run exit callbacks?
5060 * 4422 *
5061 * Description: Detach cgroup from @tsk and release it. 4423 * Description: Detach cgroup from @tsk and release it.
5062 * 4424 *
@@ -5066,57 +4428,38 @@ void cgroup_post_fork(struct task_struct *child)
5066 * use notify_on_release cgroups where very high task exit scaling 4428 * use notify_on_release cgroups where very high task exit scaling
5067 * is required on large systems. 4429 * is required on large systems.
5068 * 4430 *
5069 * the_top_cgroup_hack: 4431 * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
5070 * 4432 * call cgroup_exit() while the task is still competent to handle
5071 * Set the exiting tasks cgroup to the root cgroup (top_cgroup). 4433 * notify_on_release(), then leave the task attached to the root cgroup in
5072 * 4434 * each hierarchy for the remainder of its exit. No need to bother with
5073 * We call cgroup_exit() while the task is still competent to 4435 * init_css_set refcnting. init_css_set never goes away and we can't race
5074 * handle notify_on_release(), then leave the task attached to the 4436 * with migration path - PF_EXITING is visible to migration path.
5075 * root cgroup in each hierarchy for the remainder of its exit.
5076 *
5077 * To do this properly, we would increment the reference count on
5078 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
5079 * code we would add a second cgroup function call, to drop that
5080 * reference. This would just create an unnecessary hot spot on
5081 * the top_cgroup reference count, to no avail.
5082 *
5083 * Normally, holding a reference to a cgroup without bumping its
5084 * count is unsafe. The cgroup could go away, or someone could
5085 * attach us to a different cgroup, decrementing the count on
5086 * the first cgroup that we never incremented. But in this case,
5087 * top_cgroup isn't going away, and either task has PF_EXITING set,
5088 * which wards off any cgroup_attach_task() attempts, or task is a failed
5089 * fork, never visible to cgroup_attach_task.
5090 */ 4437 */
5091void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4438void cgroup_exit(struct task_struct *tsk)
5092{ 4439{
5093 struct cgroup_subsys *ss; 4440 struct cgroup_subsys *ss;
5094 struct css_set *cset; 4441 struct css_set *cset;
4442 bool put_cset = false;
5095 int i; 4443 int i;
5096 4444
5097 /* 4445 /*
5098 * Unlink from the css_set task list if necessary. 4446 * Unlink @tsk from its css_set. As migration path can't race
5098 * Unlink from the css_set task list if necessary. 4446 * Unlink @tsk from its css_set. As migration path can't race
5099 * Optimistically check cg_list before taking 4447 * with us, we can check cg_list without grabbing css_set_rwsem.
5100 * css_set_lock
5101 */ 4448 */
5102 if (!list_empty(&tsk->cg_list)) { 4449 if (!list_empty(&tsk->cg_list)) {
5103 write_lock(&css_set_lock); 4450 down_write(&css_set_rwsem);
5104 if (!list_empty(&tsk->cg_list)) 4451 list_del_init(&tsk->cg_list);
5105 list_del_init(&tsk->cg_list); 4452 up_write(&css_set_rwsem);
5106 write_unlock(&css_set_lock); 4453 put_cset = true;
5107 } 4454 }
5108 4455
5109 /* Reassign the task to the init_css_set. */ 4456 /* Reassign the task to the init_css_set. */
5110 task_lock(tsk);
5111 cset = task_css_set(tsk); 4457 cset = task_css_set(tsk);
5112 RCU_INIT_POINTER(tsk->cgroups, &init_css_set); 4458 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5113 4459
5114 if (run_callbacks && need_forkexit_callback) { 4460 if (need_forkexit_callback) {
5115 /* 4461 /* see cgroup_post_fork() for details */
5116 * fork/exit callbacks are supported only for builtin 4462 for_each_subsys(ss, i) {
5117 * subsystems, see cgroup_post_fork() for details.
5118 */
5119 for_each_builtin_subsys(ss, i) {
5120 if (ss->exit) { 4463 if (ss->exit) {
5121 struct cgroup_subsys_state *old_css = cset->subsys[i]; 4464 struct cgroup_subsys_state *old_css = cset->subsys[i];
5122 struct cgroup_subsys_state *css = task_css(tsk, i); 4465 struct cgroup_subsys_state *css = task_css(tsk, i);
@@ -5125,9 +4468,9 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5125 } 4468 }
5126 } 4469 }
5127 } 4470 }
5128 task_unlock(tsk);
5129 4471
5130 put_css_set_taskexit(cset); 4472 if (put_cset)
4473 put_css_set(cset, true);
5131} 4474}
5132 4475
5133static void check_for_release(struct cgroup *cgrp) 4476static void check_for_release(struct cgroup *cgrp)
@@ -5184,16 +4527,17 @@ static void cgroup_release_agent(struct work_struct *work)
5184 while (!list_empty(&release_list)) { 4527 while (!list_empty(&release_list)) {
5185 char *argv[3], *envp[3]; 4528 char *argv[3], *envp[3];
5186 int i; 4529 int i;
5187 char *pathbuf = NULL, *agentbuf = NULL; 4530 char *pathbuf = NULL, *agentbuf = NULL, *path;
5188 struct cgroup *cgrp = list_entry(release_list.next, 4531 struct cgroup *cgrp = list_entry(release_list.next,
5189 struct cgroup, 4532 struct cgroup,
5190 release_list); 4533 release_list);
5191 list_del_init(&cgrp->release_list); 4534 list_del_init(&cgrp->release_list);
5192 raw_spin_unlock(&release_list_lock); 4535 raw_spin_unlock(&release_list_lock);
5193 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4536 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5194 if (!pathbuf) 4537 if (!pathbuf)
5195 goto continue_free; 4538 goto continue_free;
5196 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) 4539 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
4540 if (!path)
5197 goto continue_free; 4541 goto continue_free;
5198 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 4542 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5199 if (!agentbuf) 4543 if (!agentbuf)
@@ -5201,7 +4545,7 @@ static void cgroup_release_agent(struct work_struct *work)
5201 4545
5202 i = 0; 4546 i = 0;
5203 argv[i++] = agentbuf; 4547 argv[i++] = agentbuf;
5204 argv[i++] = pathbuf; 4548 argv[i++] = path;
5205 argv[i] = NULL; 4549 argv[i] = NULL;
5206 4550
5207 i = 0; 4551 i = 0;
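
cgroup_release_agent() above assembles argv as the configured agent binary followed by the cgroup path, plus a minimal environment, and hands the result to a usermode helper. A rough userspace equivalent using fork()/execve(), with placeholder values standing in for both paths:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* placeholder values; the kernel reads these from the hierarchy */
	char *agent = "/bin/echo";		/* stands in for release_agent_path */
	char *path = "/mygroup/child";		/* stands in for the cgroup path */

	char *argv[] = { agent, path, NULL };
	char *envp[] = {
		"HOME=/",
		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
		NULL,
	};
	pid_t pid = fork();

	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0) {
		execve(agent, argv, envp);
		perror("execve");		/* only reached on failure */
		_exit(127);
	}
	waitpid(pid, NULL, 0);
	return 0;
}
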
@@ -5235,11 +4579,7 @@ static int __init cgroup_disable(char *str)
5235 if (!*token) 4579 if (!*token)
5236 continue; 4580 continue;
5237 4581
5238 /* 4582 for_each_subsys(ss, i) {
5239 * cgroup_disable, being at boot time, can't know about
5240 * module subsystems, so we don't worry about them.
5241 */
5242 for_each_builtin_subsys(ss, i) {
5243 if (!strcmp(token, ss->name)) { 4583 if (!strcmp(token, ss->name)) {
5244 ss->disabled = 1; 4584 ss->disabled = 1;
5245 printk(KERN_INFO "Disabling %s control group" 4585 printk(KERN_INFO "Disabling %s control group"
@@ -5253,28 +4593,42 @@ static int __init cgroup_disable(char *str)
5253__setup("cgroup_disable=", cgroup_disable); 4593__setup("cgroup_disable=", cgroup_disable);
5254 4594
5255/** 4595/**
5256 * css_from_dir - get corresponding css from the dentry of a cgroup dir 4596 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
5257 * @dentry: directory dentry of interest 4597 * @dentry: directory dentry of interest
5258 * @ss: subsystem of interest 4598 * @ss: subsystem of interest
5259 * 4599 *
5260 * Must be called under cgroup_mutex or RCU read lock. The caller is 4600 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
5261 * responsible for pinning the returned css if it needs to be accessed 4601 * to get the corresponding css and return it. If such css doesn't exist
5262 * outside the critical section. 4602 * or can't be pinned, an ERR_PTR value is returned.
5263 */ 4603 */
5264struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 4604struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
5265 struct cgroup_subsys *ss) 4605 struct cgroup_subsys *ss)
5266{ 4606{
4607 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4608 struct cgroup_subsys_state *css = NULL;
5267 struct cgroup *cgrp; 4609 struct cgroup *cgrp;
5268 4610
5269 cgroup_assert_mutex_or_rcu_locked();
5270
5271 /* is @dentry a cgroup dir? */ 4611 /* is @dentry a cgroup dir? */
5272 if (!dentry->d_inode || 4612 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
5273 dentry->d_inode->i_op != &cgroup_dir_inode_operations) 4613 kernfs_type(kn) != KERNFS_DIR)
5274 return ERR_PTR(-EBADF); 4614 return ERR_PTR(-EBADF);
5275 4615
5276 cgrp = __d_cgrp(dentry); 4616 rcu_read_lock();
5277 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); 4617
4618 /*
4619 * This path doesn't originate from kernfs and @kn could already
4620 * have been or be removed at any point. @kn->priv is RCU
 4621 * protected for this access. See cgroup_destroy_locked() for details.
4622 */
4623 cgrp = rcu_dereference(kn->priv);
4624 if (cgrp)
4625 css = cgroup_css(cgrp, ss);
4626
4627 if (!css || !css_tryget(css))
4628 css = ERR_PTR(-ENOENT);
4629
4630 rcu_read_unlock();
4631 return css;
5278} 4632}
5279 4633
5280/** 4634/**
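
css_tryget_from_dir() above combines two defensive steps: the kn->priv backpointer may be cleared by the removal path, so it is read under RCU and checked for NULL, and the css itself may already be dying, so css_tryget() only succeeds while the reference count is still live. The sketch below models only the second half, a tryget that succeeds only on a live object, using a compare-and-swap loop; names are invented and the RCU part is not modeled.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_int refcnt;		/* 0 means the object is being torn down */
};

/* Succeed only if the object still has at least one live reference. */
static bool obj_tryget(struct obj *o)
{
	int old = atomic_load(&o->refcnt);

	while (old > 0) {
		if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
			return true;	/* reference taken */
		/* @old was reloaded by the failed CAS; retry unless it hit 0 */
	}
	return false;
}

static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1)
		printf("last reference dropped, object can be freed\n");
}

int main(void)
{
	struct obj o;

	atomic_init(&o.refcnt, 1);
	printf("tryget on live object: %d\n", obj_tryget(&o));	/* 1 */
	obj_put(&o);
	obj_put(&o);						/* refcnt now 0 */
	printf("tryget on dying object: %d\n", obj_tryget(&o));	/* 0 */
	return 0;
}
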
@@ -5289,7 +4643,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5289{ 4643{
5290 struct cgroup *cgrp; 4644 struct cgroup *cgrp;
5291 4645
5292 cgroup_assert_mutex_or_rcu_locked(); 4646 cgroup_assert_mutexes_or_rcu_locked();
5293 4647
5294 cgrp = idr_find(&ss->root->cgroup_idr, id); 4648 cgrp = idr_find(&ss->root->cgroup_idr, id);
5295 if (cgrp) 4649 if (cgrp)
@@ -5341,23 +4695,25 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5341{ 4695{
5342 struct cgrp_cset_link *link; 4696 struct cgrp_cset_link *link;
5343 struct css_set *cset; 4697 struct css_set *cset;
4698 char *name_buf;
5344 4699
5345 read_lock(&css_set_lock); 4700 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
4701 if (!name_buf)
4702 return -ENOMEM;
4703
4704 down_read(&css_set_rwsem);
5346 rcu_read_lock(); 4705 rcu_read_lock();
5347 cset = rcu_dereference(current->cgroups); 4706 cset = rcu_dereference(current->cgroups);
5348 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 4707 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5349 struct cgroup *c = link->cgrp; 4708 struct cgroup *c = link->cgrp;
5350 const char *name;
5351 4709
5352 if (c->dentry) 4710 cgroup_name(c, name_buf, NAME_MAX + 1);
5353 name = c->dentry->d_name.name;
5354 else
5355 name = "?";
5356 seq_printf(seq, "Root %d group %s\n", 4711 seq_printf(seq, "Root %d group %s\n",
5357 c->root->hierarchy_id, name); 4712 c->root->hierarchy_id, name_buf);
5358 } 4713 }
5359 rcu_read_unlock(); 4714 rcu_read_unlock();
5360 read_unlock(&css_set_lock); 4715 up_read(&css_set_rwsem);
4716 kfree(name_buf);
5361 return 0; 4717 return 0;
5362} 4718}
5363 4719
@@ -5367,23 +4723,30 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5367 struct cgroup_subsys_state *css = seq_css(seq); 4723 struct cgroup_subsys_state *css = seq_css(seq);
5368 struct cgrp_cset_link *link; 4724 struct cgrp_cset_link *link;
5369 4725
5370 read_lock(&css_set_lock); 4726 down_read(&css_set_rwsem);
5371 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 4727 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5372 struct css_set *cset = link->cset; 4728 struct css_set *cset = link->cset;
5373 struct task_struct *task; 4729 struct task_struct *task;
5374 int count = 0; 4730 int count = 0;
4731
5375 seq_printf(seq, "css_set %p\n", cset); 4732 seq_printf(seq, "css_set %p\n", cset);
4733
5376 list_for_each_entry(task, &cset->tasks, cg_list) { 4734 list_for_each_entry(task, &cset->tasks, cg_list) {
5377 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 4735 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5378 seq_puts(seq, " ...\n"); 4736 goto overflow;
5379 break; 4737 seq_printf(seq, " task %d\n", task_pid_vnr(task));
5380 } else { 4738 }
5381 seq_printf(seq, " task %d\n", 4739
5382 task_pid_vnr(task)); 4740 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
5383 } 4741 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4742 goto overflow;
4743 seq_printf(seq, " task %d\n", task_pid_vnr(task));
5384 } 4744 }
4745 continue;
4746 overflow:
4747 seq_puts(seq, " ...\n");
5385 } 4748 }
5386 read_unlock(&css_set_lock); 4749 up_read(&css_set_rwsem);
5387 return 0; 4750 return 0;
5388} 4751}
5389 4752
@@ -5426,11 +4789,9 @@ static struct cftype debug_files[] = {
5426 { } /* terminate */ 4789 { } /* terminate */
5427}; 4790};
5428 4791
5429struct cgroup_subsys debug_subsys = { 4792struct cgroup_subsys debug_cgrp_subsys = {
5430 .name = "debug",
5431 .css_alloc = debug_css_alloc, 4793 .css_alloc = debug_css_alloc,
5432 .css_free = debug_css_free, 4794 .css_free = debug_css_free,
5433 .subsys_id = debug_subsys_id,
5434 .base_cftypes = debug_files, 4795 .base_cftypes = debug_files,
5435}; 4796};
5436#endif /* CONFIG_CGROUP_DEBUG */ 4797#endif /* CONFIG_CGROUP_DEBUG */
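The debug controller above is the first of several in this series to lose its explicit .name and .subsys_id fields; the <name>_cgrp_subsys symbol itself now carries the identity, with the matching <name>_cgrp_id constants showing up in the per-controller files below. A sketch of what a controller declaration looks like under the new convention; the "demo" controller and its callbacks are hypothetical, not part of this diff:

static struct cgroup_subsys_state *
demo_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *css;

	css = kzalloc(sizeof(*css), GFP_KERNEL);
	return css ? css : ERR_PTR(-ENOMEM);
}

static void demo_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}

static struct cftype demo_files[] = {
	{ }	/* terminate */
};

/* no .name, no .subsys_id -- the symbol name is the registration key */
struct cgroup_subsys demo_cgrp_subsys = {
	.css_alloc	= demo_css_alloc,
	.css_free	= demo_css_free,
	.base_cftypes	= demo_files,
};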
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 6c3154e477f6..2bc4a2256444 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -52,7 +52,7 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
52 52
53static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
54{ 54{
55 return css_freezer(task_css(task, freezer_subsys_id)); 55 return css_freezer(task_css(task, freezer_cgrp_id));
56} 56}
57 57
58static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
@@ -84,8 +84,6 @@ static const char *freezer_state_strs(unsigned int state)
84 return "THAWED"; 84 return "THAWED";
85}; 85};
86 86
87struct cgroup_subsys freezer_subsys;
88
89static struct cgroup_subsys_state * 87static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css) 88freezer_css_alloc(struct cgroup_subsys_state *parent_css)
91{ 89{
@@ -189,7 +187,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
189 * current state before executing the following - !frozen tasks may 187 * current state before executing the following - !frozen tasks may
190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 188 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
191 */ 189 */
192 cgroup_taskset_for_each(task, new_css, tset) { 190 cgroup_taskset_for_each(task, tset) {
193 if (!(freezer->state & CGROUP_FREEZING)) { 191 if (!(freezer->state & CGROUP_FREEZING)) {
194 __thaw_task(task); 192 __thaw_task(task);
195 } else { 193 } else {
@@ -216,6 +214,16 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
216 } 214 }
217} 215}
218 216
217/**
218 * freezer_fork - cgroup post fork callback
219 * @task: a task which has just been forked
220 *
221 * @task has just been created and should conform to the current state of
222 * the cgroup_freezer it belongs to. This function may race against
223 * freezer_attach(). Losing to freezer_attach() means that we don't have
224 * to do anything as freezer_attach() will put @task into the appropriate
225 * state.
226 */
219static void freezer_fork(struct task_struct *task) 227static void freezer_fork(struct task_struct *task)
220{ 228{
221 struct freezer *freezer; 229 struct freezer *freezer;
@@ -224,14 +232,26 @@ static void freezer_fork(struct task_struct *task)
224 freezer = task_freezer(task); 232 freezer = task_freezer(task);
225 233
226 /* 234 /*
227 * The root cgroup is non-freezable, so we can skip the 235 * The root cgroup is non-freezable, so we can skip locking the
228 * following check. 236 * freezer. This is safe regardless of race with task migration.
237 * If we didn't race or won, skipping is obviously the right thing
238 * to do. If we lost and root is the new cgroup, noop is still the
239 * right thing to do.
229 */ 240 */
230 if (!parent_freezer(freezer)) 241 if (!parent_freezer(freezer))
231 goto out; 242 goto out;
232 243
244 /*
245 * Grab @freezer->lock and freeze @task after verifying @task still
246 * belongs to @freezer and it's freezing. The former is for the
247 * case where we have raced against task migration and lost and
248 * @task is already in a different cgroup which may not be frozen.
249 * This isn't strictly necessary as freeze_task() is allowed to be
250 * called spuriously but let's do it anyway for, if nothing else,
251 * documentation.
252 */
233 spin_lock_irq(&freezer->lock); 253 spin_lock_irq(&freezer->lock);
234 if (freezer->state & CGROUP_FREEZING) 254 if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
235 freeze_task(task); 255 freeze_task(task);
236 spin_unlock_irq(&freezer->lock); 256 spin_unlock_irq(&freezer->lock);
237out: 257out:
@@ -422,7 +442,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
422} 442}
423 443
424static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, 444static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
425 const char *buffer) 445 char *buffer)
426{ 446{
427 bool freeze; 447 bool freeze;
428 448
@@ -473,13 +493,11 @@ static struct cftype files[] = {
473 { } /* terminate */ 493 { } /* terminate */
474}; 494};
475 495
476struct cgroup_subsys freezer_subsys = { 496struct cgroup_subsys freezer_cgrp_subsys = {
477 .name = "freezer",
478 .css_alloc = freezer_css_alloc, 497 .css_alloc = freezer_css_alloc,
479 .css_online = freezer_css_online, 498 .css_online = freezer_css_online,
480 .css_offline = freezer_css_offline, 499 .css_offline = freezer_css_offline,
481 .css_free = freezer_css_free, 500 .css_free = freezer_css_free,
482 .subsys_id = freezer_subsys_id,
483 .attach = freezer_attach, 501 .attach = freezer_attach,
484 .fork = freezer_fork, 502 .fork = freezer_fork,
485 .base_cftypes = files, 503 .base_cftypes = files,
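freezer_attach() above also picks up the two-argument form of cgroup_taskset_for_each(); the extra css parameter is gone from the iterator across every controller touched in this series (cpuset, perf_event and the cpu controller below make the same change). A sketch of the resulting attach-callback shape, with the per-task work reduced to __thaw_task() purely for illustration and a hypothetical "demo" name:

static void demo_attach(struct cgroup_subsys_state *css,
			struct cgroup_taskset *tset)
{
	struct task_struct *task;

	/* walk every task in the migration set; no css filter argument */
	cgroup_taskset_for_each(task, tset)
		__thaw_task(task);	/* stand-in for real per-task work */
}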
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f6fc7475f1a1..3d54c418bd06 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -119,7 +119,7 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
120static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
121{ 121{
122 return css_cs(task_css(task, cpuset_subsys_id)); 122 return css_cs(task_css(task, cpuset_cgrp_id));
123} 123}
124 124
125static inline struct cpuset *parent_cs(struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
@@ -467,7 +467,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
467 * be changed to have empty cpus_allowed or mems_allowed. 467 * be changed to have empty cpus_allowed or mems_allowed.
468 */ 468 */
469 ret = -ENOSPC; 469 ret = -ENOSPC;
470 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { 470 if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
471 if (!cpumask_empty(cur->cpus_allowed) && 471 if (!cpumask_empty(cur->cpus_allowed) &&
472 cpumask_empty(trial->cpus_allowed)) 472 cpumask_empty(trial->cpus_allowed))
473 goto out; 473 goto out;
@@ -829,55 +829,36 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
829} 829}
830 830
831/** 831/**
832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
833 * @tsk: task to test
834 * @data: cpuset to @tsk belongs to
835 *
836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
837 * mask needs to be changed.
838 *
839 * We don't need to re-check for the cgroup/cpuset membership, since we're
840 * holding cpuset_mutex at this point.
841 */
842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
843{
844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
846
847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
848}
849
850/**
851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 832 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 833 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
854 *
855 * Called with cpuset_mutex held
856 * 834 *
857 * The css_scan_tasks() function will scan all the tasks in a cgroup, 835 * Iterate through each task of @cs updating its cpus_allowed to the
858 * calling callback functions for each. 836 * effective cpuset's. As this function is called with cpuset_mutex held,
859 * 837 * cpuset membership stays stable.
860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
861 * if @heap != NULL.
862 */ 838 */
863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 839static void update_tasks_cpumask(struct cpuset *cs)
864{ 840{
865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); 841 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
842 struct css_task_iter it;
843 struct task_struct *task;
844
845 css_task_iter_start(&cs->css, &it);
846 while ((task = css_task_iter_next(&it)))
847 set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
848 css_task_iter_end(&it);
866} 849}
867 850
868/* 851/*
869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 852 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
870 * @root_cs: the root cpuset of the hierarchy 853 * @root_cs: the root cpuset of the hierarchy
871 * @update_root: update root cpuset or not? 854 * @update_root: update root cpuset or not?
872 * @heap: the heap used by css_scan_tasks()
873 * 855 *
874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 856 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
875 * which take on cpumask of @root_cs. 857 * which take on cpumask of @root_cs.
876 * 858 *
877 * Called with cpuset_mutex held 859 * Called with cpuset_mutex held
878 */ 860 */
879static void update_tasks_cpumask_hier(struct cpuset *root_cs, 861static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
880 bool update_root, struct ptr_heap *heap)
881{ 862{
882 struct cpuset *cp; 863 struct cpuset *cp;
883 struct cgroup_subsys_state *pos_css; 864 struct cgroup_subsys_state *pos_css;
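update_tasks_cpumask() above is the template for the rest of the cpuset conversion: css_scan_tasks() plus a preallocated ptr_heap is replaced by an open-coded css_task_iter walk, which is why the @heap plumbing and the heap_init()/heap_free() calls disappear from every caller later in this file. A condensed sketch of the iterator pattern, assuming a hypothetical helper name:

static void demo_update_tasks(struct cpuset *cs, const struct cpumask *mask)
{
	struct css_task_iter it;
	struct task_struct *task;

	/* membership is stable because callers hold cpuset_mutex */
	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it)))
		set_cpus_allowed_ptr(task, mask);
	css_task_iter_end(&it);
}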
@@ -898,7 +879,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
898 continue; 879 continue;
899 rcu_read_unlock(); 880 rcu_read_unlock();
900 881
901 update_tasks_cpumask(cp, heap); 882 update_tasks_cpumask(cp);
902 883
903 rcu_read_lock(); 884 rcu_read_lock();
904 css_put(&cp->css); 885 css_put(&cp->css);
@@ -914,7 +895,6 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
914static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 895static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
915 const char *buf) 896 const char *buf)
916{ 897{
917 struct ptr_heap heap;
918 int retval; 898 int retval;
919 int is_load_balanced; 899 int is_load_balanced;
920 900
@@ -947,19 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
947 if (retval < 0) 927 if (retval < 0)
948 return retval; 928 return retval;
949 929
950 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
951 if (retval)
952 return retval;
953
954 is_load_balanced = is_sched_load_balance(trialcs); 930 is_load_balanced = is_sched_load_balance(trialcs);
955 931
956 mutex_lock(&callback_mutex); 932 mutex_lock(&callback_mutex);
957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 933 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
958 mutex_unlock(&callback_mutex); 934 mutex_unlock(&callback_mutex);
959 935
960 update_tasks_cpumask_hier(cs, true, &heap); 936 update_tasks_cpumask_hier(cs, true);
961
962 heap_free(&heap);
963 937
964 if (is_load_balanced) 938 if (is_load_balanced)
965 rebuild_sched_domains_locked(); 939 rebuild_sched_domains_locked();
@@ -1048,53 +1022,22 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1048 task_unlock(tsk); 1022 task_unlock(tsk);
1049} 1023}
1050 1024
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1056/*
1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1060 */
1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1062{
1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1065 struct mm_struct *mm;
1066 int migrate;
1067
1068 cpuset_change_task_nodemask(p, arg->newmems);
1069
1070 mm = get_task_mm(p);
1071 if (!mm)
1072 return;
1073
1074 migrate = is_memory_migrate(cs);
1075
1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1077 if (migrate)
1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1079 mmput(mm);
1080}
1081
1082static void *cpuset_being_rebound; 1025static void *cpuset_being_rebound;
1083 1026
1084/** 1027/**
1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1028 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1029 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1088 * 1030 *
1089 * Called with cpuset_mutex held. No return value. It's guaranteed that 1031 * Iterate through each task of @cs updating its mems_allowed to the
1090 * css_scan_tasks() always returns 0 if @heap != NULL. 1032 * effective cpuset's. As this function is called with cpuset_mutex held,
1033 * cpuset membership stays stable.
1091 */ 1034 */
1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1035static void update_tasks_nodemask(struct cpuset *cs)
1093{ 1036{
1094 static nodemask_t newmems; /* protected by cpuset_mutex */ 1037 static nodemask_t newmems; /* protected by cpuset_mutex */
1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1038 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs, 1039 struct css_task_iter it;
1097 .newmems = &newmems }; 1040 struct task_struct *task;
1098 1041
1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1042 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1100 1043
@@ -1110,7 +1053,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1053 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1111 * is idempotent. Also migrate pages in each mm to new nodes. 1054 * is idempotent. Also migrate pages in each mm to new nodes.
1112 */ 1055 */
1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); 1056 css_task_iter_start(&cs->css, &it);
1057 while ((task = css_task_iter_next(&it))) {
1058 struct mm_struct *mm;
1059 bool migrate;
1060
1061 cpuset_change_task_nodemask(task, &newmems);
1062
1063 mm = get_task_mm(task);
1064 if (!mm)
1065 continue;
1066
1067 migrate = is_memory_migrate(cs);
1068
1069 mpol_rebind_mm(mm, &cs->mems_allowed);
1070 if (migrate)
1071 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1072 mmput(mm);
1073 }
1074 css_task_iter_end(&it);
1114 1075
1115 /* 1076 /*
1116 * All the tasks' nodemasks have been updated, update 1077 * All the tasks' nodemasks have been updated, update
@@ -1126,15 +1087,13 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1087 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1127 * @cs: the root cpuset of the hierarchy 1088 * @cs: the root cpuset of the hierarchy
1128 * @update_root: update the root cpuset or not? 1089 * @update_root: update the root cpuset or not?
1129 * @heap: the heap used by css_scan_tasks()
1130 * 1090 *
1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1091 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1132 * which take on nodemask of @root_cs. 1092 * which take on nodemask of @root_cs.
1133 * 1093 *
1134 * Called with cpuset_mutex held 1094 * Called with cpuset_mutex held
1135 */ 1095 */
1136static void update_tasks_nodemask_hier(struct cpuset *root_cs, 1096static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
1137 bool update_root, struct ptr_heap *heap)
1138{ 1097{
1139 struct cpuset *cp; 1098 struct cpuset *cp;
1140 struct cgroup_subsys_state *pos_css; 1099 struct cgroup_subsys_state *pos_css;
@@ -1155,7 +1114,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1155 continue; 1114 continue;
1156 rcu_read_unlock(); 1115 rcu_read_unlock();
1157 1116
1158 update_tasks_nodemask(cp, heap); 1117 update_tasks_nodemask(cp);
1159 1118
1160 rcu_read_lock(); 1119 rcu_read_lock();
1161 css_put(&cp->css); 1120 css_put(&cp->css);
@@ -1180,7 +1139,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1180 const char *buf) 1139 const char *buf)
1181{ 1140{
1182 int retval; 1141 int retval;
1183 struct ptr_heap heap;
1184 1142
1185 /* 1143 /*
1186 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1144 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
@@ -1219,17 +1177,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1219 if (retval < 0) 1177 if (retval < 0)
1220 goto done; 1178 goto done;
1221 1179
1222 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1223 if (retval < 0)
1224 goto done;
1225
1226 mutex_lock(&callback_mutex); 1180 mutex_lock(&callback_mutex);
1227 cs->mems_allowed = trialcs->mems_allowed; 1181 cs->mems_allowed = trialcs->mems_allowed;
1228 mutex_unlock(&callback_mutex); 1182 mutex_unlock(&callback_mutex);
1229 1183
1230 update_tasks_nodemask_hier(cs, true, &heap); 1184 update_tasks_nodemask_hier(cs, true);
1231
1232 heap_free(&heap);
1233done: 1185done:
1234 return retval; 1186 return retval;
1235} 1187}
@@ -1257,38 +1209,22 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1257} 1209}
1258 1210
1259/** 1211/**
1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1261 * @tsk: task to be updated
1262 * @data: cpuset to @tsk belongs to
1263 *
1264 * Called by css_scan_tasks() for each task in a cgroup.
1265 *
1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1267 * holding cpuset_mutex at this point.
1268 */
1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1270{
1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1274}
1275
1276/**
1277 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1212 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1278 * @cs: the cpuset in which each task's spread flags needs to be changed 1213 * @cs: the cpuset in which each task's spread flags needs to be changed
1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1280 *
1281 * Called with cpuset_mutex held
1282 * 1214 *
1283 * The css_scan_tasks() function will scan all the tasks in a cgroup, 1215 * Iterate through each task of @cs updating its spread flags. As this
1284 * calling callback functions for each. 1216 * function is called with cpuset_mutex held, cpuset membership stays
1285 * 1217 * stable.
1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1287 * if @heap != NULL.
1288 */ 1218 */
1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1219static void update_tasks_flags(struct cpuset *cs)
1290{ 1220{
1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); 1221 struct css_task_iter it;
1222 struct task_struct *task;
1223
1224 css_task_iter_start(&cs->css, &it);
1225 while ((task = css_task_iter_next(&it)))
1226 cpuset_update_task_spread_flag(cs, task);
1227 css_task_iter_end(&it);
1292} 1228}
1293 1229
1294/* 1230/*
@@ -1306,7 +1242,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1306 struct cpuset *trialcs; 1242 struct cpuset *trialcs;
1307 int balance_flag_changed; 1243 int balance_flag_changed;
1308 int spread_flag_changed; 1244 int spread_flag_changed;
1309 struct ptr_heap heap;
1310 int err; 1245 int err;
1311 1246
1312 trialcs = alloc_trial_cpuset(cs); 1247 trialcs = alloc_trial_cpuset(cs);
@@ -1322,10 +1257,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1322 if (err < 0) 1257 if (err < 0)
1323 goto out; 1258 goto out;
1324 1259
1325 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1326 if (err < 0)
1327 goto out;
1328
1329 balance_flag_changed = (is_sched_load_balance(cs) != 1260 balance_flag_changed = (is_sched_load_balance(cs) !=
1330 is_sched_load_balance(trialcs)); 1261 is_sched_load_balance(trialcs));
1331 1262
@@ -1340,8 +1271,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1340 rebuild_sched_domains_locked(); 1271 rebuild_sched_domains_locked();
1341 1272
1342 if (spread_flag_changed) 1273 if (spread_flag_changed)
1343 update_tasks_flags(cs, &heap); 1274 update_tasks_flags(cs);
1344 heap_free(&heap);
1345out: 1275out:
1346 free_trial_cpuset(trialcs); 1276 free_trial_cpuset(trialcs);
1347 return err; 1277 return err;
@@ -1445,6 +1375,8 @@ static int fmeter_getrate(struct fmeter *fmp)
1445 return val; 1375 return val;
1446} 1376}
1447 1377
1378static struct cpuset *cpuset_attach_old_cs;
1379
1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1380/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1449static int cpuset_can_attach(struct cgroup_subsys_state *css, 1381static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset) 1382 struct cgroup_taskset *tset)
@@ -1453,6 +1385,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1453 struct task_struct *task; 1385 struct task_struct *task;
1454 int ret; 1386 int ret;
1455 1387
1388 /* used later by cpuset_attach() */
1389 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
1390
1456 mutex_lock(&cpuset_mutex); 1391 mutex_lock(&cpuset_mutex);
1457 1392
1458 /* 1393 /*
@@ -1464,7 +1399,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1399 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1465 goto out_unlock; 1400 goto out_unlock;
1466 1401
1467 cgroup_taskset_for_each(task, css, tset) { 1402 cgroup_taskset_for_each(task, tset) {
1468 /* 1403 /*
1469 * Kthreads which disallow setaffinity shouldn't be moved 1404 * Kthreads which disallow setaffinity shouldn't be moved
1470 * to a new cpuset; we don't want to change their cpu 1405 * to a new cpuset; we don't want to change their cpu
@@ -1516,10 +1451,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1516 struct mm_struct *mm; 1451 struct mm_struct *mm;
1517 struct task_struct *task; 1452 struct task_struct *task;
1518 struct task_struct *leader = cgroup_taskset_first(tset); 1453 struct task_struct *leader = cgroup_taskset_first(tset);
1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1520 cpuset_subsys_id);
1521 struct cpuset *cs = css_cs(css); 1454 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss); 1455 struct cpuset *oldcs = cpuset_attach_old_cs;
1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1456 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1457 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1525 1458
@@ -1533,7 +1466,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1533 1466
1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1467 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1535 1468
1536 cgroup_taskset_for_each(task, css, tset) { 1469 cgroup_taskset_for_each(task, tset) {
1537 /* 1470 /*
1538 * can_attach beforehand should guarantee that this doesn't 1471 * can_attach beforehand should guarantee that this doesn't
1539 * fail. TODO: have a better way to handle failure here 1472 * fail. TODO: have a better way to handle failure here
@@ -1673,7 +1606,7 @@ out_unlock:
1673 * Common handling for a write to a "cpus" or "mems" file. 1606 * Common handling for a write to a "cpus" or "mems" file.
1674 */ 1607 */
1675static int cpuset_write_resmask(struct cgroup_subsys_state *css, 1608static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1676 struct cftype *cft, const char *buf) 1609 struct cftype *cft, char *buf)
1677{ 1610{
1678 struct cpuset *cs = css_cs(css); 1611 struct cpuset *cs = css_cs(css);
1679 struct cpuset *trialcs; 1612 struct cpuset *trialcs;
@@ -2020,8 +1953,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
2020 kfree(cs); 1953 kfree(cs);
2021} 1954}
2022 1955
2023struct cgroup_subsys cpuset_subsys = { 1956struct cgroup_subsys cpuset_cgrp_subsys = {
2024 .name = "cpuset",
2025 .css_alloc = cpuset_css_alloc, 1957 .css_alloc = cpuset_css_alloc,
2026 .css_online = cpuset_css_online, 1958 .css_online = cpuset_css_online,
2027 .css_offline = cpuset_css_offline, 1959 .css_offline = cpuset_css_offline,
@@ -2029,7 +1961,6 @@ struct cgroup_subsys cpuset_subsys = {
2029 .can_attach = cpuset_can_attach, 1961 .can_attach = cpuset_can_attach,
2030 .cancel_attach = cpuset_cancel_attach, 1962 .cancel_attach = cpuset_cancel_attach,
2031 .attach = cpuset_attach, 1963 .attach = cpuset_attach,
2032 .subsys_id = cpuset_subsys_id,
2033 .base_cftypes = files, 1964 .base_cftypes = files,
2034 .early_init = 1, 1965 .early_init = 1,
2035}; 1966};
@@ -2086,10 +2017,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2086 parent = parent_cs(parent); 2017 parent = parent_cs(parent);
2087 2018
2088 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 2019 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2089 rcu_read_lock(); 2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset ");
2090 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", 2021 pr_cont_cgroup_name(cs->css.cgroup);
2091 cgroup_name(cs->css.cgroup)); 2022 pr_cont("\n");
2092 rcu_read_unlock();
2093 } 2023 }
2094} 2024}
2095 2025
@@ -2137,7 +2067,7 @@ retry:
2137 */ 2067 */
2138 if ((sane && cpumask_empty(cs->cpus_allowed)) || 2068 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2139 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) 2069 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2140 update_tasks_cpumask(cs, NULL); 2070 update_tasks_cpumask(cs);
2141 2071
2142 mutex_lock(&callback_mutex); 2072 mutex_lock(&callback_mutex);
2143 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2073 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
@@ -2151,7 +2081,7 @@ retry:
2151 */ 2081 */
2152 if ((sane && nodes_empty(cs->mems_allowed)) || 2082 if ((sane && nodes_empty(cs->mems_allowed)) ||
2153 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) 2083 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2154 update_tasks_nodemask(cs, NULL); 2084 update_tasks_nodemask(cs);
2155 2085
2156 is_empty = cpumask_empty(cs->cpus_allowed) || 2086 is_empty = cpumask_empty(cs->cpus_allowed) ||
2157 nodes_empty(cs->mems_allowed); 2087 nodes_empty(cs->mems_allowed);
@@ -2213,7 +2143,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2213 mutex_lock(&callback_mutex); 2143 mutex_lock(&callback_mutex);
2214 top_cpuset.mems_allowed = new_mems; 2144 top_cpuset.mems_allowed = new_mems;
2215 mutex_unlock(&callback_mutex); 2145 mutex_unlock(&callback_mutex);
2216 update_tasks_nodemask(&top_cpuset, NULL); 2146 update_tasks_nodemask(&top_cpuset);
2217 } 2147 }
2218 2148
2219 mutex_unlock(&cpuset_mutex); 2149 mutex_unlock(&cpuset_mutex);
@@ -2305,10 +2235,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2305 struct cpuset *cpus_cs; 2235 struct cpuset *cpus_cs;
2306 2236
2307 mutex_lock(&callback_mutex); 2237 mutex_lock(&callback_mutex);
2308 task_lock(tsk); 2238 rcu_read_lock();
2309 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2239 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2310 guarantee_online_cpus(cpus_cs, pmask); 2240 guarantee_online_cpus(cpus_cs, pmask);
2311 task_unlock(tsk); 2241 rcu_read_unlock();
2312 mutex_unlock(&callback_mutex); 2242 mutex_unlock(&callback_mutex);
2313} 2243}
2314 2244
@@ -2361,10 +2291,10 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2361 nodemask_t mask; 2291 nodemask_t mask;
2362 2292
2363 mutex_lock(&callback_mutex); 2293 mutex_lock(&callback_mutex);
2364 task_lock(tsk); 2294 rcu_read_lock();
2365 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 2295 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
2366 guarantee_online_mems(mems_cs, &mask); 2296 guarantee_online_mems(mems_cs, &mask);
2367 task_unlock(tsk); 2297 rcu_read_unlock();
2368 mutex_unlock(&callback_mutex); 2298 mutex_unlock(&callback_mutex);
2369 2299
2370 return mask; 2300 return mask;
@@ -2480,10 +2410,10 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2480 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2410 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2481 mutex_lock(&callback_mutex); 2411 mutex_lock(&callback_mutex);
2482 2412
2483 task_lock(current); 2413 rcu_read_lock();
2484 cs = nearest_hardwall_ancestor(task_cs(current)); 2414 cs = nearest_hardwall_ancestor(task_cs(current));
2485 allowed = node_isset(node, cs->mems_allowed); 2415 allowed = node_isset(node, cs->mems_allowed);
2486 task_unlock(current); 2416 rcu_read_unlock();
2487 2417
2488 mutex_unlock(&callback_mutex); 2418 mutex_unlock(&callback_mutex);
2489 return allowed; 2419 return allowed;
@@ -2609,27 +2539,27 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2609 * @task: pointer to task_struct of some task. 2539 * @task: pointer to task_struct of some task.
2610 * 2540 *
2611 * Description: Prints @task's name, cpuset name, and cached copy of its 2541 * Description: Prints @task's name, cpuset name, and cached copy of its
2612 * mems_allowed to the kernel log. Must hold task_lock(task) to allow 2542 * mems_allowed to the kernel log.
2613 * dereferencing task_cs(task).
2614 */ 2543 */
2615void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2544void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2616{ 2545{
2617 /* Statically allocated to prevent using excess stack. */ 2546 /* Statically allocated to prevent using excess stack. */
2618 static char cpuset_nodelist[CPUSET_NODELIST_LEN]; 2547 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2619 static DEFINE_SPINLOCK(cpuset_buffer_lock); 2548 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2549 struct cgroup *cgrp;
2620 2550
2621 struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
2622
2623 rcu_read_lock();
2624 spin_lock(&cpuset_buffer_lock); 2551 spin_lock(&cpuset_buffer_lock);
2552 rcu_read_lock();
2625 2553
2554 cgrp = task_cs(tsk)->css.cgroup;
2626 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2555 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2627 tsk->mems_allowed); 2556 tsk->mems_allowed);
2628 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2557 printk(KERN_INFO "%s cpuset=", tsk->comm);
2629 tsk->comm, cgroup_name(cgrp), cpuset_nodelist); 2558 pr_cont_cgroup_name(cgrp);
2559 pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
2630 2560
2631 spin_unlock(&cpuset_buffer_lock);
2632 rcu_read_unlock(); 2561 rcu_read_unlock();
2562 spin_unlock(&cpuset_buffer_lock);
2633} 2563}
2634 2564
2635/* 2565/*
@@ -2660,9 +2590,9 @@ int cpuset_memory_pressure_enabled __read_mostly;
2660 2590
2661void __cpuset_memory_pressure_bump(void) 2591void __cpuset_memory_pressure_bump(void)
2662{ 2592{
2663 task_lock(current); 2593 rcu_read_lock();
2664 fmeter_markevent(&task_cs(current)->fmeter); 2594 fmeter_markevent(&task_cs(current)->fmeter);
2665 task_unlock(current); 2595 rcu_read_unlock();
2666} 2596}
2667 2597
2668#ifdef CONFIG_PROC_PID_CPUSET 2598#ifdef CONFIG_PROC_PID_CPUSET
@@ -2679,12 +2609,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2679{ 2609{
2680 struct pid *pid; 2610 struct pid *pid;
2681 struct task_struct *tsk; 2611 struct task_struct *tsk;
2682 char *buf; 2612 char *buf, *p;
2683 struct cgroup_subsys_state *css; 2613 struct cgroup_subsys_state *css;
2684 int retval; 2614 int retval;
2685 2615
2686 retval = -ENOMEM; 2616 retval = -ENOMEM;
2687 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2617 buf = kmalloc(PATH_MAX, GFP_KERNEL);
2688 if (!buf) 2618 if (!buf)
2689 goto out; 2619 goto out;
2690 2620
@@ -2694,14 +2624,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2694 if (!tsk) 2624 if (!tsk)
2695 goto out_free; 2625 goto out_free;
2696 2626
2627 retval = -ENAMETOOLONG;
2697 rcu_read_lock(); 2628 rcu_read_lock();
2698 css = task_css(tsk, cpuset_subsys_id); 2629 css = task_css(tsk, cpuset_cgrp_id);
2699 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2630 p = cgroup_path(css->cgroup, buf, PATH_MAX);
2700 rcu_read_unlock(); 2631 rcu_read_unlock();
2701 if (retval < 0) 2632 if (!p)
2702 goto out_put_task; 2633 goto out_put_task;
2703 seq_puts(m, buf); 2634 seq_puts(m, p);
2704 seq_putc(m, '\n'); 2635 seq_putc(m, '\n');
2636 retval = 0;
2705out_put_task: 2637out_put_task:
2706 put_task_struct(tsk); 2638 put_task_struct(tsk);
2707out_free: 2639out_free:
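proc_cpuset_show() above also reflects the new cgroup_path() calling convention used elsewhere in this series (see task_group_path() in kernel/sched/debug.c below): the function now returns a pointer into the supplied buffer, or NULL when the buffer is too small, instead of an integer length/error. A sketch under those assumptions, with the RCU locking around the css lookup omitted and a hypothetical "demo" name:

static int demo_show_cgroup_path(struct seq_file *m, struct cgroup *cgrp)
{
	char *buf, *p;
	int ret;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	p = cgroup_path(cgrp, buf, PATH_MAX);	/* NULL if PATH_MAX is too small */
	if (!p) {
		ret = -ENAMETOOLONG;
		goto out;
	}
	seq_puts(m, p);
	seq_putc(m, '\n');
	ret = 0;
out:
	kfree(buf);
	return ret;
}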
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 661951ab8ae7..f83a71a3e46d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -361,7 +361,7 @@ struct perf_cgroup {
361static inline struct perf_cgroup * 361static inline struct perf_cgroup *
362perf_cgroup_from_task(struct task_struct *task) 362perf_cgroup_from_task(struct task_struct *task)
363{ 363{
364 return container_of(task_css(task, perf_subsys_id), 364 return container_of(task_css(task, perf_event_cgrp_id),
365 struct perf_cgroup, css); 365 struct perf_cgroup, css);
366} 366}
367 367
@@ -389,11 +389,6 @@ perf_cgroup_match(struct perf_event *event)
389 event->cgrp->css.cgroup); 389 event->cgrp->css.cgroup);
390} 390}
391 391
392static inline bool perf_tryget_cgroup(struct perf_event *event)
393{
394 return css_tryget(&event->cgrp->css);
395}
396
397static inline void perf_put_cgroup(struct perf_event *event) 392static inline void perf_put_cgroup(struct perf_event *event)
398{ 393{
399 css_put(&event->cgrp->css); 394 css_put(&event->cgrp->css);
@@ -612,9 +607,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
612 if (!f.file) 607 if (!f.file)
613 return -EBADF; 608 return -EBADF;
614 609
615 rcu_read_lock(); 610 css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys);
616
617 css = css_from_dir(f.file->f_dentry, &perf_subsys);
618 if (IS_ERR(css)) { 611 if (IS_ERR(css)) {
619 ret = PTR_ERR(css); 612 ret = PTR_ERR(css);
620 goto out; 613 goto out;
@@ -623,13 +616,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
623 cgrp = container_of(css, struct perf_cgroup, css); 616 cgrp = container_of(css, struct perf_cgroup, css);
624 event->cgrp = cgrp; 617 event->cgrp = cgrp;
625 618
626 /* must be done before we fput() the file */
627 if (!perf_tryget_cgroup(event)) {
628 event->cgrp = NULL;
629 ret = -ENOENT;
630 goto out;
631 }
632
633 /* 619 /*
634 * all events in a group must monitor 620 * all events in a group must monitor
635 * the same cgroup because a task belongs 621 * the same cgroup because a task belongs
@@ -640,7 +626,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
640 ret = -EINVAL; 626 ret = -EINVAL;
641 } 627 }
642out: 628out:
643 rcu_read_unlock();
644 fdput(f); 629 fdput(f);
645 return ret; 630 return ret;
646} 631}
@@ -8053,7 +8038,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
8053{ 8038{
8054 struct task_struct *task; 8039 struct task_struct *task;
8055 8040
8056 cgroup_taskset_for_each(task, css, tset) 8041 cgroup_taskset_for_each(task, tset)
8057 task_function_call(task, __perf_cgroup_move, task); 8042 task_function_call(task, __perf_cgroup_move, task);
8058} 8043}
8059 8044
@@ -8072,9 +8057,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css,
8072 task_function_call(task, __perf_cgroup_move, task); 8057 task_function_call(task, __perf_cgroup_move, task);
8073} 8058}
8074 8059
8075struct cgroup_subsys perf_subsys = { 8060struct cgroup_subsys perf_event_cgrp_subsys = {
8076 .name = "perf_event",
8077 .subsys_id = perf_subsys_id,
8078 .css_alloc = perf_cgroup_css_alloc, 8061 .css_alloc = perf_cgroup_css_alloc,
8079 .css_free = perf_cgroup_css_free, 8062 .css_free = perf_cgroup_css_free,
8080 .exit = perf_cgroup_exit, 8063 .exit = perf_cgroup_exit,
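perf_cgroup_connect() above is simplified by css_tryget_from_dir(), which resolves a css from a cgroup directory dentry and takes the reference in one step, so the old rcu_read_lock()/css_from_dir()/css_tryget() sequence and the perf_tryget_cgroup() wrapper can go. A sketch of the lookup half on its own, reusing the perf_event_cgrp_subsys from the hunk; the helper name is illustrative:

static struct cgroup_subsys_state *demo_css_from_fd(int fd)
{
	struct cgroup_subsys_state *css;
	struct fd f = fdget(fd);

	if (!f.file)
		return ERR_PTR(-EBADF);

	/* returns a referenced css, or ERR_PTR() on failure */
	css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys);
	fdput(f);
	return css;
}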
diff --git a/kernel/exit.c b/kernel/exit.c
index 1e77fc645317..6480d1c85d7a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -797,7 +797,7 @@ void do_exit(long code)
797 */ 797 */
798 perf_event_exit_task(tsk); 798 perf_event_exit_task(tsk);
799 799
800 cgroup_exit(tsk, 1); 800 cgroup_exit(tsk);
801 801
802 if (group_dead) 802 if (group_dead)
803 disassociate_ctty(1); 803 disassociate_ctty(1);
diff --git a/kernel/fork.c b/kernel/fork.c
index 332688e5e7b4..abc45890f0a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1272,7 +1272,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1272 if (IS_ERR(p->mempolicy)) { 1272 if (IS_ERR(p->mempolicy)) {
1273 retval = PTR_ERR(p->mempolicy); 1273 retval = PTR_ERR(p->mempolicy);
1274 p->mempolicy = NULL; 1274 p->mempolicy = NULL;
1275 goto bad_fork_cleanup_cgroup; 1275 goto bad_fork_cleanup_threadgroup_lock;
1276 } 1276 }
1277 mpol_fix_fork_child_flag(p); 1277 mpol_fix_fork_child_flag(p);
1278#endif 1278#endif
@@ -1525,11 +1525,10 @@ bad_fork_cleanup_policy:
1525 perf_event_free_task(p); 1525 perf_event_free_task(p);
1526#ifdef CONFIG_NUMA 1526#ifdef CONFIG_NUMA
1527 mpol_put(p->mempolicy); 1527 mpol_put(p->mempolicy);
1528bad_fork_cleanup_cgroup: 1528bad_fork_cleanup_threadgroup_lock:
1529#endif 1529#endif
1530 if (clone_flags & CLONE_THREAD) 1530 if (clone_flags & CLONE_THREAD)
1531 threadgroup_change_end(current); 1531 threadgroup_change_end(current);
1532 cgroup_exit(p, 0);
1533 delayacct_tsk_free(p); 1532 delayacct_tsk_free(p);
1534 module_put(task_thread_info(p)->exec_domain->module); 1533 module_put(task_thread_info(p)->exec_domain->module);
1535bad_fork_cleanup_count: 1534bad_fork_cleanup_count:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9cae286824bb..1d1b87b36778 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7230,7 +7230,7 @@ void sched_move_task(struct task_struct *tsk)
7230 if (unlikely(running)) 7230 if (unlikely(running))
7231 tsk->sched_class->put_prev_task(rq, tsk); 7231 tsk->sched_class->put_prev_task(rq, tsk);
7232 7232
7233 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, 7233 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7234 lockdep_is_held(&tsk->sighand->siglock)), 7234 lockdep_is_held(&tsk->sighand->siglock)),
7235 struct task_group, css); 7235 struct task_group, css);
7236 tg = autogroup_task_group(tsk, tg); 7236 tg = autogroup_task_group(tsk, tg);
@@ -7657,7 +7657,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7657{ 7657{
7658 struct task_struct *task; 7658 struct task_struct *task;
7659 7659
7660 cgroup_taskset_for_each(task, css, tset) { 7660 cgroup_taskset_for_each(task, tset) {
7661#ifdef CONFIG_RT_GROUP_SCHED 7661#ifdef CONFIG_RT_GROUP_SCHED
7662 if (!sched_rt_can_attach(css_tg(css), task)) 7662 if (!sched_rt_can_attach(css_tg(css), task))
7663 return -EINVAL; 7663 return -EINVAL;
@@ -7675,7 +7675,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7675{ 7675{
7676 struct task_struct *task; 7676 struct task_struct *task;
7677 7677
7678 cgroup_taskset_for_each(task, css, tset) 7678 cgroup_taskset_for_each(task, tset)
7679 sched_move_task(task); 7679 sched_move_task(task);
7680} 7680}
7681 7681
@@ -8014,8 +8014,7 @@ static struct cftype cpu_files[] = {
8014 { } /* terminate */ 8014 { } /* terminate */
8015}; 8015};
8016 8016
8017struct cgroup_subsys cpu_cgroup_subsys = { 8017struct cgroup_subsys cpu_cgrp_subsys = {
8018 .name = "cpu",
8019 .css_alloc = cpu_cgroup_css_alloc, 8018 .css_alloc = cpu_cgroup_css_alloc,
8020 .css_free = cpu_cgroup_css_free, 8019 .css_free = cpu_cgroup_css_free,
8021 .css_online = cpu_cgroup_css_online, 8020 .css_online = cpu_cgroup_css_online,
@@ -8023,7 +8022,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8023 .can_attach = cpu_cgroup_can_attach, 8022 .can_attach = cpu_cgroup_can_attach,
8024 .attach = cpu_cgroup_attach, 8023 .attach = cpu_cgroup_attach,
8025 .exit = cpu_cgroup_exit, 8024 .exit = cpu_cgroup_exit,
8026 .subsys_id = cpu_cgroup_subsys_id,
8027 .base_cftypes = cpu_files, 8025 .base_cftypes = cpu_files,
8028 .early_init = 1, 8026 .early_init = 1,
8029}; 8027};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 622e0818f905..c143ee380e3a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
41/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
42static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
43{ 43{
44 return css_ca(task_css(tsk, cpuacct_subsys_id)); 44 return css_ca(task_css(tsk, cpuacct_cgrp_id));
45} 45}
46 46
47static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
@@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
275 rcu_read_unlock(); 275 rcu_read_unlock();
276} 276}
277 277
278struct cgroup_subsys cpuacct_subsys = { 278struct cgroup_subsys cpuacct_cgrp_subsys = {
279 .name = "cpuacct",
280 .css_alloc = cpuacct_css_alloc, 279 .css_alloc = cpuacct_css_alloc,
281 .css_free = cpuacct_css_free, 280 .css_free = cpuacct_css_free,
282 .subsys_id = cpuacct_subsys_id,
283 .base_cftypes = files, 281 .base_cftypes = files,
284 .early_init = 1, 282 .early_init = 1,
285}; 283};
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f3344c31632a..695f9773bb60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg)
111 if (autogroup_path(tg, group_path, PATH_MAX)) 111 if (autogroup_path(tg, group_path, PATH_MAX))
112 return group_path; 112 return group_path;
113 113
114 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 114 return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
115 return group_path;
116} 115}
117#endif 116#endif
118 117
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 4f3a3c03eadb..c1bd4ada2a04 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1429,7 +1429,8 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1429 return print_one_line(iter, true); 1429 return print_one_line(iter, true);
1430} 1430}
1431 1431
1432static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) 1432static int
1433blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
1433{ 1434{
1434 /* don't output context-info for blk_classic output */ 1435 /* don't output context-info for blk_classic output */
1435 if (bit == TRACE_BLK_OPT_CLASSIC) { 1436 if (bit == TRACE_BLK_OPT_CLASSIC) {
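The blk tracer change above is part of a wider signature change in this series: tracer set_flag() callbacks now receive the trace_array they operate on (dummy_set_flag() in kernel/trace/trace.c below gets the same treatment), which lets flag handling be per-instance rather than global. A do-nothing sketch of the new callback shape, with an illustrative name:

static int demo_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
	/* decisions can now key off @tr instead of global state */
	return 0;
}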
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cd7f76d1eb86..1fd4b9479210 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -237,14 +237,13 @@ static int control_ops_alloc(struct ftrace_ops *ops)
237 return 0; 237 return 0;
238} 238}
239 239
240static void control_ops_free(struct ftrace_ops *ops)
241{
242 free_percpu(ops->disabled);
243}
244
245static void update_global_ops(void) 240static void update_global_ops(void)
246{ 241{
247 ftrace_func_t func; 242 ftrace_func_t func = ftrace_global_list_func;
243 void *private = NULL;
244
245 /* The list has its own recursion protection. */
246 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
248 247
249 /* 248 /*
250 * If there's only one function registered, then call that 249 * If there's only one function registered, then call that
@@ -254,23 +253,17 @@ static void update_global_ops(void)
254 if (ftrace_global_list == &ftrace_list_end || 253 if (ftrace_global_list == &ftrace_list_end ||
255 ftrace_global_list->next == &ftrace_list_end) { 254 ftrace_global_list->next == &ftrace_list_end) {
256 func = ftrace_global_list->func; 255 func = ftrace_global_list->func;
256 private = ftrace_global_list->private;
257 /* 257 /*
258 * As we are calling the function directly. 258 * As we are calling the function directly.
259 * If it does not have recursion protection, 259 * If it does not have recursion protection,
260 * the function_trace_op needs to be updated 260 * the function_trace_op needs to be updated
261 * accordingly. 261 * accordingly.
262 */ 262 */
263 if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) 263 if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE))
264 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
265 else
266 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; 264 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
267 } else {
268 func = ftrace_global_list_func;
269 /* The list has its own recursion protection. */
270 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
271 } 265 }
272 266
273
274 /* If we filter on pids, update to use the pid function */ 267 /* If we filter on pids, update to use the pid function */
275 if (!list_empty(&ftrace_pids)) { 268 if (!list_empty(&ftrace_pids)) {
276 set_ftrace_pid_function(func); 269 set_ftrace_pid_function(func);
@@ -278,6 +271,7 @@ static void update_global_ops(void)
278 } 271 }
279 272
280 global_ops.func = func; 273 global_ops.func = func;
274 global_ops.private = private;
281} 275}
282 276
283static void ftrace_sync(struct work_struct *work) 277static void ftrace_sync(struct work_struct *work)
@@ -437,6 +431,9 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
437 431
438static int __register_ftrace_function(struct ftrace_ops *ops) 432static int __register_ftrace_function(struct ftrace_ops *ops)
439{ 433{
434 if (ops->flags & FTRACE_OPS_FL_DELETED)
435 return -EINVAL;
436
440 if (FTRACE_WARN_ON(ops == &global_ops)) 437 if (FTRACE_WARN_ON(ops == &global_ops))
441 return -EINVAL; 438 return -EINVAL;
442 439
@@ -1172,8 +1169,6 @@ struct ftrace_page {
1172 int size; 1169 int size;
1173}; 1170};
1174 1171
1175static struct ftrace_page *ftrace_new_pgs;
1176
1177#define ENTRY_SIZE sizeof(struct dyn_ftrace) 1172#define ENTRY_SIZE sizeof(struct dyn_ftrace)
1178#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) 1173#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
1179 1174
@@ -1560,7 +1555,7 @@ unsigned long ftrace_location(unsigned long ip)
1560 * the function tracer. It checks the ftrace internal tables to 1555 * the function tracer. It checks the ftrace internal tables to
1561 * determine if the address belongs or not. 1556 * determine if the address belongs or not.
1562 */ 1557 */
1563int ftrace_text_reserved(void *start, void *end) 1558int ftrace_text_reserved(const void *start, const void *end)
1564{ 1559{
1565 unsigned long ret; 1560 unsigned long ret;
1566 1561
@@ -1994,6 +1989,7 @@ int __weak ftrace_arch_code_modify_post_process(void)
1994void ftrace_modify_all_code(int command) 1989void ftrace_modify_all_code(int command)
1995{ 1990{
1996 int update = command & FTRACE_UPDATE_TRACE_FUNC; 1991 int update = command & FTRACE_UPDATE_TRACE_FUNC;
1992 int err = 0;
1997 1993
1998 /* 1994 /*
1999 * If the ftrace_caller calls a ftrace_ops func directly, 1995 * If the ftrace_caller calls a ftrace_ops func directly,
@@ -2005,8 +2001,11 @@ void ftrace_modify_all_code(int command)
2005 * to make sure the ops are having the right functions 2001 * to make sure the ops are having the right functions
2006 * traced. 2002 * traced.
2007 */ 2003 */
2008 if (update) 2004 if (update) {
2009 ftrace_update_ftrace_func(ftrace_ops_list_func); 2005 err = ftrace_update_ftrace_func(ftrace_ops_list_func);
2006 if (FTRACE_WARN_ON(err))
2007 return;
2008 }
2010 2009
2011 if (command & FTRACE_UPDATE_CALLS) 2010 if (command & FTRACE_UPDATE_CALLS)
2012 ftrace_replace_code(1); 2011 ftrace_replace_code(1);
@@ -2019,13 +2018,16 @@ void ftrace_modify_all_code(int command)
2019 /* If irqs are disabled, we are in stop machine */ 2018 /* If irqs are disabled, we are in stop machine */
2020 if (!irqs_disabled()) 2019 if (!irqs_disabled())
2021 smp_call_function(ftrace_sync_ipi, NULL, 1); 2020 smp_call_function(ftrace_sync_ipi, NULL, 1);
2022 ftrace_update_ftrace_func(ftrace_trace_function); 2021 err = ftrace_update_ftrace_func(ftrace_trace_function);
2022 if (FTRACE_WARN_ON(err))
2023 return;
2023 } 2024 }
2024 2025
2025 if (command & FTRACE_START_FUNC_RET) 2026 if (command & FTRACE_START_FUNC_RET)
2026 ftrace_enable_ftrace_graph_caller(); 2027 err = ftrace_enable_ftrace_graph_caller();
2027 else if (command & FTRACE_STOP_FUNC_RET) 2028 else if (command & FTRACE_STOP_FUNC_RET)
2028 ftrace_disable_ftrace_graph_caller(); 2029 err = ftrace_disable_ftrace_graph_caller();
2030 FTRACE_WARN_ON(err);
2029} 2031}
2030 2032
2031static int __ftrace_modify_code(void *data) 2033static int __ftrace_modify_code(void *data)
@@ -2093,6 +2095,11 @@ static ftrace_func_t saved_ftrace_func;
2093static int ftrace_start_up; 2095static int ftrace_start_up;
2094static int global_start_up; 2096static int global_start_up;
2095 2097
2098static void control_ops_free(struct ftrace_ops *ops)
2099{
2100 free_percpu(ops->disabled);
2101}
2102
2096static void ftrace_startup_enable(int command) 2103static void ftrace_startup_enable(int command)
2097{ 2104{
2098 if (saved_ftrace_func != ftrace_trace_function) { 2105 if (saved_ftrace_func != ftrace_trace_function) {
@@ -2244,7 +2251,6 @@ static void ftrace_shutdown_sysctl(void)
2244} 2251}
2245 2252
2246static cycle_t ftrace_update_time; 2253static cycle_t ftrace_update_time;
2247static unsigned long ftrace_update_cnt;
2248unsigned long ftrace_update_tot_cnt; 2254unsigned long ftrace_update_tot_cnt;
2249 2255
2250static inline int ops_traces_mod(struct ftrace_ops *ops) 2256static inline int ops_traces_mod(struct ftrace_ops *ops)
@@ -2300,11 +2306,12 @@ static int referenced_filters(struct dyn_ftrace *rec)
2300 return cnt; 2306 return cnt;
2301} 2307}
2302 2308
2303static int ftrace_update_code(struct module *mod) 2309static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
2304{ 2310{
2305 struct ftrace_page *pg; 2311 struct ftrace_page *pg;
2306 struct dyn_ftrace *p; 2312 struct dyn_ftrace *p;
2307 cycle_t start, stop; 2313 cycle_t start, stop;
2314 unsigned long update_cnt = 0;
2308 unsigned long ref = 0; 2315 unsigned long ref = 0;
2309 bool test = false; 2316 bool test = false;
2310 int i; 2317 int i;
@@ -2330,9 +2337,8 @@ static int ftrace_update_code(struct module *mod)
2330 } 2337 }
2331 2338
2332 start = ftrace_now(raw_smp_processor_id()); 2339 start = ftrace_now(raw_smp_processor_id());
2333 ftrace_update_cnt = 0;
2334 2340
2335 for (pg = ftrace_new_pgs; pg; pg = pg->next) { 2341 for (pg = new_pgs; pg; pg = pg->next) {
2336 2342
2337 for (i = 0; i < pg->index; i++) { 2343 for (i = 0; i < pg->index; i++) {
2338 int cnt = ref; 2344 int cnt = ref;
@@ -2353,7 +2359,7 @@ static int ftrace_update_code(struct module *mod)
2353 if (!ftrace_code_disable(mod, p)) 2359 if (!ftrace_code_disable(mod, p))
2354 break; 2360 break;
2355 2361
2356 ftrace_update_cnt++; 2362 update_cnt++;
2357 2363
2358 /* 2364 /*
2359 * If the tracing is enabled, go ahead and enable the record. 2365 * If the tracing is enabled, go ahead and enable the record.
@@ -2372,11 +2378,9 @@ static int ftrace_update_code(struct module *mod)
2372 } 2378 }
2373 } 2379 }
2374 2380
2375 ftrace_new_pgs = NULL;
2376
2377 stop = ftrace_now(raw_smp_processor_id()); 2381 stop = ftrace_now(raw_smp_processor_id());
2378 ftrace_update_time = stop - start; 2382 ftrace_update_time = stop - start;
2379 ftrace_update_tot_cnt += ftrace_update_cnt; 2383 ftrace_update_tot_cnt += update_cnt;
2380 2384
2381 return 0; 2385 return 0;
2382} 2386}
@@ -2468,22 +2472,6 @@ ftrace_allocate_pages(unsigned long num_to_init)
2468 return NULL; 2472 return NULL;
2469} 2473}
2470 2474
2471static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
2472{
2473 int cnt;
2474
2475 if (!num_to_init) {
2476 pr_info("ftrace: No functions to be traced?\n");
2477 return -1;
2478 }
2479
2480 cnt = num_to_init / ENTRIES_PER_PAGE;
2481 pr_info("ftrace: allocating %ld entries in %d pages\n",
2482 num_to_init, cnt + 1);
2483
2484 return 0;
2485}
2486
2487#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 2475#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
2488 2476
2489struct ftrace_iterator { 2477struct ftrace_iterator {
@@ -2871,7 +2859,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2871static int 2859static int
2872ftrace_filter_open(struct inode *inode, struct file *file) 2860ftrace_filter_open(struct inode *inode, struct file *file)
2873{ 2861{
2874 return ftrace_regex_open(&global_ops, 2862 struct ftrace_ops *ops = inode->i_private;
2863
2864 return ftrace_regex_open(ops,
2875 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, 2865 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
2876 inode, file); 2866 inode, file);
2877} 2867}
@@ -2879,7 +2869,9 @@ ftrace_filter_open(struct inode *inode, struct file *file)
2879static int 2869static int
2880ftrace_notrace_open(struct inode *inode, struct file *file) 2870ftrace_notrace_open(struct inode *inode, struct file *file)
2881{ 2871{
2882 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, 2872 struct ftrace_ops *ops = inode->i_private;
2873
2874 return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE,
2883 inode, file); 2875 inode, file);
2884} 2876}
2885 2877
@@ -4109,6 +4101,36 @@ static const struct file_operations ftrace_graph_notrace_fops = {
4109}; 4101};
4110#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4102#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
4111 4103
4104void ftrace_create_filter_files(struct ftrace_ops *ops,
4105 struct dentry *parent)
4106{
4107
4108 trace_create_file("set_ftrace_filter", 0644, parent,
4109 ops, &ftrace_filter_fops);
4110
4111 trace_create_file("set_ftrace_notrace", 0644, parent,
4112 ops, &ftrace_notrace_fops);
4113}
4114
4115/*
4116 * The name "destroy_filter_files" is really a misnomer. Although
 4117 * in the future, it may actually delete the files, but this is
4118 * really intended to make sure the ops passed in are disabled
4119 * and that when this function returns, the caller is free to
4120 * free the ops.
4121 *
4122 * The "destroy" name is only to match the "create" name that this
4123 * should be paired with.
4124 */
4125void ftrace_destroy_filter_files(struct ftrace_ops *ops)
4126{
4127 mutex_lock(&ftrace_lock);
4128 if (ops->flags & FTRACE_OPS_FL_ENABLED)
4129 ftrace_shutdown(ops, 0);
4130 ops->flags |= FTRACE_OPS_FL_DELETED;
4131 mutex_unlock(&ftrace_lock);
4132}
4133
4112static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 4134static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
4113{ 4135{
4114 4136
@@ -4118,11 +4140,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
4118 trace_create_file("enabled_functions", 0444, 4140 trace_create_file("enabled_functions", 0444,
4119 d_tracer, NULL, &ftrace_enabled_fops); 4141 d_tracer, NULL, &ftrace_enabled_fops);
4120 4142
4121 trace_create_file("set_ftrace_filter", 0644, d_tracer, 4143 ftrace_create_filter_files(&global_ops, d_tracer);
4122 NULL, &ftrace_filter_fops);
4123
4124 trace_create_file("set_ftrace_notrace", 0644, d_tracer,
4125 NULL, &ftrace_notrace_fops);
4126 4144
4127#ifdef CONFIG_FUNCTION_GRAPH_TRACER 4145#ifdef CONFIG_FUNCTION_GRAPH_TRACER
4128 trace_create_file("set_graph_function", 0444, d_tracer, 4146 trace_create_file("set_graph_function", 0444, d_tracer,
@@ -4238,9 +4256,6 @@ static int ftrace_process_locs(struct module *mod,
4238 /* Assign the last page to ftrace_pages */ 4256 /* Assign the last page to ftrace_pages */
4239 ftrace_pages = pg; 4257 ftrace_pages = pg;
4240 4258
4241 /* These new locations need to be initialized */
4242 ftrace_new_pgs = start_pg;
4243
4244 /* 4259 /*
4245 * We only need to disable interrupts on start up 4260 * We only need to disable interrupts on start up
4246 * because we are modifying code that an interrupt 4261 * because we are modifying code that an interrupt
@@ -4251,7 +4266,7 @@ static int ftrace_process_locs(struct module *mod,
4251 */ 4266 */
4252 if (!mod) 4267 if (!mod)
4253 local_irq_save(flags); 4268 local_irq_save(flags);
4254 ftrace_update_code(mod); 4269 ftrace_update_code(mod, start_pg);
4255 if (!mod) 4270 if (!mod)
4256 local_irq_restore(flags); 4271 local_irq_restore(flags);
4257 ret = 0; 4272 ret = 0;
@@ -4360,30 +4375,27 @@ struct notifier_block ftrace_module_exit_nb = {
4360 .priority = INT_MIN, /* Run after anything that can remove kprobes */ 4375 .priority = INT_MIN, /* Run after anything that can remove kprobes */
4361}; 4376};
4362 4377
4363extern unsigned long __start_mcount_loc[];
4364extern unsigned long __stop_mcount_loc[];
4365
4366void __init ftrace_init(void) 4378void __init ftrace_init(void)
4367{ 4379{
4368 unsigned long count, addr, flags; 4380 extern unsigned long __start_mcount_loc[];
4381 extern unsigned long __stop_mcount_loc[];
4382 unsigned long count, flags;
4369 int ret; 4383 int ret;
4370 4384
4371 /* Keep the ftrace pointer to the stub */
4372 addr = (unsigned long)ftrace_stub;
4373
4374 local_irq_save(flags); 4385 local_irq_save(flags);
4375 ftrace_dyn_arch_init(&addr); 4386 ret = ftrace_dyn_arch_init();
4376 local_irq_restore(flags); 4387 local_irq_restore(flags);
4377 4388 if (ret)
4378 /* ftrace_dyn_arch_init places the return code in addr */
4379 if (addr)
4380 goto failed; 4389 goto failed;
4381 4390
4382 count = __stop_mcount_loc - __start_mcount_loc; 4391 count = __stop_mcount_loc - __start_mcount_loc;
4383 4392 if (!count) {
4384 ret = ftrace_dyn_table_alloc(count); 4393 pr_info("ftrace: No functions to be traced?\n");
4385 if (ret)
4386 goto failed; 4394 goto failed;
4395 }
4396
4397 pr_info("ftrace: allocating %ld entries in %ld pages\n",
4398 count, count / ENTRIES_PER_PAGE + 1);
4387 4399
4388 last_ftrace_enabled = ftrace_enabled = 1; 4400 last_ftrace_enabled = ftrace_enabled = 1;
4389 4401
@@ -4431,7 +4443,13 @@ static inline void ftrace_startup_enable(int command) { }
4431 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ 4443 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
4432 ___ret; \ 4444 ___ret; \
4433 }) 4445 })
4434# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) 4446# define ftrace_shutdown(ops, command) \
4447 ({ \
4448 int ___ret = __unregister_ftrace_function(ops); \
4449 if (!___ret) \
4450 (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \
4451 ___ret; \
4452 })
4435 4453
4436# define ftrace_startup_sysctl() do { } while (0) 4454# define ftrace_startup_sysctl() do { } while (0)
4437# define ftrace_shutdown_sysctl() do { } while (0) 4455# define ftrace_shutdown_sysctl() do { } while (0)
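The two helpers added above take the ftrace_ops explicitly, so any ops (not only global_ops) can expose its own set_ftrace_filter/set_ftrace_notrace files and later be torn down safely before the ops is freed. A minimal usage sketch, assuming a hypothetical caller inside kernel/trace/ with its own ops and debugfs directory (the declarations live in the internal kernel/trace/trace.h):

#include <linux/ftrace.h>
#include "trace.h"		/* internal header; sketch only */

static void my_callback(unsigned long ip, unsigned long parent_ip,
			struct ftrace_ops *op, struct pt_regs *regs)
{
	/* hypothetical per-ops tracing callback */
}

static struct ftrace_ops my_ops = {
	.func	= my_callback,
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE,
};

static void my_create(struct dentry *dir)
{
	/* creates set_ftrace_filter and set_ftrace_notrace under dir */
	ftrace_create_filter_files(&my_ops, dir);
}

static void my_destroy(void)
{
	/* shuts the ops down and marks it FTRACE_OPS_FL_DELETED ... */
	ftrace_destroy_filter_files(&my_ops);
	/* ... so the caller is now free to free or reuse my_ops */
}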
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 24c1f2382557..9be67c5e5b0f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -73,7 +73,8 @@ static struct tracer_flags dummy_tracer_flags = {
73 .opts = dummy_tracer_opt 73 .opts = dummy_tracer_opt
74}; 74};
75 75
76static int dummy_set_flag(u32 old_flags, u32 bit, int set) 76static int
77dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
77{ 78{
78 return 0; 79 return 0;
79} 80}
@@ -118,7 +119,7 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
118/* When set, tracing will stop when a WARN*() is hit */ 119/* When set, tracing will stop when a WARN*() is hit */
119int __disable_trace_on_warning; 120int __disable_trace_on_warning;
120 121
121static int tracing_set_tracer(const char *buf); 122static int tracing_set_tracer(struct trace_array *tr, const char *buf);
122 123
123#define MAX_TRACER_SIZE 100 124#define MAX_TRACER_SIZE 100
124static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 125static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -180,6 +181,17 @@ static int __init set_trace_boot_options(char *str)
180} 181}
181__setup("trace_options=", set_trace_boot_options); 182__setup("trace_options=", set_trace_boot_options);
182 183
184static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata;
185static char *trace_boot_clock __initdata;
186
187static int __init set_trace_boot_clock(char *str)
188{
189 strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
190 trace_boot_clock = trace_boot_clock_buf;
191 return 0;
192}
193__setup("trace_clock=", set_trace_boot_clock);
194
183 195
184unsigned long long ns2usecs(cycle_t nsec) 196unsigned long long ns2usecs(cycle_t nsec)
185{ 197{
@@ -1230,7 +1242,7 @@ int register_tracer(struct tracer *type)
1230 1242
1231 printk(KERN_INFO "Starting tracer '%s'\n", type->name); 1243 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
1232 /* Do we want this tracer to start on bootup? */ 1244 /* Do we want this tracer to start on bootup? */
1233 tracing_set_tracer(type->name); 1245 tracing_set_tracer(&global_trace, type->name);
1234 default_bootup_tracer = NULL; 1246 default_bootup_tracer = NULL;
1235 /* disable other selftests, since this will break it. */ 1247 /* disable other selftests, since this will break it. */
1236 tracing_selftest_disabled = true; 1248 tracing_selftest_disabled = true;
@@ -3137,27 +3149,52 @@ static int tracing_open(struct inode *inode, struct file *file)
3137 return ret; 3149 return ret;
3138} 3150}
3139 3151
3152/*
3153 * Some tracers are not suitable for instance buffers.
3154 * A tracer is always available for the global array (toplevel)
3155 * or if it explicitly states that it is.
3156 */
3157static bool
3158trace_ok_for_array(struct tracer *t, struct trace_array *tr)
3159{
3160 return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
3161}
3162
3163/* Find the next tracer that this trace array may use */
3164static struct tracer *
3165get_tracer_for_array(struct trace_array *tr, struct tracer *t)
3166{
3167 while (t && !trace_ok_for_array(t, tr))
3168 t = t->next;
3169
3170 return t;
3171}
3172
3140static void * 3173static void *
3141t_next(struct seq_file *m, void *v, loff_t *pos) 3174t_next(struct seq_file *m, void *v, loff_t *pos)
3142{ 3175{
3176 struct trace_array *tr = m->private;
3143 struct tracer *t = v; 3177 struct tracer *t = v;
3144 3178
3145 (*pos)++; 3179 (*pos)++;
3146 3180
3147 if (t) 3181 if (t)
3148 t = t->next; 3182 t = get_tracer_for_array(tr, t->next);
3149 3183
3150 return t; 3184 return t;
3151} 3185}
3152 3186
3153static void *t_start(struct seq_file *m, loff_t *pos) 3187static void *t_start(struct seq_file *m, loff_t *pos)
3154{ 3188{
3189 struct trace_array *tr = m->private;
3155 struct tracer *t; 3190 struct tracer *t;
3156 loff_t l = 0; 3191 loff_t l = 0;
3157 3192
3158 mutex_lock(&trace_types_lock); 3193 mutex_lock(&trace_types_lock);
3159 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) 3194
3160 ; 3195 t = get_tracer_for_array(tr, trace_types);
3196 for (; t && l < *pos; t = t_next(m, t, &l))
3197 ;
3161 3198
3162 return t; 3199 return t;
3163} 3200}
@@ -3192,10 +3229,21 @@ static const struct seq_operations show_traces_seq_ops = {
3192 3229
3193static int show_traces_open(struct inode *inode, struct file *file) 3230static int show_traces_open(struct inode *inode, struct file *file)
3194{ 3231{
3232 struct trace_array *tr = inode->i_private;
3233 struct seq_file *m;
3234 int ret;
3235
3195 if (tracing_disabled) 3236 if (tracing_disabled)
3196 return -ENODEV; 3237 return -ENODEV;
3197 3238
3198 return seq_open(file, &show_traces_seq_ops); 3239 ret = seq_open(file, &show_traces_seq_ops);
3240 if (ret)
3241 return ret;
3242
3243 m = file->private_data;
3244 m->private = tr;
3245
3246 return 0;
3199} 3247}
3200 3248
3201static ssize_t 3249static ssize_t
@@ -3355,13 +3403,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
3355 return 0; 3403 return 0;
3356} 3404}
3357 3405
3358static int __set_tracer_option(struct tracer *trace, 3406static int __set_tracer_option(struct trace_array *tr,
3359 struct tracer_flags *tracer_flags, 3407 struct tracer_flags *tracer_flags,
3360 struct tracer_opt *opts, int neg) 3408 struct tracer_opt *opts, int neg)
3361{ 3409{
3410 struct tracer *trace = tr->current_trace;
3362 int ret; 3411 int ret;
3363 3412
3364 ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); 3413 ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
3365 if (ret) 3414 if (ret)
3366 return ret; 3415 return ret;
3367 3416
@@ -3373,8 +3422,9 @@ static int __set_tracer_option(struct tracer *trace,
3373} 3422}
3374 3423
3375/* Try to assign a tracer specific option */ 3424/* Try to assign a tracer specific option */
3376static int set_tracer_option(struct tracer *trace, char *cmp, int neg) 3425static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)
3377{ 3426{
3427 struct tracer *trace = tr->current_trace;
3378 struct tracer_flags *tracer_flags = trace->flags; 3428 struct tracer_flags *tracer_flags = trace->flags;
3379 struct tracer_opt *opts = NULL; 3429 struct tracer_opt *opts = NULL;
3380 int i; 3430 int i;
@@ -3383,8 +3433,7 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
3383 opts = &tracer_flags->opts[i]; 3433 opts = &tracer_flags->opts[i];
3384 3434
3385 if (strcmp(cmp, opts->name) == 0) 3435 if (strcmp(cmp, opts->name) == 0)
3386 return __set_tracer_option(trace, trace->flags, 3436 return __set_tracer_option(tr, trace->flags, opts, neg);
3387 opts, neg);
3388 } 3437 }
3389 3438
3390 return -EINVAL; 3439 return -EINVAL;
@@ -3407,7 +3456,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
3407 3456
3408 /* Give the tracer a chance to approve the change */ 3457 /* Give the tracer a chance to approve the change */
3409 if (tr->current_trace->flag_changed) 3458 if (tr->current_trace->flag_changed)
3410 if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) 3459 if (tr->current_trace->flag_changed(tr, mask, !!enabled))
3411 return -EINVAL; 3460 return -EINVAL;
3412 3461
3413 if (enabled) 3462 if (enabled)
@@ -3456,7 +3505,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
3456 3505
3457 /* If no option could be set, test the specific tracer options */ 3506 /* If no option could be set, test the specific tracer options */
3458 if (!trace_options[i]) 3507 if (!trace_options[i])
3459 ret = set_tracer_option(tr->current_trace, cmp, neg); 3508 ret = set_tracer_option(tr, cmp, neg);
3460 3509
3461 mutex_unlock(&trace_types_lock); 3510 mutex_unlock(&trace_types_lock);
3462 3511
@@ -3885,10 +3934,26 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
3885static void 3934static void
3886destroy_trace_option_files(struct trace_option_dentry *topts); 3935destroy_trace_option_files(struct trace_option_dentry *topts);
3887 3936
3888static int tracing_set_tracer(const char *buf) 3937/*
3938 * Used to clear out the tracer before deletion of an instance.
3939 * Must have trace_types_lock held.
3940 */
3941static void tracing_set_nop(struct trace_array *tr)
3942{
3943 if (tr->current_trace == &nop_trace)
3944 return;
3945
3946 tr->current_trace->enabled--;
3947
3948 if (tr->current_trace->reset)
3949 tr->current_trace->reset(tr);
3950
3951 tr->current_trace = &nop_trace;
3952}
3953
3954static int tracing_set_tracer(struct trace_array *tr, const char *buf)
3889{ 3955{
3890 static struct trace_option_dentry *topts; 3956 static struct trace_option_dentry *topts;
3891 struct trace_array *tr = &global_trace;
3892 struct tracer *t; 3957 struct tracer *t;
3893#ifdef CONFIG_TRACER_MAX_TRACE 3958#ifdef CONFIG_TRACER_MAX_TRACE
3894 bool had_max_tr; 3959 bool had_max_tr;
@@ -3916,9 +3981,15 @@ static int tracing_set_tracer(const char *buf)
3916 if (t == tr->current_trace) 3981 if (t == tr->current_trace)
3917 goto out; 3982 goto out;
3918 3983
3984 /* Some tracers are only allowed for the top level buffer */
3985 if (!trace_ok_for_array(t, tr)) {
3986 ret = -EINVAL;
3987 goto out;
3988 }
3989
3919 trace_branch_disable(); 3990 trace_branch_disable();
3920 3991
3921 tr->current_trace->enabled = false; 3992 tr->current_trace->enabled--;
3922 3993
3923 if (tr->current_trace->reset) 3994 if (tr->current_trace->reset)
3924 tr->current_trace->reset(tr); 3995 tr->current_trace->reset(tr);
@@ -3941,9 +4012,11 @@ static int tracing_set_tracer(const char *buf)
3941 free_snapshot(tr); 4012 free_snapshot(tr);
3942 } 4013 }
3943#endif 4014#endif
3944 destroy_trace_option_files(topts); 4015 /* Currently, only the top instance has options */
3945 4016 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
3946 topts = create_trace_option_files(tr, t); 4017 destroy_trace_option_files(topts);
4018 topts = create_trace_option_files(tr, t);
4019 }
3947 4020
3948#ifdef CONFIG_TRACER_MAX_TRACE 4021#ifdef CONFIG_TRACER_MAX_TRACE
3949 if (t->use_max_tr && !had_max_tr) { 4022 if (t->use_max_tr && !had_max_tr) {
@@ -3960,7 +4033,7 @@ static int tracing_set_tracer(const char *buf)
3960 } 4033 }
3961 4034
3962 tr->current_trace = t; 4035 tr->current_trace = t;
3963 tr->current_trace->enabled = true; 4036 tr->current_trace->enabled++;
3964 trace_branch_enable(tr); 4037 trace_branch_enable(tr);
3965 out: 4038 out:
3966 mutex_unlock(&trace_types_lock); 4039 mutex_unlock(&trace_types_lock);
@@ -3972,6 +4045,7 @@ static ssize_t
3972tracing_set_trace_write(struct file *filp, const char __user *ubuf, 4045tracing_set_trace_write(struct file *filp, const char __user *ubuf,
3973 size_t cnt, loff_t *ppos) 4046 size_t cnt, loff_t *ppos)
3974{ 4047{
4048 struct trace_array *tr = filp->private_data;
3975 char buf[MAX_TRACER_SIZE+1]; 4049 char buf[MAX_TRACER_SIZE+1];
3976 int i; 4050 int i;
3977 size_t ret; 4051 size_t ret;
@@ -3991,7 +4065,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
3991 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) 4065 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
3992 buf[i] = 0; 4066 buf[i] = 0;
3993 4067
3994 err = tracing_set_tracer(buf); 4068 err = tracing_set_tracer(tr, buf);
3995 if (err) 4069 if (err)
3996 return err; 4070 return err;
3997 4071
@@ -4699,25 +4773,10 @@ static int tracing_clock_show(struct seq_file *m, void *v)
4699 return 0; 4773 return 0;
4700} 4774}
4701 4775
4702static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 4776static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
4703 size_t cnt, loff_t *fpos)
4704{ 4777{
4705 struct seq_file *m = filp->private_data;
4706 struct trace_array *tr = m->private;
4707 char buf[64];
4708 const char *clockstr;
4709 int i; 4778 int i;
4710 4779
4711 if (cnt >= sizeof(buf))
4712 return -EINVAL;
4713
4714 if (copy_from_user(&buf, ubuf, cnt))
4715 return -EFAULT;
4716
4717 buf[cnt] = 0;
4718
4719 clockstr = strstrip(buf);
4720
4721 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { 4780 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
4722 if (strcmp(trace_clocks[i].name, clockstr) == 0) 4781 if (strcmp(trace_clocks[i].name, clockstr) == 0)
4723 break; 4782 break;
@@ -4745,6 +4804,32 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4745 4804
4746 mutex_unlock(&trace_types_lock); 4805 mutex_unlock(&trace_types_lock);
4747 4806
4807 return 0;
4808}
4809
4810static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4811 size_t cnt, loff_t *fpos)
4812{
4813 struct seq_file *m = filp->private_data;
4814 struct trace_array *tr = m->private;
4815 char buf[64];
4816 const char *clockstr;
4817 int ret;
4818
4819 if (cnt >= sizeof(buf))
4820 return -EINVAL;
4821
4822 if (copy_from_user(&buf, ubuf, cnt))
4823 return -EFAULT;
4824
4825 buf[cnt] = 0;
4826
4827 clockstr = strstrip(buf);
4828
4829 ret = tracing_set_clock(tr, clockstr);
4830 if (ret)
4831 return ret;
4832
4748 *fpos += cnt; 4833 *fpos += cnt;
4749 4834
4750 return cnt; 4835 return cnt;
@@ -5705,7 +5790,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
5705 5790
5706 if (!!(topt->flags->val & topt->opt->bit) != val) { 5791 if (!!(topt->flags->val & topt->opt->bit) != val) {
5707 mutex_lock(&trace_types_lock); 5792 mutex_lock(&trace_types_lock);
5708 ret = __set_tracer_option(topt->tr->current_trace, topt->flags, 5793 ret = __set_tracer_option(topt->tr, topt->flags,
5709 topt->opt, !val); 5794 topt->opt, !val);
5710 mutex_unlock(&trace_types_lock); 5795 mutex_unlock(&trace_types_lock);
5711 if (ret) 5796 if (ret)
@@ -6112,7 +6197,9 @@ static int instance_delete(const char *name)
6112 6197
6113 list_del(&tr->list); 6198 list_del(&tr->list);
6114 6199
6200 tracing_set_nop(tr);
6115 event_trace_del_tracer(tr); 6201 event_trace_del_tracer(tr);
6202 ftrace_destroy_function_files(tr);
6116 debugfs_remove_recursive(tr->dir); 6203 debugfs_remove_recursive(tr->dir);
6117 free_percpu(tr->trace_buffer.data); 6204 free_percpu(tr->trace_buffer.data);
6118 ring_buffer_free(tr->trace_buffer.buffer); 6205 ring_buffer_free(tr->trace_buffer.buffer);
@@ -6207,6 +6294,12 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6207{ 6294{
6208 int cpu; 6295 int cpu;
6209 6296
6297 trace_create_file("available_tracers", 0444, d_tracer,
6298 tr, &show_traces_fops);
6299
6300 trace_create_file("current_tracer", 0644, d_tracer,
6301 tr, &set_tracer_fops);
6302
6210 trace_create_file("tracing_cpumask", 0644, d_tracer, 6303 trace_create_file("tracing_cpumask", 0644, d_tracer,
6211 tr, &tracing_cpumask_fops); 6304 tr, &tracing_cpumask_fops);
6212 6305
@@ -6237,6 +6330,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6237 trace_create_file("tracing_on", 0644, d_tracer, 6330 trace_create_file("tracing_on", 0644, d_tracer,
6238 tr, &rb_simple_fops); 6331 tr, &rb_simple_fops);
6239 6332
6333 if (ftrace_create_function_files(tr, d_tracer))
6334 WARN(1, "Could not allocate function filter files");
6335
6240#ifdef CONFIG_TRACER_SNAPSHOT 6336#ifdef CONFIG_TRACER_SNAPSHOT
6241 trace_create_file("snapshot", 0644, d_tracer, 6337 trace_create_file("snapshot", 0644, d_tracer,
6242 tr, &snapshot_fops); 6338 tr, &snapshot_fops);
@@ -6259,12 +6355,6 @@ static __init int tracer_init_debugfs(void)
6259 6355
6260 init_tracer_debugfs(&global_trace, d_tracer); 6356 init_tracer_debugfs(&global_trace, d_tracer);
6261 6357
6262 trace_create_file("available_tracers", 0444, d_tracer,
6263 &global_trace, &show_traces_fops);
6264
6265 trace_create_file("current_tracer", 0644, d_tracer,
6266 &global_trace, &set_tracer_fops);
6267
6268#ifdef CONFIG_TRACER_MAX_TRACE 6358#ifdef CONFIG_TRACER_MAX_TRACE
6269 trace_create_file("tracing_max_latency", 0644, d_tracer, 6359 trace_create_file("tracing_max_latency", 0644, d_tracer,
6270 &tracing_max_latency, &tracing_max_lat_fops); 6360 &tracing_max_latency, &tracing_max_lat_fops);
@@ -6527,6 +6617,13 @@ __init static int tracer_alloc_buffers(void)
6527 6617
6528 trace_init_cmdlines(); 6618 trace_init_cmdlines();
6529 6619
6620 if (trace_boot_clock) {
6621 ret = tracing_set_clock(&global_trace, trace_boot_clock);
6622 if (ret < 0)
6623 pr_warning("Trace clock %s not defined, going back to default\n",
6624 trace_boot_clock);
6625 }
6626
6530 /* 6627 /*
6531 * register_tracer() might reference current_trace, so it 6628 * register_tracer() might reference current_trace, so it
6532 * needs to be set before we register anything. This is 6629 * needs to be set before we register anything. This is
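The trace_clock= handling above follows the usual two-step boot-option pattern: __setup() only stashes the string in an __initdata buffer, and tracer_alloc_buffers() applies it later through tracing_set_clock() once the buffers exist. A generic sketch of that pattern, with hypothetical names (the return value of 0 mirrors set_trace_boot_clock above):

#include <linux/init.h>
#include <linux/string.h>
#include <linux/kernel.h>

#define MY_OPT_SIZE 100

static char my_opt_buf[MY_OPT_SIZE] __initdata;
static char *my_opt __initdata;

static int __init set_my_opt(char *str)
{
	/* too early to act on the value; just remember it */
	strlcpy(my_opt_buf, str, MY_OPT_SIZE);
	my_opt = my_opt_buf;
	return 0;
}
__setup("my_opt=", set_my_opt);

static int __init my_late_init(void)
{
	if (my_opt)
		pr_info("my_opt was set to %s\n", my_opt);	/* apply it here */
	return 0;
}
late_initcall(my_late_init);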
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 02b592f2d4b7..ffc314b7e92b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -210,6 +210,11 @@ struct trace_array {
210 struct list_head events; 210 struct list_head events;
211 cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ 211 cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
212 int ref; 212 int ref;
213#ifdef CONFIG_FUNCTION_TRACER
214 struct ftrace_ops *ops;
215 /* function tracing enabled */
216 int function_enabled;
217#endif
213}; 218};
214 219
215enum { 220enum {
@@ -355,14 +360,16 @@ struct tracer {
355 void (*print_header)(struct seq_file *m); 360 void (*print_header)(struct seq_file *m);
356 enum print_line_t (*print_line)(struct trace_iterator *iter); 361 enum print_line_t (*print_line)(struct trace_iterator *iter);
357 /* If you handled the flag setting, return 0 */ 362 /* If you handled the flag setting, return 0 */
358 int (*set_flag)(u32 old_flags, u32 bit, int set); 363 int (*set_flag)(struct trace_array *tr,
364 u32 old_flags, u32 bit, int set);
359 /* Return 0 if OK with change, else return non-zero */ 365 /* Return 0 if OK with change, else return non-zero */
360 int (*flag_changed)(struct tracer *tracer, 366 int (*flag_changed)(struct trace_array *tr,
361 u32 mask, int set); 367 u32 mask, int set);
362 struct tracer *next; 368 struct tracer *next;
363 struct tracer_flags *flags; 369 struct tracer_flags *flags;
370 int enabled;
364 bool print_max; 371 bool print_max;
365 bool enabled; 372 bool allow_instances;
366#ifdef CONFIG_TRACER_MAX_TRACE 373#ifdef CONFIG_TRACER_MAX_TRACE
367 bool use_max_tr; 374 bool use_max_tr;
368#endif 375#endif
@@ -812,13 +819,36 @@ static inline int ftrace_trace_task(struct task_struct *task)
812 return test_tsk_trace_trace(task); 819 return test_tsk_trace_trace(task);
813} 820}
814extern int ftrace_is_dead(void); 821extern int ftrace_is_dead(void);
822int ftrace_create_function_files(struct trace_array *tr,
823 struct dentry *parent);
824void ftrace_destroy_function_files(struct trace_array *tr);
815#else 825#else
816static inline int ftrace_trace_task(struct task_struct *task) 826static inline int ftrace_trace_task(struct task_struct *task)
817{ 827{
818 return 1; 828 return 1;
819} 829}
820static inline int ftrace_is_dead(void) { return 0; } 830static inline int ftrace_is_dead(void) { return 0; }
821#endif 831static inline int
832ftrace_create_function_files(struct trace_array *tr,
833 struct dentry *parent)
834{
835 return 0;
836}
837static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
838#endif /* CONFIG_FUNCTION_TRACER */
839
840#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
841void ftrace_create_filter_files(struct ftrace_ops *ops,
842 struct dentry *parent);
843void ftrace_destroy_filter_files(struct ftrace_ops *ops);
844#else
845/*
846 * The ops parameter passed in is usually undefined.
847 * This must be a macro.
848 */
849#define ftrace_create_filter_files(ops, parent) do { } while (0)
850#define ftrace_destroy_filter_files(ops) do { } while (0)
851#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
822 852
823int ftrace_event_is_function(struct ftrace_event_call *call); 853int ftrace_event_is_function(struct ftrace_event_call *call);
824 854
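With the struct tracer changes above, set_flag() and flag_changed() both receive the trace_array they act on, the old bool enabled becomes a reference count, and allow_instances opts a tracer into the instances/ directories. A sketch of a tracer filling in the new hooks (internal header and names are illustrative only):

#include "trace.h"	/* internal kernel/trace header; sketch only */

static int my_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
	/* accept every tracer-specific flag change */
	return 0;
}

static int my_flag_changed(struct trace_array *tr, u32 mask, int set)
{
	/* no objection to core trace option changes either */
	return 0;
}

static struct tracer my_tracer __read_mostly = {
	.name		= "my_tracer",
	.set_flag	= my_set_flag,
	.flag_changed	= my_flag_changed,
	.allow_instances = true,	/* may be selected inside an instance */
};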
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7b16d40bd64d..83a4378dc5e0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -188,6 +188,36 @@ int trace_event_raw_init(struct ftrace_event_call *call)
188} 188}
189EXPORT_SYMBOL_GPL(trace_event_raw_init); 189EXPORT_SYMBOL_GPL(trace_event_raw_init);
190 190
191void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
192 struct ftrace_event_file *ftrace_file,
193 unsigned long len)
194{
195 struct ftrace_event_call *event_call = ftrace_file->event_call;
196
197 local_save_flags(fbuffer->flags);
198 fbuffer->pc = preempt_count();
199 fbuffer->ftrace_file = ftrace_file;
200
201 fbuffer->event =
202 trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file,
203 event_call->event.type, len,
204 fbuffer->flags, fbuffer->pc);
205 if (!fbuffer->event)
206 return NULL;
207
208 fbuffer->entry = ring_buffer_event_data(fbuffer->event);
209 return fbuffer->entry;
210}
211EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
212
213void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
214{
215 event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
216 fbuffer->event, fbuffer->entry,
217 fbuffer->flags, fbuffer->pc);
218}
219EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
220
191int ftrace_event_reg(struct ftrace_event_call *call, 221int ftrace_event_reg(struct ftrace_event_call *call,
192 enum trace_reg type, void *data) 222 enum trace_reg type, void *data)
193{ 223{
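The exported pair above wraps the reserve/fill/commit sequence for an event. A rough sketch of the intended call order, assuming a hypothetical event whose record starts with the usual struct trace_entry header:

#include <linux/ftrace_event.h>

struct my_event_entry {			/* hypothetical record layout */
	struct trace_entry	ent;
	u64			value;
};

static void my_event_trace(struct ftrace_event_file *ftrace_file, u64 value)
{
	struct ftrace_event_buffer fbuffer;
	struct my_event_entry *entry;

	entry = ftrace_event_buffer_reserve(&fbuffer, ftrace_file,
					    sizeof(*entry));
	if (!entry)
		return;		/* buffer full, disabled, or reserve failed */

	entry->value = value;
	ftrace_event_buffer_commit(&fbuffer);	/* trigger handling + unlock */
}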
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 38fe1483c508..5b781d2be383 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,32 +13,106 @@
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/slab.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17 18
18#include "trace.h" 19#include "trace.h"
19 20
20/* function tracing enabled */ 21static void tracing_start_function_trace(struct trace_array *tr);
21static int ftrace_function_enabled; 22static void tracing_stop_function_trace(struct trace_array *tr);
23static void
24function_trace_call(unsigned long ip, unsigned long parent_ip,
25 struct ftrace_ops *op, struct pt_regs *pt_regs);
26static void
27function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
28 struct ftrace_ops *op, struct pt_regs *pt_regs);
29static struct ftrace_ops trace_ops;
30static struct ftrace_ops trace_stack_ops;
31static struct tracer_flags func_flags;
32
33/* Our option */
34enum {
35 TRACE_FUNC_OPT_STACK = 0x1,
36};
37
38static int allocate_ftrace_ops(struct trace_array *tr)
39{
40 struct ftrace_ops *ops;
41
42 ops = kzalloc(sizeof(*ops), GFP_KERNEL);
43 if (!ops)
44 return -ENOMEM;
22 45
 23static struct trace_array *func_trace; 46 /* Currently only the non-stack version is supported */
47 ops->func = function_trace_call;
48 ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
49
50 tr->ops = ops;
51 ops->private = tr;
52 return 0;
53}
54
55
56int ftrace_create_function_files(struct trace_array *tr,
57 struct dentry *parent)
58{
59 int ret;
60
61 /* The top level array uses the "global_ops". */
62 if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) {
63 ret = allocate_ftrace_ops(tr);
64 if (ret)
65 return ret;
66 }
67
68 ftrace_create_filter_files(tr->ops, parent);
69
70 return 0;
71}
24 72
25static void tracing_start_function_trace(void); 73void ftrace_destroy_function_files(struct trace_array *tr)
26static void tracing_stop_function_trace(void); 74{
75 ftrace_destroy_filter_files(tr->ops);
76 kfree(tr->ops);
77 tr->ops = NULL;
78}
27 79
28static int function_trace_init(struct trace_array *tr) 80static int function_trace_init(struct trace_array *tr)
29{ 81{
30 func_trace = tr; 82 struct ftrace_ops *ops;
83
84 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
85 /* There's only one global tr */
86 if (!trace_ops.private) {
87 trace_ops.private = tr;
88 trace_stack_ops.private = tr;
89 }
90
91 if (func_flags.val & TRACE_FUNC_OPT_STACK)
92 ops = &trace_stack_ops;
93 else
94 ops = &trace_ops;
95 tr->ops = ops;
96 } else if (!tr->ops) {
97 /*
98 * Instance trace_arrays get their ops allocated
 99 * at instance creation, unless that
 100 * allocation failed.
101 */
102 return -ENOMEM;
103 }
104
31 tr->trace_buffer.cpu = get_cpu(); 105 tr->trace_buffer.cpu = get_cpu();
32 put_cpu(); 106 put_cpu();
33 107
34 tracing_start_cmdline_record(); 108 tracing_start_cmdline_record();
35 tracing_start_function_trace(); 109 tracing_start_function_trace(tr);
36 return 0; 110 return 0;
37} 111}
38 112
39static void function_trace_reset(struct trace_array *tr) 113static void function_trace_reset(struct trace_array *tr)
40{ 114{
41 tracing_stop_function_trace(); 115 tracing_stop_function_trace(tr);
42 tracing_stop_cmdline_record(); 116 tracing_stop_cmdline_record();
43} 117}
44 118
@@ -47,25 +121,18 @@ static void function_trace_start(struct trace_array *tr)
47 tracing_reset_online_cpus(&tr->trace_buffer); 121 tracing_reset_online_cpus(&tr->trace_buffer);
48} 122}
49 123
50/* Our option */
51enum {
52 TRACE_FUNC_OPT_STACK = 0x1,
53};
54
55static struct tracer_flags func_flags;
56
57static void 124static void
58function_trace_call(unsigned long ip, unsigned long parent_ip, 125function_trace_call(unsigned long ip, unsigned long parent_ip,
59 struct ftrace_ops *op, struct pt_regs *pt_regs) 126 struct ftrace_ops *op, struct pt_regs *pt_regs)
60{ 127{
61 struct trace_array *tr = func_trace; 128 struct trace_array *tr = op->private;
62 struct trace_array_cpu *data; 129 struct trace_array_cpu *data;
63 unsigned long flags; 130 unsigned long flags;
64 int bit; 131 int bit;
65 int cpu; 132 int cpu;
66 int pc; 133 int pc;
67 134
68 if (unlikely(!ftrace_function_enabled)) 135 if (unlikely(!tr->function_enabled))
69 return; 136 return;
70 137
71 pc = preempt_count(); 138 pc = preempt_count();
@@ -91,14 +158,14 @@ static void
91function_stack_trace_call(unsigned long ip, unsigned long parent_ip, 158function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
92 struct ftrace_ops *op, struct pt_regs *pt_regs) 159 struct ftrace_ops *op, struct pt_regs *pt_regs)
93{ 160{
94 struct trace_array *tr = func_trace; 161 struct trace_array *tr = op->private;
95 struct trace_array_cpu *data; 162 struct trace_array_cpu *data;
96 unsigned long flags; 163 unsigned long flags;
97 long disabled; 164 long disabled;
98 int cpu; 165 int cpu;
99 int pc; 166 int pc;
100 167
101 if (unlikely(!ftrace_function_enabled)) 168 if (unlikely(!tr->function_enabled))
102 return; 169 return;
103 170
104 /* 171 /*
@@ -128,7 +195,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
128 local_irq_restore(flags); 195 local_irq_restore(flags);
129} 196}
130 197
131
132static struct ftrace_ops trace_ops __read_mostly = 198static struct ftrace_ops trace_ops __read_mostly =
133{ 199{
134 .func = function_trace_call, 200 .func = function_trace_call,
@@ -153,29 +219,21 @@ static struct tracer_flags func_flags = {
153 .opts = func_opts 219 .opts = func_opts
154}; 220};
155 221
156static void tracing_start_function_trace(void) 222static void tracing_start_function_trace(struct trace_array *tr)
157{ 223{
158 ftrace_function_enabled = 0; 224 tr->function_enabled = 0;
159 225 register_ftrace_function(tr->ops);
160 if (func_flags.val & TRACE_FUNC_OPT_STACK) 226 tr->function_enabled = 1;
161 register_ftrace_function(&trace_stack_ops);
162 else
163 register_ftrace_function(&trace_ops);
164
165 ftrace_function_enabled = 1;
166} 227}
167 228
168static void tracing_stop_function_trace(void) 229static void tracing_stop_function_trace(struct trace_array *tr)
169{ 230{
170 ftrace_function_enabled = 0; 231 tr->function_enabled = 0;
171 232 unregister_ftrace_function(tr->ops);
172 if (func_flags.val & TRACE_FUNC_OPT_STACK)
173 unregister_ftrace_function(&trace_stack_ops);
174 else
175 unregister_ftrace_function(&trace_ops);
176} 233}
177 234
178static int func_set_flag(u32 old_flags, u32 bit, int set) 235static int
236func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
179{ 237{
180 switch (bit) { 238 switch (bit) {
181 case TRACE_FUNC_OPT_STACK: 239 case TRACE_FUNC_OPT_STACK:
@@ -183,12 +241,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
183 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) 241 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
184 break; 242 break;
185 243
244 unregister_ftrace_function(tr->ops);
245
186 if (set) { 246 if (set) {
187 unregister_ftrace_function(&trace_ops); 247 tr->ops = &trace_stack_ops;
188 register_ftrace_function(&trace_stack_ops); 248 register_ftrace_function(tr->ops);
189 } else { 249 } else {
190 unregister_ftrace_function(&trace_stack_ops); 250 tr->ops = &trace_ops;
191 register_ftrace_function(&trace_ops); 251 register_ftrace_function(tr->ops);
192 } 252 }
193 253
194 break; 254 break;
@@ -208,6 +268,7 @@ static struct tracer function_trace __tracer_data =
208 .wait_pipe = poll_wait_pipe, 268 .wait_pipe = poll_wait_pipe,
209 .flags = &func_flags, 269 .flags = &func_flags,
210 .set_flag = func_set_flag, 270 .set_flag = func_set_flag,
271 .allow_instances = true,
211#ifdef CONFIG_FTRACE_SELFTEST 272#ifdef CONFIG_FTRACE_SELFTEST
212 .selftest = trace_selftest_startup_function, 273 .selftest = trace_selftest_startup_function,
213#endif 274#endif
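The comments above describe the new ownership model: each instance gets its own ftrace_ops with ops->private pointing back at its trace_array, and the callback recovers the instance from the op argument instead of a file-scope pointer. Condensed into a sketch (names hypothetical):

#include <linux/ftrace.h>
#include <linux/slab.h>
#include "trace.h"	/* internal header for struct trace_array */

static void my_func_call(unsigned long ip, unsigned long parent_ip,
			 struct ftrace_ops *op, struct pt_regs *pt_regs)
{
	struct trace_array *tr = op->private;	/* owning instance */

	if (unlikely(!tr->function_enabled))
		return;
	/* ... record ip/parent_ip into tr->trace_buffer ... */
}

static int my_instance_init(struct trace_array *tr)
{
	struct ftrace_ops *ops;

	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
	if (!ops)
		return -ENOMEM;

	ops->func = my_func_call;
	ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
	ops->private = tr;		/* back-pointer used by the callback */
	tr->ops = ops;
	return 0;
}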
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0b99120d395c..deff11200261 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1476,7 +1476,8 @@ void graph_trace_close(struct trace_iterator *iter)
1476 } 1476 }
1477} 1477}
1478 1478
1479static int func_graph_set_flag(u32 old_flags, u32 bit, int set) 1479static int
1480func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
1480{ 1481{
1481 if (bit == TRACE_GRAPH_PRINT_IRQS) 1482 if (bit == TRACE_GRAPH_PRINT_IRQS)
1482 ftrace_graph_skip_irqs = !set; 1483 ftrace_graph_skip_irqs = !set;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 887ef88b0bc7..8ff02cbb892f 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -160,7 +160,8 @@ static struct ftrace_ops trace_ops __read_mostly =
160#endif /* CONFIG_FUNCTION_TRACER */ 160#endif /* CONFIG_FUNCTION_TRACER */
161 161
162#ifdef CONFIG_FUNCTION_GRAPH_TRACER 162#ifdef CONFIG_FUNCTION_GRAPH_TRACER
163static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) 163static int
164irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
164{ 165{
165 int cpu; 166 int cpu;
166 167
@@ -266,7 +267,8 @@ __trace_function(struct trace_array *tr,
266#else 267#else
267#define __trace_function trace_function 268#define __trace_function trace_function
268 269
269static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) 270static int
271irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
270{ 272{
271 return -EINVAL; 273 return -EINVAL;
272} 274}
@@ -570,8 +572,10 @@ static void irqsoff_function_set(int set)
570 unregister_irqsoff_function(is_graph()); 572 unregister_irqsoff_function(is_graph());
571} 573}
572 574
573static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) 575static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
574{ 576{
577 struct tracer *tracer = tr->current_trace;
578
575 if (mask & TRACE_ITER_FUNCTION) 579 if (mask & TRACE_ITER_FUNCTION)
576 irqsoff_function_set(set); 580 irqsoff_function_set(set);
577 581
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index bdbae450c13e..d021d21dd150 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,11 +35,6 @@ struct trace_kprobe {
35 struct trace_probe tp; 35 struct trace_probe tp;
36}; 36};
37 37
38struct event_file_link {
39 struct ftrace_event_file *file;
40 struct list_head list;
41};
42
43#define SIZEOF_TRACE_KPROBE(n) \ 38#define SIZEOF_TRACE_KPROBE(n) \
44 (offsetof(struct trace_kprobe, tp.args) + \ 39 (offsetof(struct trace_kprobe, tp.args) + \
45 (sizeof(struct probe_arg) * (n))) 40 (sizeof(struct probe_arg) * (n)))
@@ -387,18 +382,6 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
387 return ret; 382 return ret;
388} 383}
389 384
390static struct event_file_link *
391find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
392{
393 struct event_file_link *link;
394
395 list_for_each_entry(link, &tp->files, list)
396 if (link->file == file)
397 return link;
398
399 return NULL;
400}
401
402/* 385/*
403 * Disable trace_probe 386 * Disable trace_probe
404 * if the file is NULL, disable "perf" handler, or disable "trace" handler. 387 * if the file is NULL, disable "perf" handler, or disable "trace" handler.
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 394f94417e2f..69a5cc94c01a 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -62,7 +62,7 @@ static void nop_trace_reset(struct trace_array *tr)
62 * If you don't implement it, then the flag setting will be 62 * If you don't implement it, then the flag setting will be
63 * automatically accepted. 63 * automatically accepted.
64 */ 64 */
65static int nop_set_flag(u32 old_flags, u32 bit, int set) 65static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
66{ 66{
67 /* 67 /*
68 * Note that you don't need to update nop_flags.val yourself. 68 * Note that you don't need to update nop_flags.val yourself.
@@ -96,6 +96,7 @@ struct tracer nop_trace __read_mostly =
96 .selftest = trace_selftest_startup_nop, 96 .selftest = trace_selftest_startup_nop,
97#endif 97#endif
98 .flags = &nop_flags, 98 .flags = &nop_flags,
99 .set_flag = nop_set_flag 99 .set_flag = nop_set_flag,
100 .allow_instances = true,
100}; 101};
101 102
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed32284fbe32..ca0e79e2abaa 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -439,6 +439,37 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
439} 439}
440EXPORT_SYMBOL(ftrace_raw_output_prep); 440EXPORT_SYMBOL(ftrace_raw_output_prep);
441 441
442static int ftrace_output_raw(struct trace_iterator *iter, char *name,
443 char *fmt, va_list ap)
444{
445 struct trace_seq *s = &iter->seq;
446 int ret;
447
448 ret = trace_seq_printf(s, "%s: ", name);
449 if (!ret)
450 return TRACE_TYPE_PARTIAL_LINE;
451
452 ret = trace_seq_vprintf(s, fmt, ap);
453
454 if (!ret)
455 return TRACE_TYPE_PARTIAL_LINE;
456
457 return TRACE_TYPE_HANDLED;
458}
459
460int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
461{
462 va_list ap;
463 int ret;
464
465 va_start(ap, fmt);
466 ret = ftrace_output_raw(iter, name, fmt, ap);
467 va_end(ap);
468
469 return ret;
470}
471EXPORT_SYMBOL_GPL(ftrace_output_call);
472
442#ifdef CONFIG_KRETPROBES 473#ifdef CONFIG_KRETPROBES
443static inline const char *kretprobed(const char *name) 474static inline const char *kretprobed(const char *name)
444{ 475{
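ftrace_output_call(), exported above, gives event output handlers a printf-style helper that prefixes the line with the event name and reports partial lines. A sketch of a print callback built on it (the entry layout is hypothetical):

#include <linux/ftrace_event.h>

struct my_entry {			/* hypothetical event record */
	struct trace_entry	ent;
	unsigned long		value;
};

static enum print_line_t
my_event_output(struct trace_iterator *iter, int flags,
		struct trace_event *event)
{
	struct my_entry *field = (struct my_entry *)iter->ent;

	/* emits "my_event: value=..." into iter->seq */
	return ftrace_output_call(iter, "my_event", "value=%lu\n",
				  field->value);
}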
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index b73574a5f429..fb1ab5dfbd42 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -288,6 +288,11 @@ struct trace_probe {
288 struct probe_arg args[]; 288 struct probe_arg args[];
289}; 289};
290 290
291struct event_file_link {
292 struct ftrace_event_file *file;
293 struct list_head list;
294};
295
291static inline bool trace_probe_is_enabled(struct trace_probe *tp) 296static inline bool trace_probe_is_enabled(struct trace_probe *tp)
292{ 297{
293 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); 298 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
@@ -316,6 +321,18 @@ static inline int is_good_name(const char *name)
316 return 1; 321 return 1;
317} 322}
318 323
324static inline struct event_file_link *
325find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
326{
327 struct event_file_link *link;
328
329 list_for_each_entry(link, &tp->files, list)
330 if (link->file == file)
331 return link;
332
333 return NULL;
334}
335
319extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, 336extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
320 struct probe_arg *parg, bool is_return, bool is_kprobe); 337 struct probe_arg *parg, bool is_return, bool is_kprobe);
321 338
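event_file_link and find_event_file_link() move into this header so both kprobe and uprobe events can resolve an ftrace_event_file back to its link entry. The typical remove path, mirrored from the uprobe changes further below, looks roughly like this (the wrapper name is hypothetical):

#include <linux/slab.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include "trace_probe.h"	/* internal header providing the helpers */

static int my_remove_file(struct trace_probe *tp,
			  struct ftrace_event_file *file)
{
	struct event_file_link *link;

	link = find_event_file_link(tp, file);
	if (!link)
		return -ENOENT;

	list_del_rcu(&link->list);
	synchronize_sched();	/* let in-flight handlers finish walking the list */
	kfree(link);
	return 0;
}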
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 6e32635e5e57..e14da5e97a69 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -179,8 +179,10 @@ static void wakeup_function_set(int set)
179 unregister_wakeup_function(is_graph()); 179 unregister_wakeup_function(is_graph());
180} 180}
181 181
182static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) 182static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
183{ 183{
184 struct tracer *tracer = tr->current_trace;
185
184 if (mask & TRACE_ITER_FUNCTION) 186 if (mask & TRACE_ITER_FUNCTION)
185 wakeup_function_set(set); 187 wakeup_function_set(set);
186 188
@@ -209,7 +211,8 @@ static void stop_func_tracer(int graph)
209} 211}
210 212
211#ifdef CONFIG_FUNCTION_GRAPH_TRACER 213#ifdef CONFIG_FUNCTION_GRAPH_TRACER
212static int wakeup_set_flag(u32 old_flags, u32 bit, int set) 214static int
215wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
213{ 216{
214 217
215 if (!(bit & TRACE_DISPLAY_GRAPH)) 218 if (!(bit & TRACE_DISPLAY_GRAPH))
@@ -311,7 +314,8 @@ __trace_function(struct trace_array *tr,
311#else 314#else
312#define __trace_function trace_function 315#define __trace_function trace_function
313 316
314static int wakeup_set_flag(u32 old_flags, u32 bit, int set) 317static int
318wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
315{ 319{
316 return -EINVAL; 320 return -EINVAL;
317} 321}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index e6be585cf06a..21b320e5d163 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,7 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/magic.h>
16 17
17#include <asm/setup.h> 18#include <asm/setup.h>
18 19
@@ -144,6 +145,8 @@ check_stack(unsigned long ip, unsigned long *stack)
144 i++; 145 i++;
145 } 146 }
146 147
148 BUG_ON(current != &init_task &&
149 *(end_of_stack(current)) != STACK_END_MAGIC);
147 out: 150 out:
148 arch_spin_unlock(&max_stack_lock); 151 arch_spin_unlock(&max_stack_lock);
149 local_irq_restore(flags); 152 local_irq_restore(flags);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 79e52d93860b..e4473367e7a4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -260,6 +260,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
260 goto error; 260 goto error;
261 261
262 INIT_LIST_HEAD(&tu->list); 262 INIT_LIST_HEAD(&tu->list);
263 INIT_LIST_HEAD(&tu->tp.files);
263 tu->consumer.handler = uprobe_dispatcher; 264 tu->consumer.handler = uprobe_dispatcher;
264 if (is_ret) 265 if (is_ret)
265 tu->consumer.ret_handler = uretprobe_dispatcher; 266 tu->consumer.ret_handler = uretprobe_dispatcher;
@@ -758,31 +759,32 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
758 mutex_unlock(&ucb->mutex); 759 mutex_unlock(&ucb->mutex);
759} 760}
760 761
761static void uprobe_trace_print(struct trace_uprobe *tu, 762static void __uprobe_trace_func(struct trace_uprobe *tu,
762 unsigned long func, struct pt_regs *regs) 763 unsigned long func, struct pt_regs *regs,
764 struct uprobe_cpu_buffer *ucb, int dsize,
765 struct ftrace_event_file *ftrace_file)
763{ 766{
764 struct uprobe_trace_entry_head *entry; 767 struct uprobe_trace_entry_head *entry;
765 struct ring_buffer_event *event; 768 struct ring_buffer_event *event;
766 struct ring_buffer *buffer; 769 struct ring_buffer *buffer;
767 struct uprobe_cpu_buffer *ucb;
768 void *data; 770 void *data;
769 int size, dsize, esize; 771 int size, esize;
770 struct ftrace_event_call *call = &tu->tp.call; 772 struct ftrace_event_call *call = &tu->tp.call;
771 773
772 dsize = __get_data_size(&tu->tp, regs); 774 WARN_ON(call != ftrace_file->event_call);
773 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
774 775
775 if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE)) 776 if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
776 return; 777 return;
777 778
778 ucb = uprobe_buffer_get(); 779 if (ftrace_trigger_soft_disabled(ftrace_file))
779 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); 780 return;
780 781
782 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
781 size = esize + tu->tp.size + dsize; 783 size = esize + tu->tp.size + dsize;
782 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 784 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
783 size, 0, 0); 785 call->event.type, size, 0, 0);
784 if (!event) 786 if (!event)
785 goto out; 787 return;
786 788
787 entry = ring_buffer_event_data(event); 789 entry = ring_buffer_event_data(event);
788 if (is_ret_probe(tu)) { 790 if (is_ret_probe(tu)) {
@@ -796,25 +798,36 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
796 798
797 memcpy(data, ucb->buf, tu->tp.size + dsize); 799 memcpy(data, ucb->buf, tu->tp.size + dsize);
798 800
799 if (!call_filter_check_discard(call, entry, buffer, event)) 801 event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0);
800 trace_buffer_unlock_commit(buffer, event, 0, 0);
801
802out:
803 uprobe_buffer_put(ucb);
804} 802}
805 803
806/* uprobe handler */ 804/* uprobe handler */
807static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 805static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
806 struct uprobe_cpu_buffer *ucb, int dsize)
808{ 807{
809 if (!is_ret_probe(tu)) 808 struct event_file_link *link;
810 uprobe_trace_print(tu, 0, regs); 809
810 if (is_ret_probe(tu))
811 return 0;
812
813 rcu_read_lock();
814 list_for_each_entry_rcu(link, &tu->tp.files, list)
815 __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
816 rcu_read_unlock();
817
811 return 0; 818 return 0;
812} 819}
813 820
814static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, 821static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
815 struct pt_regs *regs) 822 struct pt_regs *regs,
823 struct uprobe_cpu_buffer *ucb, int dsize)
816{ 824{
817 uprobe_trace_print(tu, func, regs); 825 struct event_file_link *link;
826
827 rcu_read_lock();
828 list_for_each_entry_rcu(link, &tu->tp.files, list)
829 __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
830 rcu_read_unlock();
818} 831}
819 832
820/* Event entry printers */ 833/* Event entry printers */
@@ -861,12 +874,24 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
861 struct mm_struct *mm); 874 struct mm_struct *mm);
862 875
863static int 876static int
864probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) 877probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
878 filter_func_t filter)
865{ 879{
866 int ret = 0; 880 bool enabled = trace_probe_is_enabled(&tu->tp);
881 struct event_file_link *link = NULL;
882 int ret;
883
884 if (file) {
885 link = kmalloc(sizeof(*link), GFP_KERNEL);
886 if (!link)
887 return -ENOMEM;
867 888
868 if (trace_probe_is_enabled(&tu->tp)) 889 link->file = file;
869 return -EINTR; 890 list_add_tail_rcu(&link->list, &tu->tp.files);
891
892 tu->tp.flags |= TP_FLAG_TRACE;
893 } else
894 tu->tp.flags |= TP_FLAG_PROFILE;
870 895
871 ret = uprobe_buffer_enable(); 896 ret = uprobe_buffer_enable();
872 if (ret < 0) 897 if (ret < 0)
@@ -874,24 +899,49 @@ probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
874 899
875 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 900 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
876 901
877 tu->tp.flags |= flag; 902 if (enabled)
903 return 0;
904
878 tu->consumer.filter = filter; 905 tu->consumer.filter = filter;
879 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); 906 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
880 if (ret) 907 if (ret) {
881 tu->tp.flags &= ~flag; 908 if (file) {
909 list_del(&link->list);
910 kfree(link);
911 tu->tp.flags &= ~TP_FLAG_TRACE;
912 } else
913 tu->tp.flags &= ~TP_FLAG_PROFILE;
914 }
882 915
883 return ret; 916 return ret;
884} 917}
885 918
886static void probe_event_disable(struct trace_uprobe *tu, int flag) 919static void
920probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
887{ 921{
888 if (!trace_probe_is_enabled(&tu->tp)) 922 if (!trace_probe_is_enabled(&tu->tp))
889 return; 923 return;
890 924
925 if (file) {
926 struct event_file_link *link;
927
928 link = find_event_file_link(&tu->tp, file);
929 if (!link)
930 return;
931
932 list_del_rcu(&link->list);
933 /* synchronize with u{,ret}probe_trace_func */
934 synchronize_sched();
935 kfree(link);
936
937 if (!list_empty(&tu->tp.files))
938 return;
939 }
940
891 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 941 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
892 942
893 uprobe_unregister(tu->inode, tu->offset, &tu->consumer); 943 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
894 tu->tp.flags &= ~flag; 944 tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
895 945
896 uprobe_buffer_disable(); 946 uprobe_buffer_disable();
897} 947}
@@ -1014,31 +1064,24 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
1014 return ret; 1064 return ret;
1015} 1065}
1016 1066
1017static void uprobe_perf_print(struct trace_uprobe *tu, 1067static void __uprobe_perf_func(struct trace_uprobe *tu,
1018 unsigned long func, struct pt_regs *regs) 1068 unsigned long func, struct pt_regs *regs,
1069 struct uprobe_cpu_buffer *ucb, int dsize)
1019{ 1070{
1020 struct ftrace_event_call *call = &tu->tp.call; 1071 struct ftrace_event_call *call = &tu->tp.call;
1021 struct uprobe_trace_entry_head *entry; 1072 struct uprobe_trace_entry_head *entry;
1022 struct hlist_head *head; 1073 struct hlist_head *head;
1023 struct uprobe_cpu_buffer *ucb;
1024 void *data; 1074 void *data;
1025 int size, dsize, esize; 1075 int size, esize;
1026 int rctx; 1076 int rctx;
1027 1077
1028 dsize = __get_data_size(&tu->tp, regs);
1029 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 1078 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1030 1079
1031 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1032 return;
1033
1034 size = esize + tu->tp.size + dsize; 1080 size = esize + tu->tp.size + dsize;
1035 size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); 1081 size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
1036 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 1082 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
1037 return; 1083 return;
1038 1084
1039 ucb = uprobe_buffer_get();
1040 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
1041
1042 preempt_disable(); 1085 preempt_disable();
1043 head = this_cpu_ptr(call->perf_events); 1086 head = this_cpu_ptr(call->perf_events);
1044 if (hlist_empty(head)) 1087 if (hlist_empty(head))
@@ -1068,46 +1111,49 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
1068 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1111 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1069 out: 1112 out:
1070 preempt_enable(); 1113 preempt_enable();
1071 uprobe_buffer_put(ucb);
1072} 1114}
1073 1115
1074/* uprobe profile handler */ 1116/* uprobe profile handler */
1075static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 1117static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
1118 struct uprobe_cpu_buffer *ucb, int dsize)
1076{ 1119{
1077 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) 1120 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
1078 return UPROBE_HANDLER_REMOVE; 1121 return UPROBE_HANDLER_REMOVE;
1079 1122
1080 if (!is_ret_probe(tu)) 1123 if (!is_ret_probe(tu))
1081 uprobe_perf_print(tu, 0, regs); 1124 __uprobe_perf_func(tu, 0, regs, ucb, dsize);
1082 return 0; 1125 return 0;
1083} 1126}
1084 1127
1085static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, 1128static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
1086 struct pt_regs *regs) 1129 struct pt_regs *regs,
1130 struct uprobe_cpu_buffer *ucb, int dsize)
1087{ 1131{
1088 uprobe_perf_print(tu, func, regs); 1132 __uprobe_perf_func(tu, func, regs, ucb, dsize);
1089} 1133}
1090#endif /* CONFIG_PERF_EVENTS */ 1134#endif /* CONFIG_PERF_EVENTS */
1091 1135
1092static 1136static int
1093int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) 1137trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
1138 void *data)
1094{ 1139{
1095 struct trace_uprobe *tu = event->data; 1140 struct trace_uprobe *tu = event->data;
1141 struct ftrace_event_file *file = data;
1096 1142
1097 switch (type) { 1143 switch (type) {
1098 case TRACE_REG_REGISTER: 1144 case TRACE_REG_REGISTER:
1099 return probe_event_enable(tu, TP_FLAG_TRACE, NULL); 1145 return probe_event_enable(tu, file, NULL);
1100 1146
1101 case TRACE_REG_UNREGISTER: 1147 case TRACE_REG_UNREGISTER:
1102 probe_event_disable(tu, TP_FLAG_TRACE); 1148 probe_event_disable(tu, file);
1103 return 0; 1149 return 0;
1104 1150
1105#ifdef CONFIG_PERF_EVENTS 1151#ifdef CONFIG_PERF_EVENTS
1106 case TRACE_REG_PERF_REGISTER: 1152 case TRACE_REG_PERF_REGISTER:
1107 return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); 1153 return probe_event_enable(tu, NULL, uprobe_perf_filter);
1108 1154
1109 case TRACE_REG_PERF_UNREGISTER: 1155 case TRACE_REG_PERF_UNREGISTER:
1110 probe_event_disable(tu, TP_FLAG_PROFILE); 1156 probe_event_disable(tu, NULL);
1111 return 0; 1157 return 0;
1112 1158
1113 case TRACE_REG_PERF_OPEN: 1159 case TRACE_REG_PERF_OPEN:
@@ -1127,8 +1173,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
1127{ 1173{
1128 struct trace_uprobe *tu; 1174 struct trace_uprobe *tu;
1129 struct uprobe_dispatch_data udd; 1175 struct uprobe_dispatch_data udd;
1176 struct uprobe_cpu_buffer *ucb;
1177 int dsize, esize;
1130 int ret = 0; 1178 int ret = 0;
1131 1179
1180
1132 tu = container_of(con, struct trace_uprobe, consumer); 1181 tu = container_of(con, struct trace_uprobe, consumer);
1133 tu->nhit++; 1182 tu->nhit++;
1134 1183
@@ -1137,13 +1186,29 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
1137 1186
1138 current->utask->vaddr = (unsigned long) &udd; 1187 current->utask->vaddr = (unsigned long) &udd;
1139 1188
1189#ifdef CONFIG_PERF_EVENTS
1190 if ((tu->tp.flags & TP_FLAG_TRACE) == 0 &&
1191 !uprobe_perf_filter(&tu->consumer, 0, current->mm))
1192 return UPROBE_HANDLER_REMOVE;
1193#endif
1194
1195 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1196 return 0;
1197
1198 dsize = __get_data_size(&tu->tp, regs);
1199 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1200
1201 ucb = uprobe_buffer_get();
1202 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
1203
1140 if (tu->tp.flags & TP_FLAG_TRACE) 1204 if (tu->tp.flags & TP_FLAG_TRACE)
1141 ret |= uprobe_trace_func(tu, regs); 1205 ret |= uprobe_trace_func(tu, regs, ucb, dsize);
1142 1206
1143#ifdef CONFIG_PERF_EVENTS 1207#ifdef CONFIG_PERF_EVENTS
1144 if (tu->tp.flags & TP_FLAG_PROFILE) 1208 if (tu->tp.flags & TP_FLAG_PROFILE)
1145 ret |= uprobe_perf_func(tu, regs); 1209 ret |= uprobe_perf_func(tu, regs, ucb, dsize);
1146#endif 1210#endif
1211 uprobe_buffer_put(ucb);
1147 return ret; 1212 return ret;
1148} 1213}
1149 1214
@@ -1152,6 +1217,8 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
1152{ 1217{
1153 struct trace_uprobe *tu; 1218 struct trace_uprobe *tu;
1154 struct uprobe_dispatch_data udd; 1219 struct uprobe_dispatch_data udd;
1220 struct uprobe_cpu_buffer *ucb;
1221 int dsize, esize;
1155 1222
1156 tu = container_of(con, struct trace_uprobe, consumer); 1223 tu = container_of(con, struct trace_uprobe, consumer);
1157 1224
@@ -1160,13 +1227,23 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
1160 1227
1161 current->utask->vaddr = (unsigned long) &udd; 1228 current->utask->vaddr = (unsigned long) &udd;
1162 1229
1230 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1231 return 0;
1232
1233 dsize = __get_data_size(&tu->tp, regs);
1234 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1235
1236 ucb = uprobe_buffer_get();
1237 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
1238
1163 if (tu->tp.flags & TP_FLAG_TRACE) 1239 if (tu->tp.flags & TP_FLAG_TRACE)
1164 uretprobe_trace_func(tu, func, regs); 1240 uretprobe_trace_func(tu, func, regs, ucb, dsize);
1165 1241
1166#ifdef CONFIG_PERF_EVENTS 1242#ifdef CONFIG_PERF_EVENTS
1167 if (tu->tp.flags & TP_FLAG_PROFILE) 1243 if (tu->tp.flags & TP_FLAG_PROFILE)
1168 uretprobe_perf_func(tu, func, regs); 1244 uretprobe_perf_func(tu, func, regs, ucb, dsize);
1169#endif 1245#endif
1246 uprobe_buffer_put(ucb);
1170 return 0; 1247 return 0;
1171} 1248}
1172 1249
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 031cc5655a51..50f8329c2042 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -62,14 +62,12 @@ struct tracepoint_entry {
62 struct hlist_node hlist; 62 struct hlist_node hlist;
63 struct tracepoint_func *funcs; 63 struct tracepoint_func *funcs;
64 int refcount; /* Number of times armed. 0 if disarmed. */ 64 int refcount; /* Number of times armed. 0 if disarmed. */
65 int enabled; /* Tracepoint enabled */
65 char name[0]; 66 char name[0];
66}; 67};
67 68
68struct tp_probes { 69struct tp_probes {
69 union { 70 struct rcu_head rcu;
70 struct rcu_head rcu;
71 struct list_head list;
72 } u;
73 struct tracepoint_func probes[0]; 71 struct tracepoint_func probes[0];
74}; 72};
75 73
@@ -82,7 +80,7 @@ static inline void *allocate_probes(int count)
82 80
83static void rcu_free_old_probes(struct rcu_head *head) 81static void rcu_free_old_probes(struct rcu_head *head)
84{ 82{
85 kfree(container_of(head, struct tp_probes, u.rcu)); 83 kfree(container_of(head, struct tp_probes, rcu));
86} 84}
87 85
88static inline void release_probes(struct tracepoint_func *old) 86static inline void release_probes(struct tracepoint_func *old)
@@ -90,7 +88,7 @@ static inline void release_probes(struct tracepoint_func *old)
90 if (old) { 88 if (old) {
91 struct tp_probes *tp_probes = container_of(old, 89 struct tp_probes *tp_probes = container_of(old,
92 struct tp_probes, probes[0]); 90 struct tp_probes, probes[0]);
93 call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); 91 call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes);
94 } 92 }
95} 93}
96 94
@@ -237,6 +235,7 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
237 memcpy(&e->name[0], name, name_len); 235 memcpy(&e->name[0], name, name_len);
238 e->funcs = NULL; 236 e->funcs = NULL;
239 e->refcount = 0; 237 e->refcount = 0;
238 e->enabled = 0;
240 hlist_add_head(&e->hlist, head); 239 hlist_add_head(&e->hlist, head);
241 return e; 240 return e;
242} 241}
@@ -316,6 +315,7 @@ static void tracepoint_update_probe_range(struct tracepoint * const *begin,
316 if (mark_entry) { 315 if (mark_entry) {
317 set_tracepoint(&mark_entry, *iter, 316 set_tracepoint(&mark_entry, *iter,
318 !!mark_entry->refcount); 317 !!mark_entry->refcount);
318 mark_entry->enabled = !!mark_entry->refcount;
319 } else { 319 } else {
320 disable_tracepoint(*iter); 320 disable_tracepoint(*iter);
321 } 321 }
@@ -373,13 +373,26 @@ tracepoint_add_probe(const char *name, void *probe, void *data)
373 * tracepoint_probe_register - Connect a probe to a tracepoint 373 * tracepoint_probe_register - Connect a probe to a tracepoint
374 * @name: tracepoint name 374 * @name: tracepoint name
375 * @probe: probe handler 375 * @probe: probe handler
376 * @data: probe private data
377 *
378 * Returns:
379 * - 0 if the probe was successfully registered, and tracepoint
380 * callsites are currently loaded for that probe,
381 * - -ENODEV if the probe was successfully registered, but no tracepoint
382 * callsite is currently loaded for that probe,
383 * - other negative error value on error.
384 *
385 * When tracepoint_probe_register() returns either 0 or -ENODEV,
386 * parameters @name, @probe, and @data may be used by the tracepoint
387 * infrastructure until the probe is unregistered.
376 * 388 *
377 * Returns 0 if ok, error value on error.
378 * The probe address must at least be aligned on the architecture pointer size. 389 * The probe address must at least be aligned on the architecture pointer size.
379 */ 390 */
380int tracepoint_probe_register(const char *name, void *probe, void *data) 391int tracepoint_probe_register(const char *name, void *probe, void *data)
381{ 392{
382 struct tracepoint_func *old; 393 struct tracepoint_func *old;
394 struct tracepoint_entry *entry;
395 int ret = 0;
383 396
384 mutex_lock(&tracepoints_mutex); 397 mutex_lock(&tracepoints_mutex);
385 old = tracepoint_add_probe(name, probe, data); 398 old = tracepoint_add_probe(name, probe, data);
@@ -388,9 +401,13 @@ int tracepoint_probe_register(const char *name, void *probe, void *data)
388 return PTR_ERR(old); 401 return PTR_ERR(old);
389 } 402 }
390 tracepoint_update_probes(); /* may update entry */ 403 tracepoint_update_probes(); /* may update entry */
404 entry = get_tracepoint(name);
405 /* Make sure the entry was enabled */
406 if (!entry || !entry->enabled)
407 ret = -ENODEV;
391 mutex_unlock(&tracepoints_mutex); 408 mutex_unlock(&tracepoints_mutex);
392 release_probes(old); 409 release_probes(old);
393 return 0; 410 return ret;
394} 411}
395EXPORT_SYMBOL_GPL(tracepoint_probe_register); 412EXPORT_SYMBOL_GPL(tracepoint_probe_register);
396 413
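With the enabled flag tracked above, tracepoint_probe_register() now distinguishes "registered and live" (0) from "registered but no callsite currently loaded" (-ENODEV). A minimal sketch of a caller that treats -ENODEV as non-fatal; the tracepoint name "my_driver_event" and the probe prototype are hypothetical and must match a real tracepoint in practice:

#include <linux/module.h>
#include <linux/tracepoint.h>

/* the first argument is the private data passed at registration time */
static void my_probe(void *data, int value)
{
        pr_debug("my_driver_event fired: %d\n", value);
}

static int __init my_probe_init(void)
{
        int ret;

        ret = tracepoint_probe_register("my_driver_event", my_probe, NULL);
        if (ret == -ENODEV) {
                /*
                 * Registered, but no callsite is loaded yet (e.g. the
                 * module providing the tracepoint is not inserted).
                 * The registration remains valid, so treat as success.
                 */
                ret = 0;
        }
        return ret;
}

static void __exit my_probe_exit(void)
{
        tracepoint_probe_unregister("my_driver_event", my_probe, NULL);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");
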
@@ -415,6 +432,7 @@ tracepoint_remove_probe(const char *name, void *probe, void *data)
415 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 432 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
416 * @name: tracepoint name 433 * @name: tracepoint name
417 * @probe: probe function pointer 434 * @probe: probe function pointer
435 * @data: probe private data
418 * 436 *
419 * We do not need to call a synchronize_sched to make sure the probes have 437 * We do not need to call a synchronize_sched to make sure the probes have
420 * finished running before doing a module unload, because the module unload 438 * finished running before doing a module unload, because the module unload
@@ -438,197 +456,6 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data)
438} 456}
439EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); 457EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
440 458
441static LIST_HEAD(old_probes);
442static int need_update;
443
444static void tracepoint_add_old_probes(void *old)
445{
446 need_update = 1;
447 if (old) {
448 struct tp_probes *tp_probes = container_of(old,
449 struct tp_probes, probes[0]);
450 list_add(&tp_probes->u.list, &old_probes);
451 }
452}
453
454/**
455 * tracepoint_probe_register_noupdate - register a probe but not connect
456 * @name: tracepoint name
457 * @probe: probe handler
458 *
459 * caller must call tracepoint_probe_update_all()
460 */
461int tracepoint_probe_register_noupdate(const char *name, void *probe,
462 void *data)
463{
464 struct tracepoint_func *old;
465
466 mutex_lock(&tracepoints_mutex);
467 old = tracepoint_add_probe(name, probe, data);
468 if (IS_ERR(old)) {
469 mutex_unlock(&tracepoints_mutex);
470 return PTR_ERR(old);
471 }
472 tracepoint_add_old_probes(old);
473 mutex_unlock(&tracepoints_mutex);
474 return 0;
475}
476EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
477
478/**
479 * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect
480 * @name: tracepoint name
481 * @probe: probe function pointer
482 *
483 * caller must call tracepoint_probe_update_all()
484 */
485int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
486 void *data)
487{
488 struct tracepoint_func *old;
489
490 mutex_lock(&tracepoints_mutex);
491 old = tracepoint_remove_probe(name, probe, data);
492 if (IS_ERR(old)) {
493 mutex_unlock(&tracepoints_mutex);
494 return PTR_ERR(old);
495 }
496 tracepoint_add_old_probes(old);
497 mutex_unlock(&tracepoints_mutex);
498 return 0;
499}
500EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
501
502/**
503 * tracepoint_probe_update_all - update tracepoints
504 */
505void tracepoint_probe_update_all(void)
506{
507 LIST_HEAD(release_probes);
508 struct tp_probes *pos, *next;
509
510 mutex_lock(&tracepoints_mutex);
511 if (!need_update) {
512 mutex_unlock(&tracepoints_mutex);
513 return;
514 }
515 if (!list_empty(&old_probes))
516 list_replace_init(&old_probes, &release_probes);
517 need_update = 0;
518 tracepoint_update_probes();
519 mutex_unlock(&tracepoints_mutex);
520 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
521 list_del(&pos->u.list);
522 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
523 }
524}
525EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
526
527/**
528 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
529 * @tracepoint: current tracepoints (in), next tracepoint (out)
530 * @begin: beginning of the range
531 * @end: end of the range
532 *
533 * Returns whether a next tracepoint has been found (1) or not (0).
534 * Will return the first tracepoint in the range if the input tracepoint is
535 * NULL.
536 */
537static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
538 struct tracepoint * const *begin, struct tracepoint * const *end)
539{
540 if (!*tracepoint && begin != end) {
541 *tracepoint = begin;
542 return 1;
543 }
544 if (*tracepoint >= begin && *tracepoint < end)
545 return 1;
546 return 0;
547}
548
549#ifdef CONFIG_MODULES
550static void tracepoint_get_iter(struct tracepoint_iter *iter)
551{
552 int found = 0;
553 struct tp_module *iter_mod;
554
555 /* Core kernel tracepoints */
556 if (!iter->module) {
557 found = tracepoint_get_iter_range(&iter->tracepoint,
558 __start___tracepoints_ptrs,
559 __stop___tracepoints_ptrs);
560 if (found)
561 goto end;
562 }
563 /* Tracepoints in modules */
564 mutex_lock(&tracepoints_mutex);
565 list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
566 /*
567 * Sorted module list
568 */
569 if (iter_mod < iter->module)
570 continue;
571 else if (iter_mod > iter->module)
572 iter->tracepoint = NULL;
573 found = tracepoint_get_iter_range(&iter->tracepoint,
574 iter_mod->tracepoints_ptrs,
575 iter_mod->tracepoints_ptrs
576 + iter_mod->num_tracepoints);
577 if (found) {
578 iter->module = iter_mod;
579 break;
580 }
581 }
582 mutex_unlock(&tracepoints_mutex);
583end:
584 if (!found)
585 tracepoint_iter_reset(iter);
586}
587#else /* CONFIG_MODULES */
588static void tracepoint_get_iter(struct tracepoint_iter *iter)
589{
590 int found = 0;
591
592 /* Core kernel tracepoints */
593 found = tracepoint_get_iter_range(&iter->tracepoint,
594 __start___tracepoints_ptrs,
595 __stop___tracepoints_ptrs);
596 if (!found)
597 tracepoint_iter_reset(iter);
598}
599#endif /* CONFIG_MODULES */
600
601void tracepoint_iter_start(struct tracepoint_iter *iter)
602{
603 tracepoint_get_iter(iter);
604}
605EXPORT_SYMBOL_GPL(tracepoint_iter_start);
606
607void tracepoint_iter_next(struct tracepoint_iter *iter)
608{
609 iter->tracepoint++;
610 /*
611 * iter->tracepoint may be invalid because we blindly incremented it.
612 * Make sure it is valid by marshalling on the tracepoints, getting the
613 * tracepoints from following modules if necessary.
614 */
615 tracepoint_get_iter(iter);
616}
617EXPORT_SYMBOL_GPL(tracepoint_iter_next);
618
619void tracepoint_iter_stop(struct tracepoint_iter *iter)
620{
621}
622EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
623
624void tracepoint_iter_reset(struct tracepoint_iter *iter)
625{
626#ifdef CONFIG_MODULES
627 iter->module = NULL;
628#endif /* CONFIG_MODULES */
629 iter->tracepoint = NULL;
630}
631EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
632 459
633#ifdef CONFIG_MODULES 460#ifdef CONFIG_MODULES
634bool trace_module_has_bad_taint(struct module *mod) 461bool trace_module_has_bad_taint(struct module *mod)
@@ -638,9 +465,12 @@ bool trace_module_has_bad_taint(struct module *mod)
638 465
639static int tracepoint_module_coming(struct module *mod) 466static int tracepoint_module_coming(struct module *mod)
640{ 467{
641 struct tp_module *tp_mod, *iter; 468 struct tp_module *tp_mod;
642 int ret = 0; 469 int ret = 0;
643 470
471 if (!mod->num_tracepoints)
472 return 0;
473
644 /* 474 /*
645 * We skip modules that taint the kernel, especially those with different 475 * We skip modules that taint the kernel, especially those with different
646 * module headers (for forced load), to make sure we don't cause a crash. 476 * module headers (for forced load), to make sure we don't cause a crash.
@@ -656,23 +486,7 @@ static int tracepoint_module_coming(struct module *mod)
656 } 486 }
657 tp_mod->num_tracepoints = mod->num_tracepoints; 487 tp_mod->num_tracepoints = mod->num_tracepoints;
658 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; 488 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
659 489 list_add_tail(&tp_mod->list, &tracepoint_module_list);
660 /*
661 * tracepoint_module_list is kept sorted by struct module pointer
662 * address for iteration on tracepoints from a seq_file that can release
663 * the mutex between calls.
664 */
665 list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
666 BUG_ON(iter == tp_mod); /* Should never be in the list twice */
667 if (iter < tp_mod) {
668 /* We belong to the location right after iter. */
669 list_add(&tp_mod->list, &iter->list);
670 goto module_added;
671 }
672 }
673 /* We belong to the beginning of the list */
674 list_add(&tp_mod->list, &tracepoint_module_list);
675module_added:
676 tracepoint_update_probe_range(mod->tracepoints_ptrs, 490 tracepoint_update_probe_range(mod->tracepoints_ptrs,
677 mod->tracepoints_ptrs + mod->num_tracepoints); 491 mod->tracepoints_ptrs + mod->num_tracepoints);
678end: 492end:
@@ -684,6 +498,9 @@ static int tracepoint_module_going(struct module *mod)
684{ 498{
685 struct tp_module *pos; 499 struct tp_module *pos;
686 500
501 if (!mod->num_tracepoints)
502 return 0;
503
687 mutex_lock(&tracepoints_mutex); 504 mutex_lock(&tracepoints_mutex);
688 tracepoint_update_probe_range(mod->tracepoints_ptrs, 505 tracepoint_update_probe_range(mod->tracepoints_ptrs,
689 mod->tracepoints_ptrs + mod->num_tracepoints); 506 mod->tracepoints_ptrs + mod->num_tracepoints);
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index cb00829bb466..595d7fd795e1 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -30,7 +30,6 @@ struct hugetlb_cgroup {
30#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) 30#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
31#define MEMFILE_ATTR(val) ((val) & 0xffff) 31#define MEMFILE_ATTR(val) ((val) & 0xffff)
32 32
33struct cgroup_subsys hugetlb_subsys __read_mostly;
34static struct hugetlb_cgroup *root_h_cgroup __read_mostly; 33static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
35 34
36static inline 35static inline
@@ -42,7 +41,7 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
42static inline 41static inline
43struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) 42struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
44{ 43{
45 return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id)); 44 return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
46} 45}
47 46
48static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) 47static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
@@ -255,7 +254,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
255} 254}
256 255
257static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, 256static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
258 struct cftype *cft, const char *buffer) 257 struct cftype *cft, char *buffer)
259{ 258{
260 int idx, name, ret; 259 int idx, name, ret;
261 unsigned long long val; 260 unsigned long long val;
@@ -358,7 +357,7 @@ static void __init __hugetlb_cgroup_file_init(int idx)
358 cft = &h->cgroup_files[4]; 357 cft = &h->cgroup_files[4];
359 memset(cft, 0, sizeof(*cft)); 358 memset(cft, 0, sizeof(*cft));
360 359
361 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); 360 WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files));
362 361
363 return; 362 return;
364} 363}
@@ -402,10 +401,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
402 return; 401 return;
403} 402}
404 403
405struct cgroup_subsys hugetlb_subsys = { 404struct cgroup_subsys hugetlb_cgrp_subsys = {
406 .name = "hugetlb",
407 .css_alloc = hugetlb_cgroup_css_alloc, 405 .css_alloc = hugetlb_cgroup_css_alloc,
408 .css_offline = hugetlb_cgroup_css_offline, 406 .css_offline = hugetlb_cgroup_css_offline,
409 .css_free = hugetlb_cgroup_css_free, 407 .css_free = hugetlb_cgroup_css_free,
410 .subsys_id = hugetlb_subsys_id,
411}; 408};
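The hugetlb hunks drop the .name and .subsys_id fields: after this series a controller is identified by its symbol name, <name>_cgrp_subsys, and by a generated <name>_cgrp_id. Both are stamped out from the controller list in include/linux/cgroup_subsys.h (also touched by this diff); the sketch below shows the approximate shape of that generation, not the exact upstream text:

/* include/linux/cgroup_subsys.h names each controller once, e.g.: */
#if IS_ENABLED(CONFIG_CGROUP_HUGETLB)
SUBSYS(hugetlb)
#endif

/* include/linux/cgroup.h then expands the list twice, roughly as: */
#define SUBSYS(_x) _x ## _cgrp_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
        CGROUP_SUBSYS_COUNT,
};
#undef SUBSYS

#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS
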
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5b6b0039f725..dcc8153a1681 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -66,8 +66,8 @@
66 66
67#include <trace/events/vmscan.h> 67#include <trace/events/vmscan.h>
68 68
69struct cgroup_subsys mem_cgroup_subsys __read_mostly; 69struct cgroup_subsys memory_cgrp_subsys __read_mostly;
70EXPORT_SYMBOL(mem_cgroup_subsys); 70EXPORT_SYMBOL(memory_cgrp_subsys);
71 71
72#define MEM_CGROUP_RECLAIM_RETRIES 5 72#define MEM_CGROUP_RECLAIM_RETRIES 5
73static struct mem_cgroup *root_mem_cgroup __read_mostly; 73static struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -538,7 +538,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
538{ 538{
539 struct cgroup_subsys_state *css; 539 struct cgroup_subsys_state *css;
540 540
541 css = css_from_id(id - 1, &mem_cgroup_subsys); 541 css = css_from_id(id - 1, &memory_cgrp_subsys);
542 return mem_cgroup_from_css(css); 542 return mem_cgroup_from_css(css);
543} 543}
544 544
@@ -1072,7 +1072,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1072 if (unlikely(!p)) 1072 if (unlikely(!p))
1073 return NULL; 1073 return NULL;
1074 1074
1075 return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); 1075 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1076} 1076}
1077 1077
1078struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 1078struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -1683,15 +1683,8 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1683 */ 1683 */
1684void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1684void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1685{ 1685{
1686 /* 1686 /* oom_info_lock ensures that parallel ooms do not interleave */
1687 * protects memcg_name and makes sure that parallel ooms do not
1688 * interleave
1689 */
1690 static DEFINE_MUTEX(oom_info_lock); 1687 static DEFINE_MUTEX(oom_info_lock);
1691 struct cgroup *task_cgrp;
1692 struct cgroup *mem_cgrp;
1693 static char memcg_name[PATH_MAX];
1694 int ret;
1695 struct mem_cgroup *iter; 1688 struct mem_cgroup *iter;
1696 unsigned int i; 1689 unsigned int i;
1697 1690
@@ -1701,36 +1694,14 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1701 mutex_lock(&oom_info_lock); 1694 mutex_lock(&oom_info_lock);
1702 rcu_read_lock(); 1695 rcu_read_lock();
1703 1696
1704 mem_cgrp = memcg->css.cgroup; 1697 pr_info("Task in ");
1705 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1698 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1706 1699 pr_info(" killed as a result of limit of ");
1707 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1700 pr_cont_cgroup_path(memcg->css.cgroup);
1708 if (ret < 0) { 1701 pr_info("\n");
1709 /*
1710 * Unfortunately, we are unable to convert to a useful name
1711 * But we'll still print out the usage information
1712 */
1713 rcu_read_unlock();
1714 goto done;
1715 }
1716 rcu_read_unlock();
1717
1718 pr_info("Task in %s killed", memcg_name);
1719 1702
1720 rcu_read_lock();
1721 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1722 if (ret < 0) {
1723 rcu_read_unlock();
1724 goto done;
1725 }
1726 rcu_read_unlock(); 1703 rcu_read_unlock();
1727 1704
1728 /*
1729 * Continues from above, so we don't need an KERN_ level
1730 */
1731 pr_cont(" as a result of limit of %s\n", memcg_name);
1732done:
1733
1734 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1705 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1735 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1706 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1736 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1707 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
@@ -1745,13 +1716,8 @@ done:
1745 res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1716 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1746 1717
1747 for_each_mem_cgroup_tree(iter, memcg) { 1718 for_each_mem_cgroup_tree(iter, memcg) {
1748 pr_info("Memory cgroup stats"); 1719 pr_info("Memory cgroup stats for ");
1749 1720 pr_cont_cgroup_path(iter->css.cgroup);
1750 rcu_read_lock();
1751 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1752 if (!ret)
1753 pr_cont(" for %s", memcg_name);
1754 rcu_read_unlock();
1755 pr_cont(":"); 1721 pr_cont(":");
1756 1722
1757 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1723 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
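The OOM report above no longer copies cgroup paths into a static PATH_MAX buffer under RCU; it prints them straight into the current console line with the new pr_cont_cgroup_path() helper. A minimal sketch of the pattern, with an illustrative function name and message:

#include <linux/cgroup.h>
#include <linux/printk.h>

static void report_cgroup(struct cgroup *cgrp)
{
        pr_info("cgroup under pressure: ");
        pr_cont_cgroup_path(cgrp);      /* appends the full path to the same line */
        pr_cont("\n");
}
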
@@ -3401,7 +3367,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3401 struct kmem_cache *s) 3367 struct kmem_cache *s)
3402{ 3368{
3403 struct kmem_cache *new = NULL; 3369 struct kmem_cache *new = NULL;
3404 static char *tmp_name = NULL; 3370 static char *tmp_path = NULL, *tmp_name = NULL;
3405 static DEFINE_MUTEX(mutex); /* protects tmp_name */ 3371 static DEFINE_MUTEX(mutex); /* protects tmp_name */
3406 3372
3407 BUG_ON(!memcg_can_account_kmem(memcg)); 3373 BUG_ON(!memcg_can_account_kmem(memcg));
@@ -3413,18 +3379,20 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3413 * This static temporary buffer is used to prevent from 3379 * This static temporary buffer is used to prevent from
3414 * pointless shortliving allocation. 3380 * pointless shortliving allocation.
3415 */ 3381 */
3416 if (!tmp_name) { 3382 if (!tmp_path || !tmp_name) {
3417 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); 3383 if (!tmp_path)
3384 tmp_path = kmalloc(PATH_MAX, GFP_KERNEL);
3418 if (!tmp_name) 3385 if (!tmp_name)
3386 tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
3387 if (!tmp_path || !tmp_name)
3419 goto out; 3388 goto out;
3420 } 3389 }
3421 3390
3422 rcu_read_lock(); 3391 cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1);
3423 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, 3392 snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name,
3424 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); 3393 memcg_cache_id(memcg), tmp_name);
3425 rcu_read_unlock();
3426 3394
3427 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, 3395 new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align,
3428 (s->flags & ~SLAB_PANIC), s->ctor, s); 3396 (s->flags & ~SLAB_PANIC), s->ctor, s);
3429 if (new) 3397 if (new)
3430 new->allocflags |= __GFP_KMEMCG; 3398 new->allocflags |= __GFP_KMEMCG;
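Because cgroup names now live in kernfs, cgroup_name() copies the name into a caller-supplied buffer instead of handing back a pointer owned by the cgroup, which is why the hunk above adds a separate NAME_MAX + 1 scratch buffer and drops the rcu_read_lock() pair. A minimal sketch of the new calling convention; the helper name and label format are illustrative:

#include <linux/cgroup.h>
#include <linux/kernel.h>       /* snprintf() */
#include <linux/limits.h>       /* NAME_MAX */

static void label_for_cgroup(struct cgroup *cgrp, char *label, size_t len)
{
        char name[NAME_MAX + 1];

        /* copies the last path component into 'name'; no RCU section needed */
        cgroup_name(cgrp, name, sizeof(name));
        snprintf(label, len, "cache(%s)", name);
}
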
@@ -4990,7 +4958,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4990 struct cgroup *cgrp = memcg->css.cgroup; 4958 struct cgroup *cgrp = memcg->css.cgroup;
4991 4959
4992 /* returns EBUSY if there is a task or if we come here twice. */ 4960 /* returns EBUSY if there is a task or if we come here twice. */
4993 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 4961 if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children))
4994 return -EBUSY; 4962 return -EBUSY;
4995 4963
4996 /* we call try-to-free pages for make this cgroup empty */ 4964 /* we call try-to-free pages for make this cgroup empty */
@@ -5172,7 +5140,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5172 * of course permitted. 5140 * of course permitted.
5173 */ 5141 */
5174 mutex_lock(&memcg_create_mutex); 5142 mutex_lock(&memcg_create_mutex);
5175 if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg)) 5143 if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg))
5176 err = -EBUSY; 5144 err = -EBUSY;
5177 mutex_unlock(&memcg_create_mutex); 5145 mutex_unlock(&memcg_create_mutex);
5178 if (err) 5146 if (err)
@@ -5274,7 +5242,7 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5274 * RES_LIMIT. 5242 * RES_LIMIT.
5275 */ 5243 */
5276static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 5244static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5277 const char *buffer) 5245 char *buffer)
5278{ 5246{
5279 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5247 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5280 enum res_type type; 5248 enum res_type type;
@@ -6095,7 +6063,7 @@ static void memcg_event_ptable_queue_proc(struct file *file,
6095 * Interpretation of args is defined by control file implementation. 6063 * Interpretation of args is defined by control file implementation.
6096 */ 6064 */
6097static int memcg_write_event_control(struct cgroup_subsys_state *css, 6065static int memcg_write_event_control(struct cgroup_subsys_state *css,
6098 struct cftype *cft, const char *buffer) 6066 struct cftype *cft, char *buffer)
6099{ 6067{
6100 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6068 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6101 struct mem_cgroup_event *event; 6069 struct mem_cgroup_event *event;
@@ -6183,17 +6151,15 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,
6183 * automatically removed on cgroup destruction but the removal is 6151 * automatically removed on cgroup destruction but the removal is
6184 * asynchronous, so take an extra ref on @css. 6152 * asynchronous, so take an extra ref on @css.
6185 */ 6153 */
6186 rcu_read_lock(); 6154 cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent,
6187 6155 &memory_cgrp_subsys);
6188 ret = -EINVAL; 6156 ret = -EINVAL;
6189 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, 6157 if (IS_ERR(cfile_css))
6190 &mem_cgroup_subsys); 6158 goto out_put_cfile;
6191 if (cfile_css == css && css_tryget(css)) 6159 if (cfile_css != css) {
6192 ret = 0; 6160 css_put(cfile_css);
6193
6194 rcu_read_unlock();
6195 if (ret)
6196 goto out_put_cfile; 6161 goto out_put_cfile;
6162 }
6197 6163
6198 ret = event->register_event(memcg, event->eventfd, buffer); 6164 ret = event->register_event(memcg, event->eventfd, buffer);
6199 if (ret) 6165 if (ret)
@@ -6566,11 +6532,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6566 * unfortunate state in our controller. 6532 * unfortunate state in our controller.
6567 */ 6533 */
6568 if (parent != root_mem_cgroup) 6534 if (parent != root_mem_cgroup)
6569 mem_cgroup_subsys.broken_hierarchy = true; 6535 memory_cgrp_subsys.broken_hierarchy = true;
6570 } 6536 }
6571 mutex_unlock(&memcg_create_mutex); 6537 mutex_unlock(&memcg_create_mutex);
6572 6538
6573 return memcg_init_kmem(memcg, &mem_cgroup_subsys); 6539 return memcg_init_kmem(memcg, &memory_cgrp_subsys);
6574} 6540}
6575 6541
6576/* 6542/*
@@ -7272,9 +7238,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
7272 mem_cgroup_from_css(root_css)->use_hierarchy = true; 7238 mem_cgroup_from_css(root_css)->use_hierarchy = true;
7273} 7239}
7274 7240
7275struct cgroup_subsys mem_cgroup_subsys = { 7241struct cgroup_subsys memory_cgrp_subsys = {
7276 .name = "memory",
7277 .subsys_id = mem_cgroup_subsys_id,
7278 .css_alloc = mem_cgroup_css_alloc, 7242 .css_alloc = mem_cgroup_css_alloc,
7279 .css_online = mem_cgroup_css_online, 7243 .css_online = mem_cgroup_css_online,
7280 .css_offline = mem_cgroup_css_offline, 7244 .css_offline = mem_cgroup_css_offline,
@@ -7300,7 +7264,7 @@ __setup("swapaccount=", enable_swap_account);
7300 7264
7301static void __init memsw_file_init(void) 7265static void __init memsw_file_init(void)
7302{ 7266{
7303 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); 7267 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
7304} 7268}
7305 7269
7306static void __init enable_swap_cgroup(void) 7270static void __init enable_swap_cgroup(void)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 90002ea43638..35ef28acf137 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p)
145 return -EINVAL; 145 return -EINVAL;
146 146
147 css = mem_cgroup_css(mem); 147 css = mem_cgroup_css(mem);
148 /* root_mem_cgroup has NULL dentries */ 148 ino = cgroup_ino(css->cgroup);
149 if (!css->cgroup->dentry)
150 return -EINVAL;
151
152 ino = css->cgroup->dentry->d_inode->i_ino;
153 css_put(css); 149 css_put(css);
154 150
155 if (ino != hwpoison_filter_memcg) 151 if (!ino || ino != hwpoison_filter_memcg)
156 return -EINVAL; 152 return -EINVAL;
157 153
158 return 0; 154 return 0;
diff --git a/net/Kconfig b/net/Kconfig
index d1f6f968fc09..d92afe4204d9 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -243,7 +243,7 @@ config XPS
243 default y 243 default y
244 244
245config CGROUP_NET_PRIO 245config CGROUP_NET_PRIO
246 tristate "Network priority cgroup" 246 bool "Network priority cgroup"
247 depends on CGROUPS 247 depends on CGROUPS
248 ---help--- 248 ---help---
249 Cgroup subsystem for use in assigning processes to network priorities on 249 Cgroup subsystem for use in assigning processes to network priorities on
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 719efd541668..22931e1b99b4 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -23,7 +23,7 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state
23 23
24struct cgroup_cls_state *task_cls_state(struct task_struct *p) 24struct cgroup_cls_state *task_cls_state(struct task_struct *p)
25{ 25{
26 return css_cls_state(task_css(p, net_cls_subsys_id)); 26 return css_cls_state(task_css(p, net_cls_cgrp_id));
27} 27}
28EXPORT_SYMBOL_GPL(task_cls_state); 28EXPORT_SYMBOL_GPL(task_cls_state);
29 29
@@ -73,7 +73,7 @@ static void cgrp_attach(struct cgroup_subsys_state *css,
73 void *v = (void *)(unsigned long)cs->classid; 73 void *v = (void *)(unsigned long)cs->classid;
74 struct task_struct *p; 74 struct task_struct *p;
75 75
76 cgroup_taskset_for_each(p, css, tset) { 76 cgroup_taskset_for_each(p, tset) {
77 task_lock(p); 77 task_lock(p);
78 iterate_fd(p->files, 0, update_classid, v); 78 iterate_fd(p->files, 0, update_classid, v);
79 task_unlock(p); 79 task_unlock(p);
@@ -102,19 +102,10 @@ static struct cftype ss_files[] = {
102 { } /* terminate */ 102 { } /* terminate */
103}; 103};
104 104
105struct cgroup_subsys net_cls_subsys = { 105struct cgroup_subsys net_cls_cgrp_subsys = {
106 .name = "net_cls",
107 .css_alloc = cgrp_css_alloc, 106 .css_alloc = cgrp_css_alloc,
108 .css_online = cgrp_css_online, 107 .css_online = cgrp_css_online,
109 .css_free = cgrp_css_free, 108 .css_free = cgrp_css_free,
110 .attach = cgrp_attach, 109 .attach = cgrp_attach,
111 .subsys_id = net_cls_subsys_id,
112 .base_cftypes = ss_files, 110 .base_cftypes = ss_files,
113 .module = THIS_MODULE,
114}; 111};
115
116static int __init init_netclassid_cgroup(void)
117{
118 return cgroup_load_subsys(&net_cls_subsys);
119}
120__initcall(init_netclassid_cgroup);
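The cgrp_attach() hunk above (and net_prio_attach() below) also picks up the new cgroup_taskset_for_each() form, which no longer takes the css as a middle argument and simply walks every task in the set. A minimal sketch of an .attach callback using it; the callback name and per-task work are illustrative:

#include <linux/cgroup.h>
#include <linux/sched.h>

static void my_attach(struct cgroup_subsys_state *css,
                      struct cgroup_taskset *tset)
{
        struct task_struct *task;

        cgroup_taskset_for_each(task, tset) {
                /* per-task work goes here, e.g. updating a classid */
        }
}
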
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9043caedcd08..3825f669147b 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -186,7 +186,7 @@ static int read_priomap(struct seq_file *sf, void *v)
186} 186}
187 187
188static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, 188static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
189 const char *buffer) 189 char *buffer)
190{ 190{
191 char devname[IFNAMSIZ + 1]; 191 char devname[IFNAMSIZ + 1];
192 struct net_device *dev; 192 struct net_device *dev;
@@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css,
224 struct task_struct *p; 224 struct task_struct *p;
225 void *v = (void *)(unsigned long)css->cgroup->id; 225 void *v = (void *)(unsigned long)css->cgroup->id;
226 226
227 cgroup_taskset_for_each(p, css, tset) { 227 cgroup_taskset_for_each(p, tset) {
228 task_lock(p); 228 task_lock(p);
229 iterate_fd(p->files, 0, update_netprio, v); 229 iterate_fd(p->files, 0, update_netprio, v);
230 task_unlock(p); 230 task_unlock(p);
@@ -244,15 +244,12 @@ static struct cftype ss_files[] = {
244 { } /* terminate */ 244 { } /* terminate */
245}; 245};
246 246
247struct cgroup_subsys net_prio_subsys = { 247struct cgroup_subsys net_prio_cgrp_subsys = {
248 .name = "net_prio",
249 .css_alloc = cgrp_css_alloc, 248 .css_alloc = cgrp_css_alloc,
250 .css_online = cgrp_css_online, 249 .css_online = cgrp_css_online,
251 .css_free = cgrp_css_free, 250 .css_free = cgrp_css_free,
252 .attach = net_prio_attach, 251 .attach = net_prio_attach,
253 .subsys_id = net_prio_subsys_id,
254 .base_cftypes = ss_files, 252 .base_cftypes = ss_files,
255 .module = THIS_MODULE,
256}; 253};
257 254
258static int netprio_device_event(struct notifier_block *unused, 255static int netprio_device_event(struct notifier_block *unused,
@@ -283,37 +280,9 @@ static struct notifier_block netprio_device_notifier = {
283 280
284static int __init init_cgroup_netprio(void) 281static int __init init_cgroup_netprio(void)
285{ 282{
286 int ret;
287
288 ret = cgroup_load_subsys(&net_prio_subsys);
289 if (ret)
290 goto out;
291
292 register_netdevice_notifier(&netprio_device_notifier); 283 register_netdevice_notifier(&netprio_device_notifier);
293 284 return 0;
294out:
295 return ret;
296}
297
298static void __exit exit_cgroup_netprio(void)
299{
300 struct netprio_map *old;
301 struct net_device *dev;
302
303 unregister_netdevice_notifier(&netprio_device_notifier);
304
305 cgroup_unload_subsys(&net_prio_subsys);
306
307 rtnl_lock();
308 for_each_netdev(&init_net, dev) {
309 old = rtnl_dereference(dev->priomap);
310 RCU_INIT_POINTER(dev->priomap, NULL);
311 if (old)
312 kfree_rcu(old, rcu);
313 }
314 rtnl_unlock();
315} 285}
316 286
317module_init(init_cgroup_netprio); 287subsys_initcall(init_cgroup_netprio);
318module_exit(exit_cgroup_netprio);
319MODULE_LICENSE("GPL v2"); 288MODULE_LICENSE("GPL v2");
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index f7e522c558ba..d4f015ad6c84 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -103,7 +103,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
103} 103}
104 104
105static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 105static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
106 const char *buffer) 106 char *buffer)
107{ 107{
108 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 108 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
109 unsigned long long val; 109 unsigned long long val;
@@ -219,7 +219,7 @@ static struct cftype tcp_files[] = {
219 219
220static int __init tcp_memcontrol_init(void) 220static int __init tcp_memcontrol_init(void)
221{ 221{
222 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); 222 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));
223 return 0; 223 return 0;
224} 224}
225__initcall(tcp_memcontrol_init); 225__initcall(tcp_memcontrol_init);
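tcp_cgroup_write(), like the other handlers converted in this diff, now receives a writable char *buffer. Assuming the handler is wired up through the cftype .write_string callback, as these files do, the non-const buffer lets parsers modify the input in place; a minimal sketch with illustrative names:

#include <linux/cgroup.h>
#include <linux/string.h>

static int my_write(struct cgroup_subsys_state *css, struct cftype *cft,
                    char *buf)
{
        char *key;

        /* the buffer is writable, so in-place tokenizers such as strsep() work */
        key = strsep(&buf, " ");
        if (!key || !*key)
                return -EINVAL;
        /* parse and apply the value here */
        return 0;
}

static struct cftype my_files[] = {
        {
                .name           = "my_knob",
                .write_string   = my_write,
        },
        { }     /* terminate */
};
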
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index d3b6d2cd3a06..8365909f5f8c 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -58,11 +58,9 @@ static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
58 58
59static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) 59static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
60{ 60{
61 return css_to_devcgroup(task_css(task, devices_subsys_id)); 61 return css_to_devcgroup(task_css(task, devices_cgrp_id));
62} 62}
63 63
64struct cgroup_subsys devices_subsys;
65
66/* 64/*
67 * called under devcgroup_mutex 65 * called under devcgroup_mutex
68 */ 66 */
@@ -498,7 +496,7 @@ static inline bool has_children(struct dev_cgroup *devcgroup)
498 * parent cgroup has the access you're asking for. 496 * parent cgroup has the access you're asking for.
499 */ 497 */
500static int devcgroup_update_access(struct dev_cgroup *devcgroup, 498static int devcgroup_update_access(struct dev_cgroup *devcgroup,
501 int filetype, const char *buffer) 499 int filetype, char *buffer)
502{ 500{
503 const char *b; 501 const char *b;
504 char temp[12]; /* 11 + 1 characters needed for a u32 */ 502 char temp[12]; /* 11 + 1 characters needed for a u32 */
@@ -654,7 +652,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
654} 652}
655 653
656static int devcgroup_access_write(struct cgroup_subsys_state *css, 654static int devcgroup_access_write(struct cgroup_subsys_state *css,
657 struct cftype *cft, const char *buffer) 655 struct cftype *cft, char *buffer)
658{ 656{
659 int retval; 657 int retval;
660 658
@@ -684,13 +682,11 @@ static struct cftype dev_cgroup_files[] = {
684 { } /* terminate */ 682 { } /* terminate */
685}; 683};
686 684
687struct cgroup_subsys devices_subsys = { 685struct cgroup_subsys devices_cgrp_subsys = {
688 .name = "devices",
689 .css_alloc = devcgroup_css_alloc, 686 .css_alloc = devcgroup_css_alloc,
690 .css_free = devcgroup_css_free, 687 .css_free = devcgroup_css_free,
691 .css_online = devcgroup_online, 688 .css_online = devcgroup_online,
692 .css_offline = devcgroup_offline, 689 .css_offline = devcgroup_offline,
693 .subsys_id = devices_subsys_id,
694 .base_cftypes = dev_cgroup_files, 690 .base_cftypes = dev_cgroup_files,
695}; 691};
696 692
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index b4f9507ae650..ba1a93f935c7 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -59,6 +59,22 @@ static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group)
59 symbol_put(vfio_group_put_external_user); 59 symbol_put(vfio_group_put_external_user);
60} 60}
61 61
62static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group)
63{
64 long (*fn)(struct vfio_group *, unsigned long);
65 long ret;
66
67 fn = symbol_get(vfio_external_check_extension);
68 if (!fn)
69 return false;
70
71 ret = fn(vfio_group, VFIO_DMA_CC_IOMMU);
72
73 symbol_put(vfio_external_check_extension);
74
75 return ret > 0;
76}
77
62/* 78/*
63 * Groups can use the same or different IOMMU domains. If the same then 79 * Groups can use the same or different IOMMU domains. If the same then
64 * adding a new group may change the coherency of groups we've previously 80 * adding a new group may change the coherency of groups we've previously
@@ -75,13 +91,10 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev)
75 mutex_lock(&kv->lock); 91 mutex_lock(&kv->lock);
76 92
77 list_for_each_entry(kvg, &kv->group_list, node) { 93 list_for_each_entry(kvg, &kv->group_list, node) {
78 /* 94 if (!kvm_vfio_group_is_coherent(kvg->vfio_group)) {
79 * TODO: We need an interface to check the coherency of 95 noncoherent = true;
80 * the IOMMU domain this group is using. For now, assume 96 break;
81 * it's always noncoherent. 97 }
82 */
83 noncoherent = true;
84 break;
85 } 98 }
86 99
87 if (noncoherent != kv->noncoherent) { 100 if (noncoherent != kv->noncoherent) {