-rw-r--r--Documentation/RCU/RTFP.txt77
-rw-r--r--Documentation/RCU/UP.txt34
-rw-r--r--Documentation/RCU/checklist.txt20
-rw-r--r--Documentation/RCU/rcu.txt10
-rw-r--r--Documentation/RCU/rcubarrier.txt7
-rw-r--r--Documentation/RCU/torture.txt23
-rw-r--r--Documentation/RCU/trace.txt7
-rw-r--r--Documentation/RCU/whatisRCU.txt22
-rw-r--r--Documentation/feature-removal-schedule.txt27
-rw-r--r--Documentation/kernel-parameters.txt5
-rw-r--r--Documentation/trace/events.txt9
-rw-r--r--Documentation/trace/ftrace.txt68
-rw-r--r--Documentation/trace/function-graph-fold.vim42
-rw-r--r--Documentation/trace/ring-buffer-design.txt955
-rw-r--r--arch/Kconfig12
-rw-r--r--arch/ia64/include/asm/dma-mapping.h19
-rw-r--r--arch/ia64/xen/time.c3
-rw-r--r--arch/m68k/include/asm/entry_mm.h4
-rw-r--r--arch/m68k/include/asm/entry_no.h8
-rw-r--r--arch/m68k/include/asm/math-emu.h20
-rw-r--r--arch/m68k/include/asm/thread_info_mm.h11
-rw-r--r--arch/m68k/kernel/asm-offsets.c39
-rw-r--r--arch/m68k/kernel/entry.S22
-rw-r--r--arch/m68k/math-emu/fp_entry.S38
-rw-r--r--arch/powerpc/include/asm/dma-mapping.h23
-rw-r--r--arch/powerpc/include/asm/pgtable.h6
-rw-r--r--arch/powerpc/include/asm/spinlock.h20
-rw-r--r--arch/powerpc/kernel/Makefile2
-rw-r--r--arch/powerpc/kernel/asm-offsets.c2
-rw-r--r--arch/powerpc/kernel/dma-swiotlb.c48
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S19
-rw-r--r--arch/powerpc/kernel/perf_callchain.c527
-rw-r--r--arch/powerpc/mm/slb.c37
-rw-r--r--arch/powerpc/mm/stab.c11
-rw-r--r--arch/s390/Kconfig2
-rw-r--r--arch/s390/defconfig2
-rw-r--r--arch/s390/include/asm/spinlock.h29
-rw-r--r--arch/s390/include/asm/thread_info.h4
-rw-r--r--arch/s390/kernel/entry.S2
-rw-r--r--arch/s390/kernel/entry64.S2
-rw-r--r--arch/s390/kernel/ftrace.c36
-rw-r--r--arch/s390/kernel/ptrace.c11
-rw-r--r--arch/sparc/Kconfig2
-rw-r--r--arch/sparc/include/asm/dma-mapping.h145
-rw-r--r--arch/sparc/include/asm/irq_64.h4
-rw-r--r--arch/sparc/include/asm/pci.h3
-rw-r--r--arch/sparc/include/asm/pci_32.h105
-rw-r--r--arch/sparc/include/asm/pci_64.h88
-rw-r--r--arch/sparc/include/asm/spinlock_32.h12
-rw-r--r--arch/sparc/include/asm/spinlock_64.h28
-rw-r--r--arch/sparc/kernel/Makefile2
-rw-r--r--arch/sparc/kernel/dma.c175
-rw-r--r--arch/sparc/kernel/dma.h14
-rw-r--r--arch/sparc/kernel/iommu.c20
-rw-r--r--arch/sparc/kernel/ioport.c190
-rw-r--r--arch/sparc/kernel/pci.c2
-rw-r--r--arch/sparc/kernel/pci_sun4v.c30
-rw-r--r--arch/sparc/kernel/process_64.c4
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/configs/i386_defconfig2
-rw-r--r--arch/x86/configs/x86_64_defconfig2
-rw-r--r--arch/x86/include/asm/amd_iommu.h1
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h50
-rw-r--r--arch/x86/include/asm/dma-mapping.h18
-rw-r--r--arch/x86/include/asm/ftrace.h7
-rw-r--r--arch/x86/include/asm/nmi.h4
-rw-r--r--arch/x86/include/asm/perf_counter.h10
-rw-r--r--arch/x86/include/asm/thread_info.h13
-rw-r--r--arch/x86/include/asm/topology.h47
-rw-r--r--arch/x86/include/asm/unistd_32.h2
-rw-r--r--arch/x86/include/asm/unistd_64.h6
-rw-r--r--arch/x86/kernel/amd_iommu.c489
-rw-r--r--arch/x86/kernel/amd_iommu_init.c42
-rw-r--r--arch/x86/kernel/apic/nmi.c20
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c329
-rw-r--r--arch/x86/kernel/ftrace.c51
-rw-r--r--arch/x86/kernel/pci-dma.c11
-rw-r--r--arch/x86/kernel/pci-gart_64.c5
-rw-r--r--arch/x86/kernel/pci-nommu.c29
-rw-r--r--arch/x86/kernel/pci-swiotlb.c25
-rw-r--r--arch/x86/kernel/ptrace.c13
-rw-r--r--arch/x86/kernel/sys_x86_64.c8
-rw-r--r--arch/x86/oprofile/nmi_int.c404
-rw-r--r--arch/x86/oprofile/op_counter.h2
-rw-r--r--arch/x86/oprofile/op_model_amd.c372
-rw-r--r--arch/x86/oprofile/op_model_p4.c72
-rw-r--r--arch/x86/oprofile/op_model_ppro.c101
-rw-r--r--arch/x86/oprofile/op_x86_model.h59
-rw-r--r--arch/x86/pci/direct.c5
-rw-r--r--drivers/acpi/blacklist.c5
-rw-r--r--drivers/ata/Kconfig21
-rw-r--r--drivers/ata/Makefile1
-rw-r--r--drivers/ata/ahci.c143
-rw-r--r--drivers/ata/libata-acpi.c7
-rw-r--r--drivers/ata/libata-core.c44
-rw-r--r--drivers/ata/libata-eh.c146
-rw-r--r--drivers/ata/libata-pmp.c2
-rw-r--r--drivers/ata/libata-scsi.c159
-rw-r--r--drivers/ata/libata.h1
-rw-r--r--drivers/ata/pata_atiixp.c1
-rw-r--r--drivers/ata/pata_cs5535.c3
-rw-r--r--drivers/ata/pata_octeon_cf.c4
-rw-r--r--drivers/ata/pata_platform.c8
-rw-r--r--drivers/ata/pata_rb532_cf.c2
-rw-r--r--drivers/ata/pata_rdc.c400
-rw-r--r--drivers/ata/pata_rz1000.c4
-rw-r--r--drivers/ata/sata_fsl.c1
-rw-r--r--drivers/ata/sata_inic162x.c2
-rw-r--r--drivers/ata/sata_mv.c2
-rw-r--r--drivers/ata/sata_sil.c13
-rw-r--r--drivers/ata/sata_sil24.c11
-rw-r--r--drivers/ata/sata_sis.c75
-rw-r--r--drivers/char/sysrq.c19
-rw-r--r--drivers/firmware/dmi_scan.c77
-rw-r--r--drivers/ide/atiixp.c1
-rw-r--r--drivers/oprofile/cpu_buffer.c16
-rw-r--r--drivers/oprofile/oprof.c71
-rw-r--r--drivers/oprofile/oprof.h3
-rw-r--r--drivers/oprofile/oprofile_files.c46
-rw-r--r--drivers/oprofile/oprofile_stats.c5
-rw-r--r--drivers/oprofile/oprofile_stats.h1
-rw-r--r--drivers/pci/intr_remapping.c14
-rw-r--r--drivers/pci/quirks.c4
-rw-r--r--fs/dcache.c1
-rw-r--r--fs/locks.c2
-rw-r--r--include/asm-generic/dma-mapping-common.h6
-rw-r--r--include/linux/ata.h36
-rw-r--r--include/linux/cpu.h17
-rw-r--r--include/linux/dma-mapping.h5
-rw-r--r--include/linux/dmi.h13
-rw-r--r--include/linux/ftrace_event.h51
-rw-r--r--include/linux/hardirq.h10
-rw-r--r--include/linux/init_task.h11
-rw-r--r--include/linux/interrupt.h4
-rw-r--r--include/linux/irq.h18
-rw-r--r--include/linux/irqnr.h6
-rw-r--r--include/linux/kernel.h5
-rw-r--r--include/linux/libata.h3
-rw-r--r--include/linux/lockdep.h18
-rw-r--r--include/linux/module.h14
-rw-r--r--include/linux/nmi.h19
-rw-r--r--include/linux/oprofile.h5
-rw-r--r--include/linux/pagemap.h4
-rw-r--r--include/linux/pci_ids.h4
-rw-r--r--include/linux/perf_counter.h7
-rw-r--r--include/linux/rcuclassic.h178
-rw-r--r--include/linux/rcupdate.h98
-rw-r--r--include/linux/rcupreempt.h127
-rw-r--r--include/linux/rcupreempt_trace.h97
-rw-r--r--include/linux/rcutree.h262
-rw-r--r--include/linux/ring_buffer.h24
-rw-r--r--include/linux/sched.h126
-rw-r--r--include/linux/spinlock.h64
-rw-r--r--include/linux/spinlock_api_smp.h394
-rw-r--r--include/linux/swiotlb.h11
-rw-r--r--include/linux/syscalls.h131
-rw-r--r--include/linux/topology.h168
-rw-r--r--include/linux/tracepoint.h29
-rw-r--r--include/trace/define_trace.h7
-rw-r--r--include/trace/events/module.h126
-rw-r--r--include/trace/events/sched.h107
-rw-r--r--include/trace/events/syscalls.h70
-rw-r--r--include/trace/ftrace.h93
-rw-r--r--include/trace/syscall.h48
-rw-r--r--init/Kconfig46
-rw-r--r--init/main.c4
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/futex.c47
-rw-r--r--kernel/irq/chip.c74
-rw-r--r--kernel/irq/handle.c5
-rw-r--r--kernel/irq/internals.h13
-rw-r--r--kernel/irq/manage.c102
-rw-r--r--kernel/irq/pm.c8
-rw-r--r--kernel/irq/resend.c3
-rw-r--r--kernel/irq/spurious.c1
-rw-r--r--kernel/kmod.c4
-rw-r--r--kernel/kprobes.c30
-rw-r--r--kernel/kthread.c4
-rw-r--r--kernel/lockdep.c792
-rw-r--r--kernel/lockdep_internals.h2
-rw-r--r--kernel/lockdep_proc.c128
-rw-r--r--kernel/module.c11
-rw-r--r--kernel/perf_counter.c173
-rw-r--r--kernel/printk.c175
-rw-r--r--kernel/rcuclassic.c807
-rw-r--r--kernel/rcupdate.c44
-rw-r--r--kernel/rcupreempt.c1539
-rw-r--r--kernel/rcupreempt_trace.c334
-rw-r--r--kernel/rcutorture.c202
-rw-r--r--kernel/rcutree.c280
-rw-r--r--kernel/rcutree.h253
-rw-r--r--kernel/rcutree_plugin.h532
-rw-r--r--kernel/rcutree_trace.c88
-rw-r--r--kernel/sched.c1232
-rw-r--r--kernel/sched_cpupri.c30
-rw-r--r--kernel/sched_debug.c4
-rw-r--r--kernel/sched_fair.c84
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/sched_rt.c62
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/spinlock.c230
-rw-r--r--kernel/sysctl.c24
-rw-r--r--kernel/timer.c3
-rw-r--r--kernel/trace/Kconfig13
-rw-r--r--kernel/trace/blktrace.c12
-rw-r--r--kernel/trace/ftrace.c107
-rw-r--r--kernel/trace/kmemtrace.c149
-rw-r--r--kernel/trace/ring_buffer.c1112
-rw-r--r--kernel/trace/trace.c679
-rw-r--r--kernel/trace/trace.h76
-rw-r--r--kernel/trace/trace_boot.c16
-rw-r--r--kernel/trace/trace_events.c146
-rw-r--r--kernel/trace/trace_events_filter.c261
-rw-r--r--kernel/trace/trace_export.c28
-rw-r--r--kernel/trace/trace_functions.c4
-rw-r--r--kernel/trace/trace_functions_graph.c166
-rw-r--r--kernel/trace/trace_irqsoff.c3
-rw-r--r--kernel/trace/trace_mmiotrace.c10
-rw-r--r--kernel/trace/trace_power.c22
-rw-r--r--kernel/trace/trace_sched_switch.c59
-rw-r--r--kernel/trace/trace_sched_wakeup.c7
-rw-r--r--kernel/trace/trace_selftest.c1
-rw-r--r--kernel/trace/trace_stack.c43
-rw-r--r--kernel/trace/trace_stat.c17
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_syscalls.c471
-rw-r--r--kernel/trace/trace_workqueue.c32
-rw-r--r--kernel/tracepoint.c50
-rw-r--r--kernel/workqueue.c9
-rw-r--r--lib/Kconfig.debug2
-rw-r--r--lib/swiotlb.c124
-rwxr-xr-xscripts/recordmcount.pl1
-rw-r--r--tools/perf/Documentation/perf-record.txt4
-rw-r--r--tools/perf/Documentation/perf-report.txt13
-rw-r--r--tools/perf/Makefile39
-rw-r--r--tools/perf/builtin-annotate.c472
-rw-r--r--tools/perf/builtin-help.c1
-rw-r--r--tools/perf/builtin-record.c38
-rw-r--r--tools/perf/builtin-report.c719
-rw-r--r--tools/perf/builtin-stat.c239
-rw-r--r--tools/perf/builtin-top.c66
-rw-r--r--tools/perf/builtin-trace.c297
-rw-r--r--tools/perf/builtin.h1
-rw-r--r--tools/perf/perf.c1
-rw-r--r--tools/perf/util/abspath.c3
-rw-r--r--tools/perf/util/cache.h1
-rw-r--r--tools/perf/util/callchain.c2
-rw-r--r--tools/perf/util/callchain.h1
-rw-r--r--tools/perf/util/color.c16
-rw-r--r--tools/perf/util/color.h3
-rw-r--r--tools/perf/util/config.c22
-rw-r--r--tools/perf/util/debug.c95
-rw-r--r--tools/perf/util/debug.h8
-rw-r--r--tools/perf/util/event.h96
-rw-r--r--tools/perf/util/exec_cmd.c1
-rw-r--r--tools/perf/util/header.c37
-rw-r--r--tools/perf/util/header.h4
-rw-r--r--tools/perf/util/map.c97
-rw-r--r--tools/perf/util/module.c4
-rw-r--r--tools/perf/util/parse-events.c147
-rw-r--r--tools/perf/util/parse-events.h17
-rw-r--r--tools/perf/util/parse-options.c22
-rw-r--r--tools/perf/util/path.c25
-rw-r--r--tools/perf/util/run-command.c6
-rw-r--r--tools/perf/util/symbol.c199
-rw-r--r--tools/perf/util/symbol.h14
-rw-r--r--tools/perf/util/thread.c175
-rw-r--r--tools/perf/util/thread.h21
-rw-r--r--tools/perf/util/trace-event-info.c539
-rw-r--r--tools/perf/util/trace-event-parse.c2942
-rw-r--r--tools/perf/util/trace-event-read.c512
-rw-r--r--tools/perf/util/trace-event.h240
-rw-r--r--tools/perf/util/util.h6
-rw-r--r--tools/perf/util/values.c230
-rw-r--r--tools/perf/util/values.h27
278 files changed, 18033 insertions, 9400 deletions
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 9f711d2df91b..d2b85237c76e 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -743,3 +743,80 @@ Revised:
743 RCU, realtime RCU, sleepable RCU, performance. 743 RCU, realtime RCU, sleepable RCU, performance.
744" 744"
745} 745}
746
747@article{PaulEMcKenney2008RCUOSR
748,author="Paul E. McKenney and Jonathan Walpole"
749,title="Introducing technology into the {Linux} kernel: a case study"
750,Year="2008"
751,journal="SIGOPS Oper. Syst. Rev."
752,volume="42"
753,number="5"
754,pages="4--17"
755,issn="0163-5980"
756,doi={http://doi.acm.org/10.1145/1400097.1400099}
757,publisher="ACM"
758,address="New York, NY, USA"
759,annotation={
760 Linux changed RCU to a far greater degree than RCU has changed Linux.
761}
762}
763
764@unpublished{PaulEMcKenney2008HierarchicalRCU
765,Author="Paul E. McKenney"
766,Title="Hierarchical {RCU}"
767,month="November"
768,day="3"
769,year="2008"
770,note="Available:
771\url{http://lwn.net/Articles/305782/}
772[Viewed November 6, 2008]"
773,annotation="
774 RCU with combining-tree-based grace-period detection,
775 permitting it to handle thousands of CPUs.
776"
777}
778
779@conference{PaulEMcKenney2009MaliciousURCU
780,Author="Paul E. McKenney"
781,Title="Using a Malicious User-Level {RCU} to Torture {RCU}-Based Algorithms"
782,Booktitle="linux.conf.au 2009"
783,month="January"
784,year="2009"
785,address="Hobart, Australia"
786,note="Available:
787\url{http://www.rdrop.com/users/paulmck/RCU/urcutorture.2009.01.22a.pdf}
788[Viewed February 2, 2009]"
789,annotation="
790 Realtime RCU and torture-testing RCU uses.
791"
792}
793
794@unpublished{MathieuDesnoyers2009URCU
795,Author="Mathieu Desnoyers"
796,Title="[{RFC} git tree] Userspace {RCU} (urcu) for {Linux}"
797,month="February"
798,day="5"
799,year="2009"
800,note="Available:
801\url{http://lkml.org/lkml/2009/2/5/572}
802\url{git://lttng.org/userspace-rcu.git}
803[Viewed February 20, 2009]"
804,annotation="
805 Mathieu Desnoyers's user-space RCU implementation.
806 git://lttng.org/userspace-rcu.git
807"
808}
809
810@unpublished{PaulEMcKenney2009BloatWatchRCU
811,Author="Paul E. McKenney"
812,Title="{RCU}: The {Bloatwatch} Edition"
813,month="March"
814,day="17"
815,year="2009"
816,note="Available:
817\url{http://lwn.net/Articles/323929/}
818[Viewed March 20, 2009]"
819,annotation="
820 Uniprocessor assumptions allow simplified RCU implementation.
821"
822}
diff --git a/Documentation/RCU/UP.txt b/Documentation/RCU/UP.txt
index aab4a9ec3931..90ec5341ee98 100644
--- a/Documentation/RCU/UP.txt
+++ b/Documentation/RCU/UP.txt
@@ -2,14 +2,13 @@ RCU on Uniprocessor Systems
2 2
3 3
4A common misconception is that, on UP systems, the call_rcu() primitive 4A common misconception is that, on UP systems, the call_rcu() primitive
5may immediately invoke its function, and that the synchronize_rcu() 5may immediately invoke its function. The basis of this misconception
6primitive may return immediately. The basis of this misconception
7is that since there is only one CPU, it should not be necessary to 6is that since there is only one CPU, it should not be necessary to
8wait for anything else to get done, since there are no other CPUs for 7wait for anything else to get done, since there are no other CPUs for
9anything else to be happening on. Although this approach will -sort- -of- 8anything else to be happening on. Although this approach will -sort- -of-
10work a surprising amount of the time, it is a very bad idea in general. 9work a surprising amount of the time, it is a very bad idea in general.
11This document presents three examples that demonstrate exactly how bad an 10This document presents three examples that demonstrate exactly how bad
12idea this is. 11an idea this is.
13 12
14 13
15Example 1: softirq Suicide 14Example 1: softirq Suicide
@@ -82,11 +81,18 @@ Quick Quiz #2: What locking restriction must RCU callbacks respect?
82 81
83Summary 82Summary
84 83
85Permitting call_rcu() to immediately invoke its arguments or permitting 84Permitting call_rcu() to immediately invoke its arguments breaks RCU,
86synchronize_rcu() to immediately return breaks RCU, even on a UP system. 85even on a UP system. So do not do it! Even on a UP system, the RCU
87So do not do it! Even on a UP system, the RCU infrastructure -must- 86infrastructure -must- respect grace periods, and -must- invoke callbacks
88respect grace periods, and -must- invoke callbacks from a known environment 87from a known environment in which no locks are held.
89in which no locks are held. 88
89It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return
90immediately on an UP system. It is also safe for synchronize_rcu()
91to return immediately on UP systems, except when running preemptable
92RCU.
93
94Quick Quiz #3: Why can't synchronize_rcu() return immediately on
95 UP systems running preemptable RCU?
90 96
91 97
92Answer to Quick Quiz #1: 98Answer to Quick Quiz #1:
@@ -117,3 +123,13 @@ Answer to Quick Quiz #2:
117 callbacks acquire locks directly. However, a great many RCU 123 callbacks acquire locks directly. However, a great many RCU
118 callbacks do acquire locks -indirectly-, for example, via 124 callbacks do acquire locks -indirectly-, for example, via
119 the kfree() primitive. 125 the kfree() primitive.
126
127Answer to Quick Quiz #3:
128 Why can't synchronize_rcu() return immediately on UP systems
129 running preemptable RCU?
130
131 Because some other task might have been preempted in the middle
132 of an RCU read-side critical section. If synchronize_rcu()
133 simply immediately returned, it would prematurely signal the
134 end of the grace period, which would come as a nasty shock to
135 that other thread when it started running again.
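
To make the callback environment concrete, here is a minimal sketch of the usual call_rcu() pattern (struct foo, foo_update(), and foo_reclaim() are hypothetical names): the callback is only queued, and is later invoked from softirq context with no locks held, never synchronously from within call_rcu() itself, even on UP.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        int data;
        struct rcu_head rcu;
};

static struct foo *global_foo;

/* Invoked only after a grace period has elapsed -- even on UP. */
static void foo_reclaim(struct rcu_head *head)
{
        kfree(container_of(head, struct foo, rcu));
}

static void foo_update(int new_data)
{
        struct foo *new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
        struct foo *old_fp = global_foo;

        if (!new_fp)
                return;
        new_fp->data = new_data;
        rcu_assign_pointer(global_foo, new_fp);

        /* Must only queue the callback, never run foo_reclaim() here. */
        call_rcu(&old_fp->rcu, foo_reclaim);
}
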
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index accfe2f5247d..51525a30e8b4 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -11,7 +11,10 @@ over a rather long period of time, but improvements are always welcome!
11 structure is updated more than about 10% of the time, then 11 structure is updated more than about 10% of the time, then
12 you should strongly consider some other approach, unless 12 you should strongly consider some other approach, unless
13 detailed performance measurements show that RCU is nonetheless 13 detailed performance measurements show that RCU is nonetheless
14 the right tool for the job. 14 the right tool for the job. Yes, you might think of RCU
15 as simply cutting overhead off of the readers and imposing it
16 on the writers. That is exactly why normal uses of RCU will
17 do much more reading than updating.
15 18
16 Another exception is where performance is not an issue, and RCU 19 Another exception is where performance is not an issue, and RCU
17 provides a simpler implementation. An example of this situation 20 provides a simpler implementation. An example of this situation
@@ -240,10 +243,11 @@ over a rather long period of time, but improvements are always welcome!
240 instead need to use synchronize_irq() or synchronize_sched(). 243 instead need to use synchronize_irq() or synchronize_sched().
241 244
24212. Any lock acquired by an RCU callback must be acquired elsewhere 24512. Any lock acquired by an RCU callback must be acquired elsewhere
243 with irq disabled, e.g., via spin_lock_irqsave(). Failing to 246 with softirq disabled, e.g., via spin_lock_irqsave(),
244 disable irq on a given acquisition of that lock will result in 247 spin_lock_bh(), etc. Failing to disable irq on a given
245 deadlock as soon as the RCU callback happens to interrupt that 248 acquisition of that lock will result in deadlock as soon as the
246 acquisition's critical section. 249 RCU callback happens to interrupt that acquisition's critical
250 section.
247 251
24813. RCU callbacks can be and are executed in parallel. In many cases, 25213. RCU callbacks can be and are executed in parallel. In many cases,
249 the callback code simply wrappers around kfree(), so that this 253 the callback code simply wrappers around kfree(), so that this
@@ -310,3 +314,9 @@ over a rather long period of time, but improvements are always welcome!
310 Because these primitives only wait for pre-existing readers, 314 Because these primitives only wait for pre-existing readers,
311 it is the caller's responsibility to guarantee safety to 315 it is the caller's responsibility to guarantee safety to
312 any subsequent readers. 316 any subsequent readers.
317
31816. The various RCU read-side primitives do -not- contain memory
319 barriers. The CPU (and in some cases, the compiler) is free
320 to reorder code into and out of RCU read-side critical sections.
321 It is the responsibility of the RCU update-side primitives to
322 deal with this.
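
As a concrete illustration of item 12 above (hypothetical foo_lock and callers): a lock taken inside an RCU callback is taken from softirq context, so every other acquisition of that lock must disable at least bottom halves.

#include <linux/spinlock.h>
#include <linux/rcupdate.h>

static DEFINE_SPINLOCK(foo_lock);

/* RCU callback: runs in softirq context, so plain spin_lock() suffices. */
static void foo_rcu_callback(struct rcu_head *head)
{
        spin_lock(&foo_lock);
        /* ... queue the element for later reuse ... */
        spin_unlock(&foo_lock);
}

/* Process-context user of the same lock. */
static void foo_use_list(void)
{
        /*
         * A bare spin_lock() here could deadlock if foo_rcu_callback()
         * interrupted this critical section; disabling BH (or irqs)
         * prevents that.
         */
        spin_lock_bh(&foo_lock);
        /* ... */
        spin_unlock_bh(&foo_lock);
}
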
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 7aa2002ade77..2a23523ce471 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -36,7 +36,7 @@ o How can the updater tell when a grace period has completed
36 executed in user mode, or executed in the idle loop, we can 36 executed in user mode, or executed in the idle loop, we can
37 safely free up that item. 37 safely free up that item.
38 38
39 Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the 39 Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the
40 same effect, but require that the readers manipulate CPU-local 40 same effect, but require that the readers manipulate CPU-local
41 counters. These counters allow limited types of blocking 41 counters. These counters allow limited types of blocking
42 within RCU read-side critical sections. SRCU also uses 42 within RCU read-side critical sections. SRCU also uses
@@ -79,10 +79,10 @@ o I hear that RCU is patented? What is with that?
79o I hear that RCU needs work in order to support realtime kernels? 79o I hear that RCU needs work in order to support realtime kernels?
80 80
81 This work is largely completed. Realtime-friendly RCU can be 81 This work is largely completed. Realtime-friendly RCU can be
82 enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter. 82 enabled via the CONFIG_TREE_PREEMPT_RCU kernel configuration
83 However, work is in progress for enabling priority boosting of 83 parameter. However, work is in progress for enabling priority
84 preempted RCU read-side critical sections. This is needed if you 84 boosting of preempted RCU read-side critical sections. This is
85 have CPU-bound realtime threads. 85 needed if you have CPU-bound realtime threads.
86 86
87o Where can I find more information on RCU? 87o Where can I find more information on RCU?
88 88
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt
index 909602d409bb..e439a0edee22 100644
--- a/Documentation/RCU/rcubarrier.txt
+++ b/Documentation/RCU/rcubarrier.txt
@@ -170,6 +170,13 @@ module invokes call_rcu() from timers, you will need to first cancel all
170the timers, and only then invoke rcu_barrier() to wait for any remaining 170the timers, and only then invoke rcu_barrier() to wait for any remaining
171RCU callbacks to complete. 171RCU callbacks to complete.
172 172
173Of course, if your module uses call_rcu_bh(), you will need to invoke
174rcu_barrier_bh() before unloading. Similarly, if your module uses
175call_rcu_sched(), you will need to invoke rcu_barrier_sched() before
176unloading. If your module uses call_rcu(), call_rcu_bh(), -and-
177call_rcu_sched(), then you will need to invoke each of rcu_barrier(),
178rcu_barrier_bh(), and rcu_barrier_sched().
179
173 180
174Implementing rcu_barrier() 181Implementing rcu_barrier()
175 182
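
A sketch of the unload rule just added, for a hypothetical module that uses all three call_rcu() flavors: first stop posting new callbacks, then wait for each flavor before the module text goes away.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/rcupdate.h>

static void __exit foo_exit(void)
{
        /* First prevent any new callbacks from being posted (timers, etc.). */

        rcu_barrier();          /* wait for outstanding call_rcu() callbacks */
        rcu_barrier_bh();       /* wait for call_rcu_bh() callbacks */
        rcu_barrier_sched();    /* wait for call_rcu_sched() callbacks */
}
module_exit(foo_exit);
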
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index a342b6e1cc10..9dba3bb90e60 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -76,8 +76,10 @@ torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API,
76 "rcu_sync" for rcu_read_lock() with synchronous reclamation, 76 "rcu_sync" for rcu_read_lock() with synchronous reclamation,
77 "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for 77 "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for
78 rcu_read_lock_bh() with synchronous reclamation, "srcu" for 78 rcu_read_lock_bh() with synchronous reclamation, "srcu" for
79 the "srcu_read_lock()" API, and "sched" for the use of 79 the "srcu_read_lock()" API, "sched" for the use of
80 preempt_disable() together with synchronize_sched(). 80 preempt_disable() together with synchronize_sched(),
81 and "sched_expedited" for the use of preempt_disable()
82 with synchronize_sched_expedited().
81 83
82verbose Enable debug printk()s. Default is disabled. 84verbose Enable debug printk()s. Default is disabled.
83 85
@@ -162,6 +164,23 @@ of the "old" and "current" counters for the corresponding CPU. The
162"idx" value maps the "old" and "current" values to the underlying array, 164"idx" value maps the "old" and "current" values to the underlying array,
163and is useful for debugging. 165and is useful for debugging.
164 166
167Similarly, sched_expedited RCU provides the following:
168
169 sched_expedited-torture: rtc: d0000000016c1880 ver: 1090796 tfle: 0 rta: 1090796 rtaf: 0 rtf: 1090787 rtmbe: 0 nt: 27713319
170 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0
171 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0
172 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
173 state: -1 / 0:0 3:0 4:0
174
175As before, the first four lines are similar to those for RCU.
176The last line shows the task-migration state. The first number is
177-1 if synchronize_sched_expedited() is idle, -2 if in the process of
178posting wakeups to the migration kthreads, and N when waiting on CPU N.
179Each of the colon-separated fields following the "/" is a CPU:state pair.
180Valid states are "0" for idle, "1" for waiting for quiescent state,
181"2" for passed through quiescent state, and "3" when a race with a
182CPU-hotplug event forces use of the synchronize_sched() primitive.
183
165 184
166USAGE 185USAGE
167 186
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 02cced183b2d..187bbf10c923 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -191,8 +191,7 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy).
191 191
192The output of "cat rcu/rcudata" looks as follows: 192The output of "cat rcu/rcudata" looks as follows:
193 193
194rcu: 194rcu_sched:
195rcu:
196 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 195 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
197 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 196 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
198 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 197 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
@@ -306,7 +305,7 @@ comma-separated-variable spreadsheet format.
306 305
307The output of "cat rcu/rcugp" looks as follows: 306The output of "cat rcu/rcugp" looks as follows:
308 307
309rcu: completed=33062 gpnum=33063 308rcu_sched: completed=33062 gpnum=33063
310rcu_bh: completed=464 gpnum=464 309rcu_bh: completed=464 gpnum=464
311 310
312Again, this output is for both "rcu" and "rcu_bh". The fields are 311Again, this output is for both "rcu" and "rcu_bh". The fields are
@@ -413,7 +412,7 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
413 412
414The output of "cat rcu/rcu_pending" looks as follows: 413The output of "cat rcu/rcu_pending" looks as follows:
415 414
416rcu: 415rcu_sched:
417 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 416 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
418 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 417 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
419 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 418 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 96170824a717..e41a7fecf0d3 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -136,10 +136,10 @@ rcu_read_lock()
136 Used by a reader to inform the reclaimer that the reader is 136 Used by a reader to inform the reclaimer that the reader is
137 entering an RCU read-side critical section. It is illegal 137 entering an RCU read-side critical section. It is illegal
138 to block while in an RCU read-side critical section, though 138 to block while in an RCU read-side critical section, though
139 kernels built with CONFIG_PREEMPT_RCU can preempt RCU read-side 139 kernels built with CONFIG_TREE_PREEMPT_RCU can preempt RCU
140 critical sections. Any RCU-protected data structure accessed 140 read-side critical sections. Any RCU-protected data structure
141 during an RCU read-side critical section is guaranteed to remain 141 accessed during an RCU read-side critical section is guaranteed to
142 unreclaimed for the full duration of that critical section. 142 remain unreclaimed for the full duration of that critical section.
143 Reference counts may be used in conjunction with RCU to maintain 143 Reference counts may be used in conjunction with RCU to maintain
144 longer-term references to data structures. 144 longer-term references to data structures.
145 145
@@ -785,6 +785,7 @@ RCU pointer/list traversal:
785 rcu_dereference 785 rcu_dereference
786 list_for_each_entry_rcu 786 list_for_each_entry_rcu
787 hlist_for_each_entry_rcu 787 hlist_for_each_entry_rcu
788 hlist_nulls_for_each_entry_rcu
788 789
789 list_for_each_continue_rcu (to be deprecated in favor of new 790 list_for_each_continue_rcu (to be deprecated in favor of new
790 list_for_each_entry_continue_rcu) 791 list_for_each_entry_continue_rcu)
@@ -807,19 +808,23 @@ RCU: Critical sections Grace period Barrier
807 808
808 rcu_read_lock synchronize_net rcu_barrier 809 rcu_read_lock synchronize_net rcu_barrier
809 rcu_read_unlock synchronize_rcu 810 rcu_read_unlock synchronize_rcu
811 synchronize_rcu_expedited
810 call_rcu 812 call_rcu
811 813
812 814
813bh: Critical sections Grace period Barrier 815bh: Critical sections Grace period Barrier
814 816
815 rcu_read_lock_bh call_rcu_bh rcu_barrier_bh 817 rcu_read_lock_bh call_rcu_bh rcu_barrier_bh
816 rcu_read_unlock_bh 818 rcu_read_unlock_bh synchronize_rcu_bh
819 synchronize_rcu_bh_expedited
817 820
818 821
819sched: Critical sections Grace period Barrier 822sched: Critical sections Grace period Barrier
820 823
821 [preempt_disable] synchronize_sched rcu_barrier_sched 824 rcu_read_lock_sched synchronize_sched rcu_barrier_sched
822 [and friends] call_rcu_sched 825 rcu_read_unlock_sched call_rcu_sched
826 [preempt_disable] synchronize_sched_expedited
827 [and friends]
823 828
824 829
825SRCU: Critical sections Grace period Barrier 830SRCU: Critical sections Grace period Barrier
@@ -827,6 +832,9 @@ SRCU: Critical sections Grace period Barrier
827 srcu_read_lock synchronize_srcu N/A 832 srcu_read_lock synchronize_srcu N/A
828 srcu_read_unlock 833 srcu_read_unlock
829 834
835SRCU: Initialization/cleanup
836 init_srcu_struct
837 cleanup_srcu_struct
830 838
831See the comment headers in the source code (or the docbook generated 839See the comment headers in the source code (or the docbook generated
832from them) for more information. 840from them) for more information.
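
The SRCU initialization/cleanup entries added to the table above fit into a lifetime roughly like the following sketch (the foo_* names are hypothetical):

#include <linux/srcu.h>

static struct srcu_struct foo_srcu;

static int foo_init(void)
{
        return init_srcu_struct(&foo_srcu);     /* Initialization */
}

static void foo_reader(void)
{
        int idx = srcu_read_lock(&foo_srcu);    /* Critical section */
        /* ... dereference SRCU-protected data ... */
        srcu_read_unlock(&foo_srcu, idx);
}

static void foo_updater(void)
{
        /* ... unlink the old element ... */
        synchronize_srcu(&foo_srcu);            /* Grace period */
        /* ... now safe to free it ... */
}

static void foo_cleanup(void)
{
        cleanup_srcu_struct(&foo_srcu);         /* Cleanup */
}
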
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index f0690bbbd73c..bb3a53cdfbc3 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -206,24 +206,6 @@ Who: Len Brown <len.brown@intel.com>
206 206
207--------------------------- 207---------------------------
208 208
209What: libata spindown skipping and warning
210When: Dec 2008
211Why: Some halt(8) implementations synchronize caches for and spin
212 down libata disks because libata didn't use to spin down disk on
213 system halt (only synchronized caches).
214 Spin down on system halt is now implemented. sysfs node
215 /sys/class/scsi_disk/h:c:i:l/manage_start_stop is present if
216 spin down support is available.
217 Because issuing spin down command to an already spun down disk
218 makes some disks spin up just to spin down again, libata tracks
219 device spindown status to skip the extra spindown command and
220 warn about it.
221 This is to give userspace tools the time to get updated and will
222 be removed after userspace is reasonably updated.
223Who: Tejun Heo <htejun@gmail.com>
224
225---------------------------
226
227What: i386/x86_64 bzImage symlinks 209What: i386/x86_64 bzImage symlinks
228When: April 2010 210When: April 2010
229 211
@@ -394,15 +376,6 @@ Who: Thomas Gleixner <tglx@linutronix.de>
394 376
395----------------------------- 377-----------------------------
396 378
397What: obsolete generic irq defines and typedefs
398When: 2.6.30
399Why: The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t)
400 have been kept around for migration reasons. After more than two years
401 it's time to remove them finally
402Who: Thomas Gleixner <tglx@linutronix.de>
403
404---------------------------
405
406What: fakephp and associated sysfs files in /sys/bus/pci/slots/ 379What: fakephp and associated sysfs files in /sys/bus/pci/slots/
407When: 2011 380When: 2011
408Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to 381Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index ce8853755814..5d4427d17281 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2509,6 +2509,11 @@ and is between 256 and 4096 characters. It is defined in the file
2509 trace_buf_size=nn[KMG] 2509 trace_buf_size=nn[KMG]
2510 [FTRACE] will set tracing buffer size. 2510 [FTRACE] will set tracing buffer size.
2511 2511
2512 trace_event=[event-list]
2513 [FTRACE] Set and start specified trace events in order
2514 to facilitate early boot debugging.
2515 See also Documentation/trace/events.txt
2516
2512 trix= [HW,OSS] MediaTrix AudioTrix Pro 2517 trix= [HW,OSS] MediaTrix AudioTrix Pro
2513 Format: 2518 Format:
2514 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq> 2519 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index f157d7594ea7..2bcc8d4dea29 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -83,6 +83,15 @@ When reading one of these enable files, there are four results:
83 X - there is a mixture of events enabled and disabled 83 X - there is a mixture of events enabled and disabled
84 ? - this file does not affect any event 84 ? - this file does not affect any event
85 85
862.3 Boot option
87---------------
88
89In order to facilitate early boot debugging, use boot option:
90
91 trace_event=[event-list]
92
93The format of this boot option is the same as described in section 2.1.
94
863. Defining an event-enabled tracepoint 953. Defining an event-enabled tracepoint
87======================================= 96=======================================
88 97
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index a39b3c749de5..355d0f1f8c50 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -85,26 +85,19 @@ of ftrace. Here is a list of some of the key files:
85 This file holds the output of the trace in a human 85 This file holds the output of the trace in a human
86 readable format (described below). 86 readable format (described below).
87 87
88 latency_trace:
89
90 This file shows the same trace but the information
91 is organized more to display possible latencies
92 in the system (described below).
93
94 trace_pipe: 88 trace_pipe:
95 89
96 The output is the same as the "trace" file but this 90 The output is the same as the "trace" file but this
97 file is meant to be streamed with live tracing. 91 file is meant to be streamed with live tracing.
98 Reads from this file will block until new data 92 Reads from this file will block until new data is
99 is retrieved. Unlike the "trace" and "latency_trace" 93 retrieved. Unlike the "trace" file, this file is a
100 files, this file is a consumer. This means reading 94 consumer. This means reading from this file causes
101 from this file causes sequential reads to display 95 sequential reads to display more current data. Once
102 more current data. Once data is read from this 96 data is read from this file, it is consumed, and
103 file, it is consumed, and will not be read 97 will not be read again with a sequential read. The
104 again with a sequential read. The "trace" and 98 "trace" file is static, and if the tracer is not
105 "latency_trace" files are static, and if the 99 adding more data,they will display the same
106 tracer is not adding more data, they will display 100 information every time they are read.
107 the same information every time they are read.
108 101
109 trace_options: 102 trace_options:
110 103
@@ -117,10 +110,10 @@ of ftrace. Here is a list of some of the key files:
117 Some of the tracers record the max latency. 110 Some of the tracers record the max latency.
118 For example, the time interrupts are disabled. 111 For example, the time interrupts are disabled.
119 This time is saved in this file. The max trace 112 This time is saved in this file. The max trace
120 will also be stored, and displayed by either 113 will also be stored, and displayed by "trace".
121 "trace" or "latency_trace". A new max trace will 114 A new max trace will only be recorded if the
122 only be recorded if the latency is greater than 115 latency is greater than the value in this
123 the value in this file. (in microseconds) 116 file. (in microseconds)
124 117
125 buffer_size_kb: 118 buffer_size_kb:
126 119
@@ -210,7 +203,7 @@ Here is the list of current tracers that may be configured.
210 the trace with the longest max latency. 203 the trace with the longest max latency.
211 See tracing_max_latency. When a new max is recorded, 204 See tracing_max_latency. When a new max is recorded,
212 it replaces the old trace. It is best to view this 205 it replaces the old trace. It is best to view this
213 trace via the latency_trace file. 206 trace with the latency-format option enabled.
214 207
215 "preemptoff" 208 "preemptoff"
216 209
@@ -307,8 +300,8 @@ the lowest priority thread (pid 0).
307Latency trace format 300Latency trace format
308-------------------- 301--------------------
309 302
310For traces that display latency times, the latency_trace file 303When the latency-format option is enabled, the trace file gives
311gives somewhat more information to see why a latency happened. 304somewhat more information to see why a latency happened.
312Here is a typical trace. 305Here is a typical trace.
313 306
314# tracer: irqsoff 307# tracer: irqsoff
@@ -380,9 +373,10 @@ explains which is which.
380 373
381The above is mostly meaningful for kernel developers. 374The above is mostly meaningful for kernel developers.
382 375
383 time: This differs from the trace file output. The trace file output 376 time: When the latency-format option is enabled, the trace file
384 includes an absolute timestamp. The timestamp used by the 377 output includes a timestamp relative to the start of the
385 latency_trace file is relative to the start of the trace. 378 trace. This differs from the output when latency-format
379 is disabled, which includes an absolute timestamp.
386 380
387 delay: This is just to help catch your eye a bit better. And 381 delay: This is just to help catch your eye a bit better. And
388 needs to be fixed to be only relative to the same CPU. 382 needs to be fixed to be only relative to the same CPU.
@@ -440,7 +434,8 @@ Here are the available options:
440 sym-addr: 434 sym-addr:
441 bash-4000 [01] 1477.606694: simple_strtoul <c0339346> 435 bash-4000 [01] 1477.606694: simple_strtoul <c0339346>
442 436
443 verbose - This deals with the latency_trace file. 437 verbose - This deals with the trace file when the
438 latency-format option is enabled.
444 439
445 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ 440 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \
446 (+0.000ms): simple_strtoul (strict_strtoul) 441 (+0.000ms): simple_strtoul (strict_strtoul)
@@ -472,7 +467,7 @@ Here are the available options:
472 the app is no longer running 467 the app is no longer running
473 468
474 The lookup is performed when you read 469 The lookup is performed when you read
475 trace,trace_pipe,latency_trace. Example: 470 trace,trace_pipe. Example:
476 471
477 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 472 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
478x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] 473x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
@@ -481,6 +476,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
481 every scheduling event. Will add overhead if 476 every scheduling event. Will add overhead if
482 there's a lot of tasks running at once. 477 there's a lot of tasks running at once.
483 478
479 latency-format - This option changes the trace. When
480 it is enabled, the trace displays
481 additional information about the
482 latencies, as described in "Latency
483 trace format".
484 484
485sched_switch 485sched_switch
486------------ 486------------
@@ -596,12 +596,13 @@ To reset the maximum, echo 0 into tracing_max_latency. Here is
596an example: 596an example:
597 597
598 # echo irqsoff > current_tracer 598 # echo irqsoff > current_tracer
599 # echo latency-format > trace_options
599 # echo 0 > tracing_max_latency 600 # echo 0 > tracing_max_latency
600 # echo 1 > tracing_enabled 601 # echo 1 > tracing_enabled
601 # ls -ltr 602 # ls -ltr
602 [...] 603 [...]
603 # echo 0 > tracing_enabled 604 # echo 0 > tracing_enabled
604 # cat latency_trace 605 # cat trace
605# tracer: irqsoff 606# tracer: irqsoff
606# 607#
607irqsoff latency trace v1.1.5 on 2.6.26 608irqsoff latency trace v1.1.5 on 2.6.26
@@ -703,12 +704,13 @@ which preemption was disabled. The control of preemptoff tracer
703is much like the irqsoff tracer. 704is much like the irqsoff tracer.
704 705
705 # echo preemptoff > current_tracer 706 # echo preemptoff > current_tracer
707 # echo latency-format > trace_options
706 # echo 0 > tracing_max_latency 708 # echo 0 > tracing_max_latency
707 # echo 1 > tracing_enabled 709 # echo 1 > tracing_enabled
708 # ls -ltr 710 # ls -ltr
709 [...] 711 [...]
710 # echo 0 > tracing_enabled 712 # echo 0 > tracing_enabled
711 # cat latency_trace 713 # cat trace
712# tracer: preemptoff 714# tracer: preemptoff
713# 715#
714preemptoff latency trace v1.1.5 on 2.6.26-rc8 716preemptoff latency trace v1.1.5 on 2.6.26-rc8
@@ -850,12 +852,13 @@ Again, using this trace is much like the irqsoff and preemptoff
850tracers. 852tracers.
851 853
852 # echo preemptirqsoff > current_tracer 854 # echo preemptirqsoff > current_tracer
855 # echo latency-format > trace_options
853 # echo 0 > tracing_max_latency 856 # echo 0 > tracing_max_latency
854 # echo 1 > tracing_enabled 857 # echo 1 > tracing_enabled
855 # ls -ltr 858 # ls -ltr
856 [...] 859 [...]
857 # echo 0 > tracing_enabled 860 # echo 0 > tracing_enabled
858 # cat latency_trace 861 # cat trace
859# tracer: preemptirqsoff 862# tracer: preemptirqsoff
860# 863#
861preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 864preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
@@ -1012,11 +1015,12 @@ Instead of performing an 'ls', we will run 'sleep 1' under
1012'chrt' which changes the priority of the task. 1015'chrt' which changes the priority of the task.
1013 1016
1014 # echo wakeup > current_tracer 1017 # echo wakeup > current_tracer
1018 # echo latency-format > trace_options
1015 # echo 0 > tracing_max_latency 1019 # echo 0 > tracing_max_latency
1016 # echo 1 > tracing_enabled 1020 # echo 1 > tracing_enabled
1017 # chrt -f 5 sleep 1 1021 # chrt -f 5 sleep 1
1018 # echo 0 > tracing_enabled 1022 # echo 0 > tracing_enabled
1019 # cat latency_trace 1023 # cat trace
1020# tracer: wakeup 1024# tracer: wakeup
1021# 1025#
1022wakeup latency trace v1.1.5 on 2.6.26-rc8 1026wakeup latency trace v1.1.5 on 2.6.26-rc8
diff --git a/Documentation/trace/function-graph-fold.vim b/Documentation/trace/function-graph-fold.vim
new file mode 100644
index 000000000000..0544b504c8b0
--- /dev/null
+++ b/Documentation/trace/function-graph-fold.vim
@@ -0,0 +1,42 @@
1" Enable folding for ftrace function_graph traces.
2"
3" To use, :source this file while viewing a function_graph trace, or use vim's
4" -S option to load from the command-line together with a trace. You can then
5" use the usual vim fold commands, such as "za", to open and close nested
6" functions. While closed, a fold will show the total time taken for a call,
7" as would normally appear on the line with the closing brace. Folded
8" functions will not include finish_task_switch(), so folding should remain
9" relatively sane even through a context switch.
10"
11" Note that this will almost certainly only work well with a
12" single-CPU trace (e.g. trace-cmd report --cpu 1).
13
14function! FunctionGraphFoldExpr(lnum)
15 let line = getline(a:lnum)
16 if line[-1:] == '{'
17 if line =~ 'finish_task_switch() {$'
18 return '>1'
19 endif
20 return 'a1'
21 elseif line[-1:] == '}'
22 return 's1'
23 else
24 return '='
25 endif
26endfunction
27
28function! FunctionGraphFoldText()
29 let s = split(getline(v:foldstart), '|', 1)
30 if getline(v:foldend+1) =~ 'finish_task_switch() {$'
31 let s[2] = ' task switch '
32 else
33 let e = split(getline(v:foldend), '|', 1)
34 let s[2] = e[2]
35 endif
36 return join(s, '|')
37endfunction
38
39setlocal foldexpr=FunctionGraphFoldExpr(v:lnum)
40setlocal foldtext=FunctionGraphFoldText()
41setlocal foldcolumn=12
42setlocal foldmethod=expr
diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt
new file mode 100644
index 000000000000..5b1d23d604c5
--- /dev/null
+++ b/Documentation/trace/ring-buffer-design.txt
@@ -0,0 +1,955 @@
1 Lockless Ring Buffer Design
2 ===========================
3
4Copyright 2009 Red Hat Inc.
5 Author: Steven Rostedt <srostedt@redhat.com>
6 License: The GNU Free Documentation License, Version 1.2
7 (dual licensed under the GPL v2)
8Reviewers: Mathieu Desnoyers, Huang Ying, Hidetoshi Seto,
9 and Frederic Weisbecker.
10
11
12Written for: 2.6.31
13
14Terminology used in this Document
15---------------------------------
16
17tail - where new writes happen in the ring buffer.
18
19head - where new reads happen in the ring buffer.
20
21producer - the task that writes into the ring buffer (same as writer)
22
23writer - same as producer
24
25consumer - the task that reads from the buffer (same as reader)
26
27reader - same as consumer.
28
29reader_page - A page outside the ring buffer used solely (for the most part)
30 by the reader.
31
32head_page - a pointer to the page that the reader will use next
33
34tail_page - a pointer to the page that will be written to next
35
36commit_page - a pointer to the page with the last finished non nested write.
37
38cmpxchg - hardware assisted atomic transaction that performs the following:
39
40 A = B iff previous A == C
41
42 R = cmpxchg(A, C, B) is saying that we replace A with B if and only if
43 current A is equal to C, and we put the old (current) A into R
44
45 R gets the previous A regardless if A is updated with B or not.
46
47 To see if the update was successful a compare of R == C may be used.
48
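
In kernel C this operation is the cmpxchg() macro (which takes a pointer rather than the variable itself); a minimal sketch using the A, B, C, and R names from above:

/* Sketch only: cmpxchg() is provided by the architecture headers. */
static int try_update(unsigned long *A, unsigned long C, unsigned long B)
{
        unsigned long R = cmpxchg(A, C, B);     /* *A becomes B iff *A was C */

        return R == C;                          /* nonzero: the swap happened */
}
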
49The Generic Ring Buffer
50-----------------------
51
52The ring buffer can be used in either an overwrite mode or in
53producer/consumer mode.
54
55Producer/consumer mode is where, if the producer were to fill up the
56buffer before the consumer could free up anything, the producer
57will stop writing to the buffer. This will lose the most recent events.
58
59Overwrite mode is where, if the producer were to fill up the buffer
60before the consumer could free up anything, the producer will
61overwrite the older data. This will lose the oldest events.
62
63No two writers can write at the same time (on the same per-cpu buffer),
64but a writer may interrupt another writer; the interrupting writer must
65finish writing before the preempted writer may continue. This is very
66important to the algorithm. The writers act like a "stack". The way
67interrupts work enforces this behavior.
68
69
70 writer1 start
71 <preempted> writer2 start
72 <preempted> writer3 start
73 writer3 finishes
74 writer2 finishes
75 writer1 finishes
76
77This is very much like a writer being preempted by an interrupt and
78the interrupt doing a write as well.
79
80Readers can happen at any time. But no two readers may run at the
81same time, nor can a reader preempt/interrupt another reader. A reader
82cannot preempt/interrupt a writer, but it may read/consume from the
83buffer at the same time as a writer is writing; to do so, the reader
84must be on another processor. A reader may read on its own processor
85and can be preempted by a writer.
86
87A writer can preempt a reader, but a reader can not preempt a writer.
88But a reader can read the buffer at the same time (on another processor)
89as a writer.
90
91The ring buffer is made up of a list of pages held together by a linked list.
92
93At initialization a reader page is allocated for the reader that is not
94part of the ring buffer.
95
96The head_page, tail_page and commit_page are all initialized to point
97to the same page.
98
99The reader page is initialized to have its next pointer pointing to
100the head page, and its previous pointer pointing to a page before
101the head page.
102
103The reader has its own page to use. At start up time, this page is
104allocated but is not attached to the list. When the reader wants
105to read from the buffer, if its page is empty (like it is on start up)
106it will swap its page with the head_page. The old reader page will
107become part of the ring buffer and the head_page will be removed.
108The page after the inserted page (old reader_page) will become the
109new head page.
110
111Once the new page is given to the reader, the reader could do what
112it wants with it, as long as a writer has left that page.
113
114A sample of how the reader page is swapped: Note this does not
115show the head page in the buffer, it is for demonstrating a swap
116only.
117
118 +------+
119 |reader| RING BUFFER
120 |page |
121 +------+
122 +---+ +---+ +---+
123 | |-->| |-->| |
124 | |<--| |<--| |
125 +---+ +---+ +---+
126 ^ | ^ |
127 | +-------------+ |
128 +-----------------+
129
130
131 +------+
132 |reader| RING BUFFER
133 |page |-------------------+
134 +------+ v
135 | +---+ +---+ +---+
136 | | |-->| |-->| |
137 | | |<--| |<--| |<-+
138 | +---+ +---+ +---+ |
139 | ^ | ^ | |
140 | | +-------------+ | |
141 | +-----------------+ |
142 +------------------------------------+
143
144 +------+
145 |reader| RING BUFFER
146 |page |-------------------+
147 +------+ <---------------+ v
148 | ^ +---+ +---+ +---+
149 | | | |-->| |-->| |
150 | | | | | |<--| |<-+
151 | | +---+ +---+ +---+ |
152 | | | ^ | |
153 | | +-------------+ | |
154 | +-----------------------------+ |
155 +------------------------------------+
156
157 +------+
158 |buffer| RING BUFFER
159 |page |-------------------+
160 +------+ <---------------+ v
161 | ^ +---+ +---+ +---+
162 | | | | | |-->| |
163 | | New | | | |<--| |<-+
164 | | Reader +---+ +---+ +---+ |
165 | | page ----^ | |
166 | | | |
167 | +-----------------------------+ |
168 +------------------------------------+
169
170
171
172It is possible that the page swapped is the commit page and the tail page,
173if what is in the ring buffer is less than what is held in a buffer page.
174
175
176 reader page commit page tail page
177 | | |
178 v | |
179 +---+ | |
180 | |<----------+ |
181 | |<------------------------+
182 | |------+
183 +---+ |
184 |
185 v
186 +---+ +---+ +---+ +---+
187<---| |--->| |--->| |--->| |--->
188--->| |<---| |<---| |<---| |<---
189 +---+ +---+ +---+ +---+
190
191This case is still valid for this algorithm.
192When the writer leaves the page, it simply goes into the ring buffer
193since the reader page still points to the next location in the ring
194buffer.
195
196
197The main pointers:
198
199 reader page - The page used solely by the reader and is not part
200 of the ring buffer (may be swapped in)
201
202 head page - the next page in the ring buffer that will be swapped
203 with the reader page.
204
205 tail page - the page where the next write will take place.
206
207 commit page - the page that last finished a write.
208
209The commit page is only updated by the outermost writer in the
210writer stack. A writer that preempts another writer will not move the
211commit page.
212
213When data is written into the ring buffer, a position is reserved
214in the ring buffer and passed back to the writer. When the writer
215is finished writing data into that position, it commits the write.
216
217Another write (or a read) may take place at any time during this
218transaction. If another write happens, it must finish before continuing
219with the previous write.
220
221
222 Write reserve:
223
224 Buffer page
225 +---------+
226 |written |
227 +---------+ <--- given back to writer (current commit)
228 |reserved |
229 +---------+ <--- tail pointer
230 | empty |
231 +---------+
232
233 Write commit:
234
235 Buffer page
236 +---------+
237 |written |
238 +---------+
239 |written |
240 +---------+ <--- next positon for write (current commit)
241 | empty |
242 +---------+
243
244
245 If a write happens after the first reserve:
246
247 Buffer page
248 +---------+
249 |written |
250 +---------+ <-- current commit
251 |reserved |
252 +---------+ <--- given back to second writer
253 |reserved |
254 +---------+ <--- tail pointer
255
256 After second writer commits:
257
258
259 Buffer page
260 +---------+
261 |written |
262 +---------+ <--(last full commit)
263 |reserved |
264 +---------+
265 |pending |
266 |commit |
267 +---------+ <--- tail pointer
268
269 When the first writer commits:
270
271 Buffer page
272 +---------+
273 |written |
274 +---------+
275 |written |
276 +---------+
277 |written |
278 +---------+ <--(last full commit and tail pointer)
279
280
281The commit pointer points to the last write location that was
282committed without preempting another write. When a write that
283preempted another write is committed, it only becomes a pending commit
284and will not be a full commit till all writes have been committed.
285
286The commit page points to the page that has the last full commit.
287The tail page points to the page with the last write (before
288committing).
289
290The tail page is always equal to or after the commit page. It may
291be several pages ahead. If the tail page catches up to the commit
292page then no more writes may take place (regardless of the mode
293of the ring buffer: overwrite and producer/consumer).
294
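
The reserve/commit protocol described above is roughly what the kernel's ring-buffer API exposes to writers; a sketch of a hypothetical writer, assuming the ring_buffer_lock_reserve()/ring_buffer_event_data()/ring_buffer_unlock_commit() interface declared in include/linux/ring_buffer.h:

#include <linux/ring_buffer.h>

static void write_sample(struct ring_buffer *buffer, int value)
{
        struct ring_buffer_event *event;
        int *body;

        /* Reserve space; this moves the tail pointer. */
        event = ring_buffer_lock_reserve(buffer, sizeof(*body));
        if (!event)
                return;         /* e.g. buffer full in producer/consumer mode */

        body = ring_buffer_event_data(event);
        *body = value;          /* writes land in the reserved slot */

        /* Commit; the commit page/pointer advance only for the outermost writer. */
        ring_buffer_unlock_commit(buffer, event);
}
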
295The order of pages is:
296
297 head page
298 commit page
299 tail page
300
301Possible scenario:
302 tail page
303 head page commit page |
304 | | |
305 v v v
306 +---+ +---+ +---+ +---+
307<---| |--->| |--->| |--->| |--->
308--->| |<---| |<---| |<---| |<---
309 +---+ +---+ +---+ +---+
310
311There is a special case where the head page is after the commit page
312and possibly the tail page. That is when the commit (and tail) page has been
313swapped with the reader page. This is because the head page is always
314part of the ring buffer, but the reader page is not. Whenever less than
315a full page has been committed inside the ring buffer and a reader
316swaps out a page, it will be swapping out the commit page.
317
318
319 reader page commit page tail page
320 | | |
321 v | |
322 +---+ | |
323 | |<----------+ |
324 | |<------------------------+
325 | |------+
326 +---+ |
327 |
328 v
329 +---+ +---+ +---+ +---+
330<---| |--->| |--->| |--->| |--->
331--->| |<---| |<---| |<---| |<---
332 +---+ +---+ +---+ +---+
333 ^
334 |
335 head page
336
337
338In this case, the head page will not move when the tail and commit
339move back into the ring buffer.
340
341The reader can not swap a page into the ring buffer if the commit page
342is still on that page. If the read meets the last commit (real commit
343not pending or reserved), then there is nothing more to read.
344The buffer is considered empty until another full commit finishes.
345
346When the tail meets the head page, if the buffer is in overwrite mode,
347the head page will be pushed ahead one. If the buffer is in producer/consumer
348mode, the write will fail.
349
350Overwrite mode:
351
352 tail page
353 |
354 v
355 +---+ +---+ +---+ +---+
356<---| |--->| |--->| |--->| |--->
357--->| |<---| |<---| |<---| |<---
358 +---+ +---+ +---+ +---+
359 ^
360 |
361 head page
362
363
364 tail page
365 |
366 v
367 +---+ +---+ +---+ +---+
368<---| |--->| |--->| |--->| |--->
369--->| |<---| |<---| |<---| |<---
370 +---+ +---+ +---+ +---+
371 ^
372 |
373 head page
374
375
376 tail page
377 |
378 v
379 +---+ +---+ +---+ +---+
380<---| |--->| |--->| |--->| |--->
381--->| |<---| |<---| |<---| |<---
382 +---+ +---+ +---+ +---+
383 ^
384 |
385 head page
386
387Note, the reader page will still point to the previous head page.
388But when a swap takes place, it will use the most recent head page.
389
390
391Making the Ring Buffer Lockless:
392--------------------------------
393
394The main idea behind the lockless algorithm is to combine the moving
395of the head_page pointer with the swapping of pages with the reader.
396State flags are placed inside the pointer to the page. To do this,
397each page must be aligned in memory to 4 bytes, which allows the 2
398least significant bits of the address to be used as flags, since
399those bits are always zero in the address itself. To get the address,
400simply mask out the flags.
401
402 MASK = ~3
403
404 address & MASK
405
406Two flags will be kept by these two bits:
407
408 HEADER - the page being pointed to is a head page
409
410 UPDATE - the page being pointed to is being updated by a writer
411 and was or is about to be a head page.
412
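A minimal sketch of how these two flags can live in the low bits of a
4-byte-aligned page pointer (illustrative pseudo-C; the names below are made
up and are not necessarily the ones used by the actual kernel code):

    #define NORMAL     0UL     /* plain pointer, no flag set */
    #define HEADER     1UL     /* the page pointed to is the head page */
    #define UPDATE     2UL     /* a writer is in the middle of moving the head page */
    #define FLAG_MASK  3UL

    /* strip the flag bits to recover the real page pointer */
    #define strip_flags(ptr)   ((void *)((unsigned long)(ptr) & ~FLAG_MASK))

    /* attach a flag to a page pointer */
    #define set_flag(ptr, f)   ((void *)((unsigned long)(ptr) | (f)))

    /* read just the flag bits of a pointer */
    #define read_flags(ptr)    ((unsigned long)(ptr) & FLAG_MASK)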
413
414 reader page
415 |
416 v
417 +---+
418 | |------+
419 +---+ |
420 |
421 v
422 +---+ +---+ +---+ +---+
423<---| |--->| |-H->| |--->| |--->
424--->| |<---| |<---| |<---| |<---
425 +---+ +---+ +---+ +---+
426
427
428The above pointer "-H->" has the HEADER flag set. It means that the
429page it points to is the head page: the next page to be
430swapped out by the reader.
431
432When the tail page meets the head pointer, it will use cmpxchg to
433change the pointer to the UPDATE state:
434
435
436 tail page
437 |
438 v
439 +---+ +---+ +---+ +---+
440<---| |--->| |-H->| |--->| |--->
441--->| |<---| |<---| |<---| |<---
442 +---+ +---+ +---+ +---+
443
444 tail page
445 |
446 v
447 +---+ +---+ +---+ +---+
448<---| |--->| |-U->| |--->| |--->
449--->| |<---| |<---| |<---| |<---
450 +---+ +---+ +---+ +---+
451
452"-U->" represents a pointer in the UPDATE state.
453
454Any access to the reader will need to take some sort of lock to serialize
455the readers. But the writers will never take a lock to write to the
456ring buffer. This means we only need to worry about a single reader,
457and writes only preempt in "stack" formation.
458
459When the reader tries to swap the page with the ring buffer, it
460will also use cmpxchg. If the pointer to the
461head page does not have the HEADER flag set, the compare will fail
462and the reader will need to look for the new head page and try again.
463Note, the UPDATE and HEADER flags are never set at the same time.
464
465The reader swaps the reader page as follows:
466
467 +------+
468 |reader| RING BUFFER
469 |page |
470 +------+
471 +---+ +---+ +---+
472 | |--->| |--->| |
473 | |<---| |<---| |
474 +---+ +---+ +---+
475 ^ | ^ |
476 | +---------------+ |
477 +-----H-------------+
478
479The reader sets the reader page's next pointer, with the HEADER flag, to
480point to the page after the head page.
481
482
483 +------+
484 |reader| RING BUFFER
485 |page |-------H-----------+
486 +------+ v
487 | +---+ +---+ +---+
488 | | |--->| |--->| |
489 | | |<---| |<---| |<-+
490 | +---+ +---+ +---+ |
491 | ^ | ^ | |
492 | | +---------------+ | |
493 | +-----H-------------+ |
494 +--------------------------------------+
495
496It then does a cmpxchg on the pointer that pointed to the previous head page
497to make it point to the reader page. Note that the new pointer does not have
498the HEADER flag set. This action atomically moves the head page forward.
499
500 +------+
501 |reader| RING BUFFER
502 |page |-------H-----------+
503 +------+ v
504 | ^ +---+ +---+ +---+
505 | | | |-->| |-->| |
506 | | | |<--| |<--| |<-+
507 | | +---+ +---+ +---+ |
508 | | | ^ | |
509 | | +-------------+ | |
510 | +-----------------------------+ |
511 +------------------------------------+
512
513After the new head page is set, the previous pointer of the head page is
514updated to the reader page.
515
516 +------+
517 |reader| RING BUFFER
518 |page |-------H-----------+
519 +------+ <---------------+ v
520 | ^ +---+ +---+ +---+
521 | | | |-->| |-->| |
522 | | | | | |<--| |<-+
523 | | +---+ +---+ +---+ |
524 | | | ^ | |
525 | | +-------------+ | |
526 | +-----------------------------+ |
527 +------------------------------------+
528
529 +------+
530 |buffer| RING BUFFER
531 |page |-------H-----------+ <--- New head page
532 +------+ <---------------+ v
533 | ^ +---+ +---+ +---+
534 | | | | | |-->| |
535 | | New | | | |<--| |<-+
536 | | Reader +---+ +---+ +---+ |
537 | | page ----^ | |
538 | | | |
539 | +-----------------------------+ |
540 +------------------------------------+
541
542Another important point: the page that the reader page points back to
543by its previous pointer (the one that now points to the new head page)
544never points back to the reader page. That is because the reader page is
545not part of the ring buffer. Traversing the ring buffer via the next pointers
546will always stay in the ring buffer. Traversing the ring buffer via the
547prev pointers may not.
548
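Putting the swap just illustrated into a rough pseudo-C sketch (helper names
are made up and reuse the flag macros sketched earlier; the real code has
more checks and retry handling):

    /* 1) aim the reader page, with the HEADER flag, past the head page */
    reader_page->next = set_flag(strip_flags(head_page->next), HEADER);
    reader_page->prev = head_page->prev;

    /*
     * 2) atomically splice the reader page in where the head page was.
     *    This only succeeds while the HEADER flag is still set, i.e.
     *    no writer is currently moving the head page.
     */
    if (cmpxchg(&head_page->prev->next,
                set_flag(head_page, HEADER), reader_page) !=
        set_flag(head_page, HEADER))
            return -EAGAIN;   /* head moved: find the new head and retry */

    /* 3) point the new head page's prev pointer at the reader page */
    strip_flags(reader_page->next)->prev = reader_page;

    /* 4) the old head page becomes the new reader page */
    new_reader_page = head_page;
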
549Note, the way to determine a reader page is simply by examining the previous
550pointer of the page. If the next pointer of the previous page does not
551point back to the original page, then the original page is a reader page:
552
553
554 +--------+
555 | reader | next +----+
556 | page |-------->| |<====== (buffer page)
557 +--------+ +----+
558 | | ^
559 | v | next
560 prev | +----+
561 +------------->| |
562 +----+
563
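In pseudo-C, the test described above is simply (field and helper names are
illustrative):

    /* a page is the reader page if the ring does not link back to it */
    static int page_is_reader_page(struct buffer_page *page)
    {
            return strip_flags(page->prev->next) != page;
    }
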
564The way the head page moves forward:
565
566When the tail page meets the head page and the buffer is in overwrite mode
567and more writes take place, the head page must be moved forward before the
568writer may move the tail page. To do this, the writer performs a cmpxchg
569that converts the pointer to the head page from the HEADER flag to the
570UPDATE flag. Once this is done, the reader will
571not be able to swap the head page from the buffer, nor will it be able to
572move the head page, until the writer is finished with the move.
573
574This eliminates any races between the reader and the writer during the move.
575The reader may have to spin, and this is why the reader must not preempt the writer.
576
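The sequence that the diagrams below walk through step by step looks roughly
like this (an illustrative pseudo-C sketch using the flag helpers from
earlier, not the kernel's actual code):

    /* 1) claim the move: HEADER -> UPDATE on the pointer to the head page */
    if (cmpxchg(&prev_page->next,
                set_flag(head_page, HEADER),
                set_flag(head_page, UPDATE)) != set_flag(head_page, HEADER))
            return -EAGAIN;   /* the reader or a nested writer got here first */

    /* 2) flag the page after the head page as the new head page */
    cmpxchg(&head_page->next, next_page, set_flag(next_page, HEADER));

    /* 3) only the writer that set UPDATE drops it back to NORMAL */
    cmpxchg(&prev_page->next, set_flag(head_page, UPDATE), head_page);

    /* 4) now the tail page itself may be moved forward */
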
577 tail page
578 |
579 v
580 +---+ +---+ +---+ +---+
581<---| |--->| |-H->| |--->| |--->
582--->| |<---| |<---| |<---| |<---
583 +---+ +---+ +---+ +---+
584
585 tail page
586 |
587 v
588 +---+ +---+ +---+ +---+
589<---| |--->| |-U->| |--->| |--->
590--->| |<---| |<---| |<---| |<---
591 +---+ +---+ +---+ +---+
592
593The following page will be made into the new head page.
594
595 tail page
596 |
597 v
598 +---+ +---+ +---+ +---+
599<---| |--->| |-U->| |-H->| |--->
600--->| |<---| |<---| |<---| |<---
601 +---+ +---+ +---+ +---+
602
603After the new head page has been set, we can set the old head page
604pointer back to NORMAL.
605
606 tail page
607 |
608 v
609 +---+ +---+ +---+ +---+
610<---| |--->| |--->| |-H->| |--->
611--->| |<---| |<---| |<---| |<---
612 +---+ +---+ +---+ +---+
613
614After the head page has been moved, the tail page may now move forward.
615
616 tail page
617 |
618 v
619 +---+ +---+ +---+ +---+
620<---| |--->| |--->| |-H->| |--->
621--->| |<---| |<---| |<---| |<---
622 +---+ +---+ +---+ +---+
623
624
625The above are the trivial updates. Now for the more complex scenarios.
626
627
628As stated before, if enough writes preempt the first write, the
629tail page may make it all the way around the buffer and meet the commit
630page. At this time, we must start dropping writes (usually with some kind
631of warning to the user). But what happens if the commit was still on the
632reader page? In that case the commit page is not part of the ring buffer,
633and the tail page must account for this.
634
635
636 reader page commit page
637 | |
638 v |
639 +---+ |
640 | |<----------+
641 | |
642 | |------+
643 +---+ |
644 |
645 v
646 +---+ +---+ +---+ +---+
647<---| |--->| |-H->| |--->| |--->
648--->| |<---| |<---| |<---| |<---
649 +---+ +---+ +---+ +---+
650 ^
651 |
652 tail page
653
654If the tail page were to simply push the head page forward, the commit,
655when it leaves the reader page, would not be pointing to the correct page.
656
657The solution to this is to test if the commit page is on the reader page
658before pushing the head page. If it is, then it can be assumed that the
659tail page wrapped the buffer, and we must drop new writes.
660
661This is not a race condition, because the commit page can only be moved
662by the outermost writer (the writer that was preempted).
663This means that the commit will not move while a writer is moving the
664tail page. The reader cannot swap out the reader page if it is also being
665used as the commit page. The reader can simply check that the commit
666is off the reader page. Once the commit page leaves the reader page
667it will never go back on it unless a reader does another swap with the
668buffer page that is also the commit page.
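
A minimal sketch of that test (pseudo-C, made-up names):

    /* the tail has caught up and wants to push the head page forward */
    if (commit_page == reader_page) {
            /*
             * The tail has wrapped all the way around while the commit is
             * still on the reader page: drop this write instead of pushing
             * the head page.
             */
            return NULL;
    }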
669
670
671Nested writes
672-------------
673
674When pushing the tail page forward, we must first push forward the
675head page if the head page is the next page. If the head page
676is not the next page, the tail page is simply updated with a cmpxchg.
677
678Only writers move the tail page. This must be done atomically to protect
679against nested writers.
680
681 temp_page = tail_page
682 next_page = temp_page->next
683 cmpxchg(tail_page, temp_page, next_page)
684
685The above will update the tail page if it is still pointing to the expected
686page. If this fails, a nested write pushed it forward, and the current write
687does not need to push it.
688
689
690 temp page
691 |
692 v
693 tail page
694 |
695 v
696 +---+ +---+ +---+ +---+
697<---| |--->| |--->| |--->| |--->
698--->| |<---| |<---| |<---| |<---
699 +---+ +---+ +---+ +---+
700
701Nested write comes in and moves the tail page forward:
702
703 tail page (moved by nested writer)
704 temp page |
705 | |
706 v v
707 +---+ +---+ +---+ +---+
708<---| |--->| |--->| |--->| |--->
709--->| |<---| |<---| |<---| |<---
710 +---+ +---+ +---+ +---+
711
712The above would fail the cmpxchg, but since the tail page has already
713been moved forward, the writer will just try again to reserve storage
714on the new tail page.
715
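Put together, the tail-page update is essentially a retry loop (a sketch
only, with made-up helpers, not the actual implementation):

    for (;;) {
            event = try_to_reserve(tail_page, length);
            if (event)
                    return event;

            /* the current tail page is full: try to move the tail */
            temp_page = tail_page;
            next_page = temp_page->next;

            /*
             * Only moves the tail if no nested writer moved it first.
             * If the cmpxchg fails, a nested writer already did the
             * move, so just loop and reserve on the new tail page.
             */
            cmpxchg(&tail_page, temp_page, next_page);
    }
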
716But the moving of the head page is a bit more complex.
717
718 tail page
719 |
720 v
721 +---+ +---+ +---+ +---+
722<---| |--->| |-H->| |--->| |--->
723--->| |<---| |<---| |<---| |<---
724 +---+ +---+ +---+ +---+
725
726The writer converts the head page pointer to UPDATE.
727
728 tail page
729 |
730 v
731 +---+ +---+ +---+ +---+
732<---| |--->| |-U->| |--->| |--->
733--->| |<---| |<---| |<---| |<---
734 +---+ +---+ +---+ +---+
735
736But if a nested writer preempts here, it will see that the next
737page is a head page, and it will also detect that it is nested
738and save that information. The detection is simply the
739fact that it sees the UPDATE flag instead of a HEADER or NORMAL
740pointer.
741
742The nested writer will set the new head page pointer.
743
744 tail page
745 |
746 v
747 +---+ +---+ +---+ +---+
748<---| |--->| |-U->| |-H->| |--->
749--->| |<---| |<---| |<---| |<---
750 +---+ +---+ +---+ +---+
751
752But it will not reset the UPDATE back to NORMAL. Only the writer
753that converted a pointer from HEAD to UPDATE will convert it back
754to NORMAL.
755
756 tail page
757 |
758 v
759 +---+ +---+ +---+ +---+
760<---| |--->| |-U->| |-H->| |--->
761--->| |<---| |<---| |<---| |<---
762 +---+ +---+ +---+ +---+
763
764After the nested writer finishes, the outermost writer will convert
765the UPDATE pointer to NORMAL.
766
767
768 tail page
769 |
770 v
771 +---+ +---+ +---+ +---+
772<---| |--->| |--->| |-H->| |--->
773--->| |<---| |<---| |<---| |<---
774 +---+ +---+ +---+ +---+
775
776
777It can be even more complex if several nested writes came in and moved
778the tail page ahead several pages:
779
780
781(first writer)
782
783 tail page
784 |
785 v
786 +---+ +---+ +---+ +---+
787<---| |--->| |-H->| |--->| |--->
788--->| |<---| |<---| |<---| |<---
789 +---+ +---+ +---+ +---+
790
791The writer converts the head page pointer to UPDATE.
792
793 tail page
794 |
795 v
796 +---+ +---+ +---+ +---+
797<---| |--->| |-U->| |--->| |--->
798--->| |<---| |<---| |<---| |<---
799 +---+ +---+ +---+ +---+
800
801The next writer comes in, sees the UPDATE flag, and sets up the new
802head page.
803
804(second writer)
805
806 tail page
807 |
808 v
809 +---+ +---+ +---+ +---+
810<---| |--->| |-U->| |-H->| |--->
811--->| |<---| |<---| |<---| |<---
812 +---+ +---+ +---+ +---+
813
814The nested writer moves the tail page forward, but does not set the old
815UPDATE pointer to NORMAL because it is not the outermost writer.
816
817 tail page
818 |
819 v
820 +---+ +---+ +---+ +---+
821<---| |--->| |-U->| |-H->| |--->
822--->| |<---| |<---| |<---| |<---
823 +---+ +---+ +---+ +---+
824
825Another writer preempts and sees that the page after the tail page is a
826head page. It changes that pointer from HEAD to UPDATE.
827
828(third writer)
829
830 tail page
831 |
832 v
833 +---+ +---+ +---+ +---+
834<---| |--->| |-U->| |-U->| |--->
835--->| |<---| |<---| |<---| |<---
836 +---+ +---+ +---+ +---+
837
838The writer will move the head page forward:
839
840
841(third writer)
842
843 tail page
844 |
845 v
846 +---+ +---+ +---+ +---+
847<---| |--->| |-U->| |-U->| |-H->
848--->| |<---| |<---| |<---| |<---
849 +---+ +---+ +---+ +---+
850
851Since the third writer is the one that changed the HEAD flag to UPDATE,
852it will convert it back to NORMAL:
853
854
855(third writer)
856
857 tail page
858 |
859 v
860 +---+ +---+ +---+ +---+
861<---| |--->| |-U->| |--->| |-H->
862--->| |<---| |<---| |<---| |<---
863 +---+ +---+ +---+ +---+
864
865
866Then it will move the tail page and return to the second writer.
867
868
869(second writer)
870
871 tail page
872 |
873 v
874 +---+ +---+ +---+ +---+
875<---| |--->| |-U->| |--->| |-H->
876--->| |<---| |<---| |<---| |<---
877 +---+ +---+ +---+ +---+
878
879
880The second writer will fail to move the tail page because it was already
881moved, so it will try again and add its data to the new tail page.
882It will return to the first writer.
883
884
885(first writer)
886
887 tail page
888 |
889 v
890 +---+ +---+ +---+ +---+
891<---| |--->| |-U->| |--->| |-H->
892--->| |<---| |<---| |<---| |<---
893 +---+ +---+ +---+ +---+
894
895The first writer cannot atomically test whether the tail page moved
896while it was updating the HEAD page. It will simply update the head page
897to what it thinks is the new head page.
898
899
900(first writer)
901
902 tail page
903 |
904 v
905 +---+ +---+ +---+ +---+
906<---| |--->| |-U->| |-H->| |-H->
907--->| |<---| |<---| |<---| |<---
908 +---+ +---+ +---+ +---+
909
910Since the cmpxchg returns the old value of the pointer, the first writer
911will see that it succeeded in updating the pointer from NORMAL to HEAD.
912But as we can see, this is not good enough. It must also check to see
913if the tail page is either where it used to be or on the next page:
914
915
916(first writer)
917
918 A B tail page
919 | | |
920 v v v
921 +---+ +---+ +---+ +---+
922<---| |--->| |-U->| |-H->| |-H->
923--->| |<---| |<---| |<---| |<---
924 +---+ +---+ +---+ +---+
925
926If the tail page is neither A nor B, then the writer must reset the
927pointer back to NORMAL. Because it only needs to worry about
928nested writers, it only needs to check this after setting the HEAD page.
929
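In rough pseudo-C (names are made up; A and B are the pages marked in the
diagram above), using the flag helpers sketched earlier:

    /* the first writer just won a cmpxchg that set HEADER on the pointer
     * to what it thinks is the new head page */
    if (tail_page != A && tail_page != B) {
            /*
             * Nested writers already pushed the tail (and the real head)
             * past us, so the HEADER flag we just set is stale:
             * put that pointer back to NORMAL.
             */
            cmpxchg(ptr_to_new_head, set_flag(new_head, HEADER), new_head);
    }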
930
931(first writer)
932
933 A B tail page
934 | | |
935 v v v
936 +---+ +---+ +---+ +---+
937<---| |--->| |-U->| |--->| |-H->
938--->| |<---| |<---| |<---| |<---
939 +---+ +---+ +---+ +---+
940
941Now the writer can update the head page. This is also why the head page must
942remain in UPDATE and only be reset by the outermost writer. This prevents
943the reader from seeing an incorrect head page.
944
945
946(first writer)
947
948 A B tail page
949 | | |
950 v v v
951 +---+ +---+ +---+ +---+
952<---| |--->| |--->| |--->| |-H->
953--->| |<---| |<---| |<---| |<---
954 +---+ +---+ +---+ +---+
955
diff --git a/arch/Kconfig b/arch/Kconfig
index 99193b160232..beea3ccebb5e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -30,6 +30,18 @@ config OPROFILE_IBS
30 30
31 If unsure, say N. 31 If unsure, say N.
32 32
33config OPROFILE_EVENT_MULTIPLEX
34 bool "OProfile multiplexing support (EXPERIMENTAL)"
35 default n
36 depends on OPROFILE && X86
37 help
38 The number of hardware counters is limited. The multiplexing
39 feature enables OProfile to gather more events than counters
40 are provided by the hardware. This is realized by switching
41 between events at an user specified time interval.
42
43 If unsure, say N.
44
33config HAVE_OPROFILE 45config HAVE_OPROFILE
34 bool 46 bool
35 47
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 5a61b5c2e18f..8d3c79cd81e7 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -44,7 +44,6 @@ static inline void dma_free_coherent(struct device *dev, size_t size,
44#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) 44#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
45 45
46#define get_dma_ops(dev) platform_dma_get_ops(dev) 46#define get_dma_ops(dev) platform_dma_get_ops(dev)
47#define flush_write_buffers()
48 47
49#include <asm-generic/dma-mapping-common.h> 48#include <asm-generic/dma-mapping-common.h>
50 49
@@ -69,6 +68,24 @@ dma_set_mask (struct device *dev, u64 mask)
69 return 0; 68 return 0;
70} 69}
71 70
71static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
72{
73 if (!dev->dma_mask)
74 return 0;
75
76 return addr + size <= *dev->dma_mask;
77}
78
79static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
80{
81 return paddr;
82}
83
84static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
85{
86 return daddr;
87}
88
72extern int dma_get_cache_alignment(void); 89extern int dma_get_cache_alignment(void);
73 90
74static inline void 91static inline void
diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c
index fb8332690179..dbeadb9c8e20 100644
--- a/arch/ia64/xen/time.c
+++ b/arch/ia64/xen/time.c
@@ -133,8 +133,7 @@ consider_steal_time(unsigned long new_itm)
133 account_idle_ticks(blocked); 133 account_idle_ticks(blocked);
134 run_local_timers(); 134 run_local_timers();
135 135
136 if (rcu_pending(cpu)) 136 rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
137 rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
138 137
139 scheduler_tick(); 138 scheduler_tick();
140 run_posix_cpu_timers(p); 139 run_posix_cpu_timers(p);
diff --git a/arch/m68k/include/asm/entry_mm.h b/arch/m68k/include/asm/entry_mm.h
index 5202f5a5b420..474125886218 100644
--- a/arch/m68k/include/asm/entry_mm.h
+++ b/arch/m68k/include/asm/entry_mm.h
@@ -46,7 +46,6 @@
46#define curptr a2 46#define curptr a2
47 47
48LFLUSH_I_AND_D = 0x00000808 48LFLUSH_I_AND_D = 0x00000808
49LSIGTRAP = 5
50 49
51/* process bits for task_struct.ptrace */ 50/* process bits for task_struct.ptrace */
52PT_TRACESYS_OFF = 3 51PT_TRACESYS_OFF = 3
@@ -118,9 +117,6 @@ PT_DTRACE_BIT = 2
118#define STR(X) STR1(X) 117#define STR(X) STR1(X)
119#define STR1(X) #X 118#define STR1(X) #X
120 119
121#define PT_OFF_ORIG_D0 0x24
122#define PT_OFF_FORMATVEC 0x32
123#define PT_OFF_SR 0x2C
124#define SAVE_ALL_INT \ 120#define SAVE_ALL_INT \
125 "clrl %%sp@-;" /* stk_adj */ \ 121 "clrl %%sp@-;" /* stk_adj */ \
126 "pea -1:w;" /* orig d0 = -1 */ \ 122 "pea -1:w;" /* orig d0 = -1 */ \
diff --git a/arch/m68k/include/asm/entry_no.h b/arch/m68k/include/asm/entry_no.h
index c2553d26273d..907ed03d792f 100644
--- a/arch/m68k/include/asm/entry_no.h
+++ b/arch/m68k/include/asm/entry_no.h
@@ -72,8 +72,8 @@ LENOSYS = 38
72 lea %sp@(-32),%sp /* space for 8 regs */ 72 lea %sp@(-32),%sp /* space for 8 regs */
73 moveml %d1-%d5/%a0-%a2,%sp@ 73 moveml %d1-%d5/%a0-%a2,%sp@
74 movel sw_usp,%a0 /* get usp */ 74 movel sw_usp,%a0 /* get usp */
75 movel %a0@-,%sp@(PT_PC) /* copy exception program counter */ 75 movel %a0@-,%sp@(PT_OFF_PC) /* copy exception program counter */
76 movel %a0@-,%sp@(PT_FORMATVEC)/* copy exception format/vector/sr */ 76 movel %a0@-,%sp@(PT_OFF_FORMATVEC)/*copy exception format/vector/sr */
77 bra 7f 77 bra 7f
78 6: 78 6:
79 clrl %sp@- /* stkadj */ 79 clrl %sp@- /* stkadj */
@@ -89,8 +89,8 @@ LENOSYS = 38
89 bnes 8f /* no, skip */ 89 bnes 8f /* no, skip */
90 move #0x2700,%sr /* disable intrs */ 90 move #0x2700,%sr /* disable intrs */
91 movel sw_usp,%a0 /* get usp */ 91 movel sw_usp,%a0 /* get usp */
92 movel %sp@(PT_PC),%a0@- /* copy exception program counter */ 92 movel %sp@(PT_OFF_PC),%a0@- /* copy exception program counter */
93 movel %sp@(PT_FORMATVEC),%a0@-/* copy exception format/vector/sr */ 93 movel %sp@(PT_OFF_FORMATVEC),%a0@-/*copy exception format/vector/sr */
94 moveml %sp@,%d1-%d5/%a0-%a2 94 moveml %sp@,%d1-%d5/%a0-%a2
95 lea %sp@(32),%sp /* space for 8 regs */ 95 lea %sp@(32),%sp /* space for 8 regs */
96 movel %sp@+,%d0 96 movel %sp@+,%d0
diff --git a/arch/m68k/include/asm/math-emu.h b/arch/m68k/include/asm/math-emu.h
index ddfab96403cb..5e9249b0014c 100644
--- a/arch/m68k/include/asm/math-emu.h
+++ b/arch/m68k/include/asm/math-emu.h
@@ -145,16 +145,16 @@ extern unsigned int fp_debugprint;
145 * these are only used during instruction decoding 145 * these are only used during instruction decoding
146 * where we always know how deep we're on the stack. 146 * where we always know how deep we're on the stack.
147 */ 147 */
148#define FPS_DO (PT_D0) 148#define FPS_DO (PT_OFF_D0)
149#define FPS_D1 (PT_D1) 149#define FPS_D1 (PT_OFF_D1)
150#define FPS_D2 (PT_D2) 150#define FPS_D2 (PT_OFF_D2)
151#define FPS_A0 (PT_A0) 151#define FPS_A0 (PT_OFF_A0)
152#define FPS_A1 (PT_A1) 152#define FPS_A1 (PT_OFF_A1)
153#define FPS_A2 (PT_A2) 153#define FPS_A2 (PT_OFF_A2)
154#define FPS_SR (PT_SR) 154#define FPS_SR (PT_OFF_SR)
155#define FPS_PC (PT_PC) 155#define FPS_PC (PT_OFF_PC)
156#define FPS_EA (PT_PC+6) 156#define FPS_EA (PT_OFF_PC+6)
157#define FPS_PC2 (PT_PC+10) 157#define FPS_PC2 (PT_OFF_PC+10)
158 158
159.macro fp_get_fp_reg 159.macro fp_get_fp_reg
160 lea (FPD_FPREG,FPDATA,%d0.w*4),%a0 160 lea (FPD_FPREG,FPDATA,%d0.w*4),%a0
diff --git a/arch/m68k/include/asm/thread_info_mm.h b/arch/m68k/include/asm/thread_info_mm.h
index 6ea5c33b3c56..b6da3882be9b 100644
--- a/arch/m68k/include/asm/thread_info_mm.h
+++ b/arch/m68k/include/asm/thread_info_mm.h
@@ -1,6 +1,10 @@
1#ifndef _ASM_M68K_THREAD_INFO_H 1#ifndef _ASM_M68K_THREAD_INFO_H
2#define _ASM_M68K_THREAD_INFO_H 2#define _ASM_M68K_THREAD_INFO_H
3 3
4#ifndef ASM_OFFSETS_C
5#include <asm/asm-offsets.h>
6#endif
7#include <asm/current.h>
4#include <asm/types.h> 8#include <asm/types.h>
5#include <asm/page.h> 9#include <asm/page.h>
6 10
@@ -31,7 +35,12 @@ struct thread_info {
31#define init_thread_info (init_task.thread.info) 35#define init_thread_info (init_task.thread.info)
32#define init_stack (init_thread_union.stack) 36#define init_stack (init_thread_union.stack)
33 37
34#define task_thread_info(tsk) (&(tsk)->thread.info) 38#ifdef ASM_OFFSETS_C
39#define task_thread_info(tsk) ((struct thread_info *) NULL)
40#else
41#define task_thread_info(tsk) ((struct thread_info *)((char *)tsk+TASK_TINFO))
42#endif
43
35#define task_stack_page(tsk) ((tsk)->stack) 44#define task_stack_page(tsk) ((tsk)->stack)
36#define current_thread_info() task_thread_info(current) 45#define current_thread_info() task_thread_info(current)
37 46
diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c
index b1f012f6c493..73e5e581245b 100644
--- a/arch/m68k/kernel/asm-offsets.c
+++ b/arch/m68k/kernel/asm-offsets.c
@@ -8,6 +8,8 @@
8 * #defines from the assembly-language output. 8 * #defines from the assembly-language output.
9 */ 9 */
10 10
11#define ASM_OFFSETS_C
12
11#include <linux/stddef.h> 13#include <linux/stddef.h>
12#include <linux/sched.h> 14#include <linux/sched.h>
13#include <linux/kernel_stat.h> 15#include <linux/kernel_stat.h>
@@ -27,6 +29,9 @@ int main(void)
27 DEFINE(TASK_INFO, offsetof(struct task_struct, thread.info)); 29 DEFINE(TASK_INFO, offsetof(struct task_struct, thread.info));
28 DEFINE(TASK_MM, offsetof(struct task_struct, mm)); 30 DEFINE(TASK_MM, offsetof(struct task_struct, mm));
29 DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); 31 DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
32#ifdef CONFIG_MMU
33 DEFINE(TASK_TINFO, offsetof(struct task_struct, thread.info));
34#endif
30 35
31 /* offsets into the thread struct */ 36 /* offsets into the thread struct */
32 DEFINE(THREAD_KSP, offsetof(struct thread_struct, ksp)); 37 DEFINE(THREAD_KSP, offsetof(struct thread_struct, ksp));
@@ -44,20 +49,20 @@ int main(void)
44 DEFINE(TINFO_FLAGS, offsetof(struct thread_info, flags)); 49 DEFINE(TINFO_FLAGS, offsetof(struct thread_info, flags));
45 50
46 /* offsets into the pt_regs */ 51 /* offsets into the pt_regs */
47 DEFINE(PT_D0, offsetof(struct pt_regs, d0)); 52 DEFINE(PT_OFF_D0, offsetof(struct pt_regs, d0));
48 DEFINE(PT_ORIG_D0, offsetof(struct pt_regs, orig_d0)); 53 DEFINE(PT_OFF_ORIG_D0, offsetof(struct pt_regs, orig_d0));
49 DEFINE(PT_D1, offsetof(struct pt_regs, d1)); 54 DEFINE(PT_OFF_D1, offsetof(struct pt_regs, d1));
50 DEFINE(PT_D2, offsetof(struct pt_regs, d2)); 55 DEFINE(PT_OFF_D2, offsetof(struct pt_regs, d2));
51 DEFINE(PT_D3, offsetof(struct pt_regs, d3)); 56 DEFINE(PT_OFF_D3, offsetof(struct pt_regs, d3));
52 DEFINE(PT_D4, offsetof(struct pt_regs, d4)); 57 DEFINE(PT_OFF_D4, offsetof(struct pt_regs, d4));
53 DEFINE(PT_D5, offsetof(struct pt_regs, d5)); 58 DEFINE(PT_OFF_D5, offsetof(struct pt_regs, d5));
54 DEFINE(PT_A0, offsetof(struct pt_regs, a0)); 59 DEFINE(PT_OFF_A0, offsetof(struct pt_regs, a0));
55 DEFINE(PT_A1, offsetof(struct pt_regs, a1)); 60 DEFINE(PT_OFF_A1, offsetof(struct pt_regs, a1));
56 DEFINE(PT_A2, offsetof(struct pt_regs, a2)); 61 DEFINE(PT_OFF_A2, offsetof(struct pt_regs, a2));
57 DEFINE(PT_PC, offsetof(struct pt_regs, pc)); 62 DEFINE(PT_OFF_PC, offsetof(struct pt_regs, pc));
58 DEFINE(PT_SR, offsetof(struct pt_regs, sr)); 63 DEFINE(PT_OFF_SR, offsetof(struct pt_regs, sr));
59 /* bitfields are a bit difficult */ 64 /* bitfields are a bit difficult */
60 DEFINE(PT_VECTOR, offsetof(struct pt_regs, pc) + 4); 65 DEFINE(PT_OFF_FORMATVEC, offsetof(struct pt_regs, pc) + 4);
61 66
62 /* offsets into the irq_handler struct */ 67 /* offsets into the irq_handler struct */
63 DEFINE(IRQ_HANDLER, offsetof(struct irq_node, handler)); 68 DEFINE(IRQ_HANDLER, offsetof(struct irq_node, handler));
@@ -84,10 +89,10 @@ int main(void)
84 DEFINE(FONT_DESC_PREF, offsetof(struct font_desc, pref)); 89 DEFINE(FONT_DESC_PREF, offsetof(struct font_desc, pref));
85 90
86 /* signal defines */ 91 /* signal defines */
87 DEFINE(SIGSEGV, SIGSEGV); 92 DEFINE(LSIGSEGV, SIGSEGV);
88 DEFINE(SEGV_MAPERR, SEGV_MAPERR); 93 DEFINE(LSEGV_MAPERR, SEGV_MAPERR);
89 DEFINE(SIGTRAP, SIGTRAP); 94 DEFINE(LSIGTRAP, SIGTRAP);
90 DEFINE(TRAP_TRACE, TRAP_TRACE); 95 DEFINE(LTRAP_TRACE, TRAP_TRACE);
91 96
92 /* offsets into the custom struct */ 97 /* offsets into the custom struct */
93 DEFINE(CUSTOMBASE, &amiga_custom); 98 DEFINE(CUSTOMBASE, &amiga_custom);
diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S
index c3735cd6207e..922f52e7ed1a 100644
--- a/arch/m68k/kernel/entry.S
+++ b/arch/m68k/kernel/entry.S
@@ -77,17 +77,17 @@ ENTRY(ret_from_fork)
77 jra .Lret_from_exception 77 jra .Lret_from_exception
78 78
79do_trace_entry: 79do_trace_entry:
80 movel #-ENOSYS,%sp@(PT_D0) | needed for strace 80 movel #-ENOSYS,%sp@(PT_OFF_D0)| needed for strace
81 subql #4,%sp 81 subql #4,%sp
82 SAVE_SWITCH_STACK 82 SAVE_SWITCH_STACK
83 jbsr syscall_trace 83 jbsr syscall_trace
84 RESTORE_SWITCH_STACK 84 RESTORE_SWITCH_STACK
85 addql #4,%sp 85 addql #4,%sp
86 movel %sp@(PT_ORIG_D0),%d0 86 movel %sp@(PT_OFF_ORIG_D0),%d0
87 cmpl #NR_syscalls,%d0 87 cmpl #NR_syscalls,%d0
88 jcs syscall 88 jcs syscall
89badsys: 89badsys:
90 movel #-ENOSYS,%sp@(PT_D0) 90 movel #-ENOSYS,%sp@(PT_OFF_D0)
91 jra ret_from_syscall 91 jra ret_from_syscall
92 92
93do_trace_exit: 93do_trace_exit:
@@ -103,7 +103,7 @@ ENTRY(ret_from_signal)
103 addql #4,%sp 103 addql #4,%sp
104/* on 68040 complete pending writebacks if any */ 104/* on 68040 complete pending writebacks if any */
105#ifdef CONFIG_M68040 105#ifdef CONFIG_M68040
106 bfextu %sp@(PT_VECTOR){#0,#4},%d0 106 bfextu %sp@(PT_OFF_FORMATVEC){#0,#4},%d0
107 subql #7,%d0 | bus error frame ? 107 subql #7,%d0 | bus error frame ?
108 jbne 1f 108 jbne 1f
109 movel %sp,%sp@- 109 movel %sp,%sp@-
@@ -127,7 +127,7 @@ ENTRY(system_call)
127 jcc badsys 127 jcc badsys
128syscall: 128syscall:
129 jbsr @(sys_call_table,%d0:l:4)@(0) 129 jbsr @(sys_call_table,%d0:l:4)@(0)
130 movel %d0,%sp@(PT_D0) | save the return value 130 movel %d0,%sp@(PT_OFF_D0) | save the return value
131ret_from_syscall: 131ret_from_syscall:
132 |oriw #0x0700,%sr 132 |oriw #0x0700,%sr
133 movew %curptr@(TASK_INFO+TINFO_FLAGS+2),%d0 133 movew %curptr@(TASK_INFO+TINFO_FLAGS+2),%d0
@@ -135,7 +135,7 @@ ret_from_syscall:
1351: RESTORE_ALL 1351: RESTORE_ALL
136 136
137syscall_exit_work: 137syscall_exit_work:
138 btst #5,%sp@(PT_SR) | check if returning to kernel 138 btst #5,%sp@(PT_OFF_SR) | check if returning to kernel
139 bnes 1b | if so, skip resched, signals 139 bnes 1b | if so, skip resched, signals
140 lslw #1,%d0 140 lslw #1,%d0
141 jcs do_trace_exit 141 jcs do_trace_exit
@@ -148,7 +148,7 @@ syscall_exit_work:
148 148
149ENTRY(ret_from_exception) 149ENTRY(ret_from_exception)
150.Lret_from_exception: 150.Lret_from_exception:
151 btst #5,%sp@(PT_SR) | check if returning to kernel 151 btst #5,%sp@(PT_OFF_SR) | check if returning to kernel
152 bnes 1f | if so, skip resched, signals 152 bnes 1f | if so, skip resched, signals
153 | only allow interrupts when we are really the last one on the 153 | only allow interrupts when we are really the last one on the
154 | kernel stack, otherwise stack overflow can occur during 154 | kernel stack, otherwise stack overflow can occur during
@@ -182,7 +182,7 @@ do_signal_return:
182 jbra resume_userspace 182 jbra resume_userspace
183 183
184do_delayed_trace: 184do_delayed_trace:
185 bclr #7,%sp@(PT_SR) | clear trace bit in SR 185 bclr #7,%sp@(PT_OFF_SR) | clear trace bit in SR
186 pea 1 | send SIGTRAP 186 pea 1 | send SIGTRAP
187 movel %curptr,%sp@- 187 movel %curptr,%sp@-
188 pea LSIGTRAP 188 pea LSIGTRAP
@@ -199,7 +199,7 @@ ENTRY(auto_inthandler)
199 GET_CURRENT(%d0) 199 GET_CURRENT(%d0)
200 addqb #1,%curptr@(TASK_INFO+TINFO_PREEMPT+1) 200 addqb #1,%curptr@(TASK_INFO+TINFO_PREEMPT+1)
201 | put exception # in d0 201 | put exception # in d0
202 bfextu %sp@(PT_VECTOR){#4,#10},%d0 202 bfextu %sp@(PT_OFF_FORMATVEC){#4,#10},%d0
203 subw #VEC_SPUR,%d0 203 subw #VEC_SPUR,%d0
204 204
205 movel %sp,%sp@- 205 movel %sp,%sp@-
@@ -216,7 +216,7 @@ ret_from_interrupt:
216 ALIGN 216 ALIGN
217ret_from_last_interrupt: 217ret_from_last_interrupt:
218 moveq #(~ALLOWINT>>8)&0xff,%d0 218 moveq #(~ALLOWINT>>8)&0xff,%d0
219 andb %sp@(PT_SR),%d0 219 andb %sp@(PT_OFF_SR),%d0
220 jne 2b 220 jne 2b
221 221
222 /* check if we need to do software interrupts */ 222 /* check if we need to do software interrupts */
@@ -232,7 +232,7 @@ ENTRY(user_inthandler)
232 GET_CURRENT(%d0) 232 GET_CURRENT(%d0)
233 addqb #1,%curptr@(TASK_INFO+TINFO_PREEMPT+1) 233 addqb #1,%curptr@(TASK_INFO+TINFO_PREEMPT+1)
234 | put exception # in d0 234 | put exception # in d0
235 bfextu %sp@(PT_VECTOR){#4,#10},%d0 235 bfextu %sp@(PT_OFF_FORMATVEC){#4,#10},%d0
236user_irqvec_fixup = . + 2 236user_irqvec_fixup = . + 2
237 subw #VEC_USER,%d0 237 subw #VEC_USER,%d0
238 238
diff --git a/arch/m68k/math-emu/fp_entry.S b/arch/m68k/math-emu/fp_entry.S
index 954b4f304a7d..a3fe1f348dfe 100644
--- a/arch/m68k/math-emu/fp_entry.S
+++ b/arch/m68k/math-emu/fp_entry.S
@@ -85,8 +85,8 @@ fp_err_ua2:
85fp_err_ua1: 85fp_err_ua1:
86 addq.l #4,%sp 86 addq.l #4,%sp
87 move.l %a0,-(%sp) 87 move.l %a0,-(%sp)
88 pea SEGV_MAPERR 88 pea LSEGV_MAPERR
89 pea SIGSEGV 89 pea LSIGSEGV
90 jsr fpemu_signal 90 jsr fpemu_signal
91 add.w #12,%sp 91 add.w #12,%sp
92 jra ret_from_exception 92 jra ret_from_exception
@@ -96,8 +96,8 @@ fp_err_ua1:
96 | it does not really belong here, but... 96 | it does not really belong here, but...
97fp_sendtrace060: 97fp_sendtrace060:
98 move.l (FPS_PC,%sp),-(%sp) 98 move.l (FPS_PC,%sp),-(%sp)
99 pea TRAP_TRACE 99 pea LTRAP_TRACE
100 pea SIGTRAP 100 pea LSIGTRAP
101 jsr fpemu_signal 101 jsr fpemu_signal
102 add.w #12,%sp 102 add.w #12,%sp
103 jra ret_from_exception 103 jra ret_from_exception
@@ -122,17 +122,17 @@ fp_get_data_reg:
122 .long fp_get_d6, fp_get_d7 122 .long fp_get_d6, fp_get_d7
123 123
124fp_get_d0: 124fp_get_d0:
125 move.l (PT_D0+8,%sp),%d0 125 move.l (PT_OFF_D0+8,%sp),%d0
126 printf PREGISTER,"{d0->%08x}",1,%d0 126 printf PREGISTER,"{d0->%08x}",1,%d0
127 rts 127 rts
128 128
129fp_get_d1: 129fp_get_d1:
130 move.l (PT_D1+8,%sp),%d0 130 move.l (PT_OFF_D1+8,%sp),%d0
131 printf PREGISTER,"{d1->%08x}",1,%d0 131 printf PREGISTER,"{d1->%08x}",1,%d0
132 rts 132 rts
133 133
134fp_get_d2: 134fp_get_d2:
135 move.l (PT_D2+8,%sp),%d0 135 move.l (PT_OFF_D2+8,%sp),%d0
136 printf PREGISTER,"{d2->%08x}",1,%d0 136 printf PREGISTER,"{d2->%08x}",1,%d0
137 rts 137 rts
138 138
@@ -173,35 +173,35 @@ fp_put_data_reg:
173 173
174fp_put_d0: 174fp_put_d0:
175 printf PREGISTER,"{d0<-%08x}",1,%d0 175 printf PREGISTER,"{d0<-%08x}",1,%d0
176 move.l %d0,(PT_D0+8,%sp) 176 move.l %d0,(PT_OFF_D0+8,%sp)
177 rts 177 rts
178 178
179fp_put_d1: 179fp_put_d1:
180 printf PREGISTER,"{d1<-%08x}",1,%d0 180 printf PREGISTER,"{d1<-%08x}",1,%d0
181 move.l %d0,(PT_D1+8,%sp) 181 move.l %d0,(PT_OFF_D1+8,%sp)
182 rts 182 rts
183 183
184fp_put_d2: 184fp_put_d2:
185 printf PREGISTER,"{d2<-%08x}",1,%d0 185 printf PREGISTER,"{d2<-%08x}",1,%d0
186 move.l %d0,(PT_D2+8,%sp) 186 move.l %d0,(PT_OFF_D2+8,%sp)
187 rts 187 rts
188 188
189fp_put_d3: 189fp_put_d3:
190 printf PREGISTER,"{d3<-%08x}",1,%d0 190 printf PREGISTER,"{d3<-%08x}",1,%d0
191| move.l %d0,%d3 191| move.l %d0,%d3
192 move.l %d0,(PT_D3+8,%sp) 192 move.l %d0,(PT_OFF_D3+8,%sp)
193 rts 193 rts
194 194
195fp_put_d4: 195fp_put_d4:
196 printf PREGISTER,"{d4<-%08x}",1,%d0 196 printf PREGISTER,"{d4<-%08x}",1,%d0
197| move.l %d0,%d4 197| move.l %d0,%d4
198 move.l %d0,(PT_D4+8,%sp) 198 move.l %d0,(PT_OFF_D4+8,%sp)
199 rts 199 rts
200 200
201fp_put_d5: 201fp_put_d5:
202 printf PREGISTER,"{d5<-%08x}",1,%d0 202 printf PREGISTER,"{d5<-%08x}",1,%d0
203| move.l %d0,%d5 203| move.l %d0,%d5
204 move.l %d0,(PT_D5+8,%sp) 204 move.l %d0,(PT_OFF_D5+8,%sp)
205 rts 205 rts
206 206
207fp_put_d6: 207fp_put_d6:
@@ -225,17 +225,17 @@ fp_get_addr_reg:
225 .long fp_get_a6, fp_get_a7 225 .long fp_get_a6, fp_get_a7
226 226
227fp_get_a0: 227fp_get_a0:
228 move.l (PT_A0+8,%sp),%a0 228 move.l (PT_OFF_A0+8,%sp),%a0
229 printf PREGISTER,"{a0->%08x}",1,%a0 229 printf PREGISTER,"{a0->%08x}",1,%a0
230 rts 230 rts
231 231
232fp_get_a1: 232fp_get_a1:
233 move.l (PT_A1+8,%sp),%a0 233 move.l (PT_OFF_A1+8,%sp),%a0
234 printf PREGISTER,"{a1->%08x}",1,%a0 234 printf PREGISTER,"{a1->%08x}",1,%a0
235 rts 235 rts
236 236
237fp_get_a2: 237fp_get_a2:
238 move.l (PT_A2+8,%sp),%a0 238 move.l (PT_OFF_A2+8,%sp),%a0
239 printf PREGISTER,"{a2->%08x}",1,%a0 239 printf PREGISTER,"{a2->%08x}",1,%a0
240 rts 240 rts
241 241
@@ -276,17 +276,17 @@ fp_put_addr_reg:
276 276
277fp_put_a0: 277fp_put_a0:
278 printf PREGISTER,"{a0<-%08x}",1,%a0 278 printf PREGISTER,"{a0<-%08x}",1,%a0
279 move.l %a0,(PT_A0+8,%sp) 279 move.l %a0,(PT_OFF_A0+8,%sp)
280 rts 280 rts
281 281
282fp_put_a1: 282fp_put_a1:
283 printf PREGISTER,"{a1<-%08x}",1,%a0 283 printf PREGISTER,"{a1<-%08x}",1,%a0
284 move.l %a0,(PT_A1+8,%sp) 284 move.l %a0,(PT_OFF_A1+8,%sp)
285 rts 285 rts
286 286
287fp_put_a2: 287fp_put_a2:
288 printf PREGISTER,"{a2<-%08x}",1,%a0 288 printf PREGISTER,"{a2<-%08x}",1,%a0
289 move.l %a0,(PT_A2+8,%sp) 289 move.l %a0,(PT_OFF_A2+8,%sp)
290 rts 290 rts
291 291
292fp_put_a3: 292fp_put_a3:
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index b44aaabdd1a6..0c34371ec49c 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -424,6 +424,29 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
424#endif 424#endif
425} 425}
426 426
427static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
428{
429 struct dma_mapping_ops *ops = get_dma_ops(dev);
430
431 if (ops->addr_needs_map && ops->addr_needs_map(dev, addr, size))
432 return 0;
433
434 if (!dev->dma_mask)
435 return 0;
436
437 return addr + size <= *dev->dma_mask;
438}
439
440static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
441{
442 return paddr + get_dma_direct_offset(dev);
443}
444
445static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
446{
447 return daddr - get_dma_direct_offset(dev);
448}
449
427#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) 450#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
428#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) 451#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
429#ifdef CONFIG_NOT_COHERENT_CACHE 452#ifdef CONFIG_NOT_COHERENT_CACHE
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index eb17da781128..2a5da069714e 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -104,8 +104,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
104 else 104 else
105 pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte)); 105 pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte));
106 106
107#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP) 107#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
108 /* Second case is 32-bit with 64-bit PTE in SMP mode. In this case, we 108 /* Second case is 32-bit with 64-bit PTE. In this case, we
109 * can just store as long as we do the two halves in the right order 109 * can just store as long as we do the two halves in the right order
110 * with a barrier in between. This is possible because we take care, 110 * with a barrier in between. This is possible because we take care,
111 * in the hash code, to pre-invalidate if the PTE was already hashed, 111 * in the hash code, to pre-invalidate if the PTE was already hashed,
@@ -140,7 +140,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
140 140
141#else 141#else
142 /* Anything else just stores the PTE normally. That covers all 64-bit 142 /* Anything else just stores the PTE normally. That covers all 64-bit
143 * cases, and 32-bit non-hash with 64-bit PTEs in UP mode 143 * cases, and 32-bit non-hash with 32-bit PTEs.
144 */ 144 */
145 *ptep = pte; 145 *ptep = pte;
146#endif 146#endif
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index c3b193121f81..198266cf9e2d 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -54,7 +54,7 @@
54 * This returns the old value in the lock, so we succeeded 54 * This returns the old value in the lock, so we succeeded
55 * in getting the lock if the return value is 0. 55 * in getting the lock if the return value is 0.
56 */ 56 */
57static inline unsigned long __spin_trylock(raw_spinlock_t *lock) 57static inline unsigned long arch_spin_trylock(raw_spinlock_t *lock)
58{ 58{
59 unsigned long tmp, token; 59 unsigned long tmp, token;
60 60
@@ -76,7 +76,7 @@ static inline unsigned long __spin_trylock(raw_spinlock_t *lock)
76static inline int __raw_spin_trylock(raw_spinlock_t *lock) 76static inline int __raw_spin_trylock(raw_spinlock_t *lock)
77{ 77{
78 CLEAR_IO_SYNC; 78 CLEAR_IO_SYNC;
79 return __spin_trylock(lock) == 0; 79 return arch_spin_trylock(lock) == 0;
80} 80}
81 81
82/* 82/*
@@ -108,7 +108,7 @@ static inline void __raw_spin_lock(raw_spinlock_t *lock)
108{ 108{
109 CLEAR_IO_SYNC; 109 CLEAR_IO_SYNC;
110 while (1) { 110 while (1) {
111 if (likely(__spin_trylock(lock) == 0)) 111 if (likely(arch_spin_trylock(lock) == 0))
112 break; 112 break;
113 do { 113 do {
114 HMT_low(); 114 HMT_low();
@@ -126,7 +126,7 @@ void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
126 126
127 CLEAR_IO_SYNC; 127 CLEAR_IO_SYNC;
128 while (1) { 128 while (1) {
129 if (likely(__spin_trylock(lock) == 0)) 129 if (likely(arch_spin_trylock(lock) == 0))
130 break; 130 break;
131 local_save_flags(flags_dis); 131 local_save_flags(flags_dis);
132 local_irq_restore(flags); 132 local_irq_restore(flags);
@@ -181,7 +181,7 @@ extern void __raw_spin_unlock_wait(raw_spinlock_t *lock);
181 * This returns the old value in the lock + 1, 181 * This returns the old value in the lock + 1,
182 * so we got a read lock if the return value is > 0. 182 * so we got a read lock if the return value is > 0.
183 */ 183 */
184static inline long __read_trylock(raw_rwlock_t *rw) 184static inline long arch_read_trylock(raw_rwlock_t *rw)
185{ 185{
186 long tmp; 186 long tmp;
187 187
@@ -205,7 +205,7 @@ static inline long __read_trylock(raw_rwlock_t *rw)
205 * This returns the old value in the lock, 205 * This returns the old value in the lock,
206 * so we got the write lock if the return value is 0. 206 * so we got the write lock if the return value is 0.
207 */ 207 */
208static inline long __write_trylock(raw_rwlock_t *rw) 208static inline long arch_write_trylock(raw_rwlock_t *rw)
209{ 209{
210 long tmp, token; 210 long tmp, token;
211 211
@@ -228,7 +228,7 @@ static inline long __write_trylock(raw_rwlock_t *rw)
228static inline void __raw_read_lock(raw_rwlock_t *rw) 228static inline void __raw_read_lock(raw_rwlock_t *rw)
229{ 229{
230 while (1) { 230 while (1) {
231 if (likely(__read_trylock(rw) > 0)) 231 if (likely(arch_read_trylock(rw) > 0))
232 break; 232 break;
233 do { 233 do {
234 HMT_low(); 234 HMT_low();
@@ -242,7 +242,7 @@ static inline void __raw_read_lock(raw_rwlock_t *rw)
242static inline void __raw_write_lock(raw_rwlock_t *rw) 242static inline void __raw_write_lock(raw_rwlock_t *rw)
243{ 243{
244 while (1) { 244 while (1) {
245 if (likely(__write_trylock(rw) == 0)) 245 if (likely(arch_write_trylock(rw) == 0))
246 break; 246 break;
247 do { 247 do {
248 HMT_low(); 248 HMT_low();
@@ -255,12 +255,12 @@ static inline void __raw_write_lock(raw_rwlock_t *rw)
255 255
256static inline int __raw_read_trylock(raw_rwlock_t *rw) 256static inline int __raw_read_trylock(raw_rwlock_t *rw)
257{ 257{
258 return __read_trylock(rw) > 0; 258 return arch_read_trylock(rw) > 0;
259} 259}
260 260
261static inline int __raw_write_trylock(raw_rwlock_t *rw) 261static inline int __raw_write_trylock(raw_rwlock_t *rw)
262{ 262{
263 return __write_trylock(rw) == 0; 263 return arch_write_trylock(rw) == 0;
264} 264}
265 265
266static inline void __raw_read_unlock(raw_rwlock_t *rw) 266static inline void __raw_read_unlock(raw_rwlock_t *rw)
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b73396b93905..9619285f64e8 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o
97 97
98obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 98obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
99obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o 99obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
100obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o 100obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o perf_callchain.o
101obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ 101obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \
102 power5+-pmu.o power6-pmu.o power7-pmu.o 102 power5+-pmu.o power6-pmu.o power7-pmu.o
103obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o 103obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 561b64652311..197b15646eeb 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -67,6 +67,8 @@ int main(void)
67 DEFINE(MMCONTEXTID, offsetof(struct mm_struct, context.id)); 67 DEFINE(MMCONTEXTID, offsetof(struct mm_struct, context.id));
68#ifdef CONFIG_PPC64 68#ifdef CONFIG_PPC64
69 DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context)); 69 DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context));
70 DEFINE(SIGSEGV, SIGSEGV);
71 DEFINE(NMI_MASK, NMI_MASK);
70#else 72#else
71 DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); 73 DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
72#endif /* CONFIG_PPC64 */ 74#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index 68ccf11e4f19..e8a57de85bcf 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -24,50 +24,12 @@
24int swiotlb __read_mostly; 24int swiotlb __read_mostly;
25unsigned int ppc_swiotlb_enable; 25unsigned int ppc_swiotlb_enable;
26 26
27void *swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t addr)
28{
29 unsigned long pfn = PFN_DOWN(swiotlb_bus_to_phys(hwdev, addr));
30 void *pageaddr = page_address(pfn_to_page(pfn));
31
32 if (pageaddr != NULL)
33 return pageaddr + (addr % PAGE_SIZE);
34 return NULL;
35}
36
37dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
38{
39 return paddr + get_dma_direct_offset(hwdev);
40}
41
42phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
43
44{
45 return baddr - get_dma_direct_offset(hwdev);
46}
47
48/*
49 * Determine if an address needs bounce buffering via swiotlb.
50 * Going forward I expect the swiotlb code to generalize on using
51 * a dma_ops->addr_needs_map, and this function will move from here to the
52 * generic swiotlb code.
53 */
54int
55swiotlb_arch_address_needs_mapping(struct device *hwdev, dma_addr_t addr,
56 size_t size)
57{
58 struct dma_mapping_ops *dma_ops = get_dma_ops(hwdev);
59
60 BUG_ON(!dma_ops);
61 return dma_ops->addr_needs_map(hwdev, addr, size);
62}
63
64/* 27/*
65 * Determine if an address is reachable by a pci device, or if we must bounce. 28 * Determine if an address is reachable by a pci device, or if we must bounce.
66 */ 29 */
67static int 30static int
68swiotlb_pci_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size) 31swiotlb_pci_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size)
69{ 32{
70 u64 mask = dma_get_mask(hwdev);
71 dma_addr_t max; 33 dma_addr_t max;
72 struct pci_controller *hose; 34 struct pci_controller *hose;
73 struct pci_dev *pdev = to_pci_dev(hwdev); 35 struct pci_dev *pdev = to_pci_dev(hwdev);
@@ -79,16 +41,9 @@ swiotlb_pci_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size)
79 if ((addr + size > max) | (addr < hose->dma_window_base_cur)) 41 if ((addr + size > max) | (addr < hose->dma_window_base_cur))
80 return 1; 42 return 1;
81 43
82 return !is_buffer_dma_capable(mask, addr, size); 44 return 0;
83}
84
85static int
86swiotlb_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size)
87{
88 return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
89} 45}
90 46
91
92/* 47/*
93 * At the moment, all platforms that use this code only require 48 * At the moment, all platforms that use this code only require
94 * swiotlb to be used if we're operating on HIGHMEM. Since 49 * swiotlb to be used if we're operating on HIGHMEM. Since
@@ -104,7 +59,6 @@ struct dma_mapping_ops swiotlb_dma_ops = {
104 .dma_supported = swiotlb_dma_supported, 59 .dma_supported = swiotlb_dma_supported,
105 .map_page = swiotlb_map_page, 60 .map_page = swiotlb_map_page,
106 .unmap_page = swiotlb_unmap_page, 61 .unmap_page = swiotlb_unmap_page,
107 .addr_needs_map = swiotlb_addr_needs_map,
108 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, 62 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
109 .sync_single_range_for_device = swiotlb_sync_single_range_for_device, 63 .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
110 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, 64 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index eb898112e577..8ac85e08ffae 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -729,6 +729,11 @@ BEGIN_FTR_SECTION
729 bne- do_ste_alloc /* If so handle it */ 729 bne- do_ste_alloc /* If so handle it */
730END_FTR_SECTION_IFCLR(CPU_FTR_SLB) 730END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
731 731
732 clrrdi r11,r1,THREAD_SHIFT
733 lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */
734 andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */
735 bne 77f /* then don't call hash_page now */
736
732 /* 737 /*
733 * On iSeries, we soft-disable interrupts here, then 738 * On iSeries, we soft-disable interrupts here, then
734 * hard-enable interrupts so that the hash_page code can spin on 739 * hard-enable interrupts so that the hash_page code can spin on
@@ -833,6 +838,20 @@ handle_page_fault:
833 bl .low_hash_fault 838 bl .low_hash_fault
834 b .ret_from_except 839 b .ret_from_except
835 840
841/*
842 * We come here as a result of a DSI at a point where we don't want
843 * to call hash_page, such as when we are accessing memory (possibly
844 * user memory) inside a PMU interrupt that occurred while interrupts
845 * were soft-disabled. We want to invoke the exception handler for
846 * the access, or panic if there isn't a handler.
847 */
84877: bl .save_nvgprs
849 mr r4,r3
850 addi r3,r1,STACK_FRAME_OVERHEAD
851 li r5,SIGSEGV
852 bl .bad_page_fault
853 b .ret_from_except
854
836 /* here we have a segment miss */ 855 /* here we have a segment miss */
837do_ste_alloc: 856do_ste_alloc:
838 bl .ste_allocate /* try to insert stab entry */ 857 bl .ste_allocate /* try to insert stab entry */
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
new file mode 100644
index 000000000000..f74b62c67511
--- /dev/null
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -0,0 +1,527 @@
1/*
2 * Performance counter callchain support - powerpc architecture code
3 *
4 * Copyright © 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/sched.h>
13#include <linux/perf_counter.h>
14#include <linux/percpu.h>
15#include <linux/uaccess.h>
16#include <linux/mm.h>
17#include <asm/ptrace.h>
18#include <asm/pgtable.h>
19#include <asm/sigcontext.h>
20#include <asm/ucontext.h>
21#include <asm/vdso.h>
22#ifdef CONFIG_PPC64
23#include "ppc32.h"
24#endif
25
26/*
27 * Store another value in a callchain_entry.
28 */
29static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
30{
31 unsigned int nr = entry->nr;
32
33 if (nr < PERF_MAX_STACK_DEPTH) {
34 entry->ip[nr] = ip;
35 entry->nr = nr + 1;
36 }
37}
38
39/*
40 * Is sp valid as the address of the next kernel stack frame after prev_sp?
41 * The next frame may be in a different stack area but should not go
42 * back down in the same stack area.
43 */
44static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
45{
46 if (sp & 0xf)
47 return 0; /* must be 16-byte aligned */
48 if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
49 return 0;
50 if (sp >= prev_sp + STACK_FRAME_OVERHEAD)
51 return 1;
52 /*
53 * sp could decrease when we jump off an interrupt stack
54 * back to the regular process stack.
55 */
56 if ((sp & ~(THREAD_SIZE - 1)) != (prev_sp & ~(THREAD_SIZE - 1)))
57 return 1;
58 return 0;
59}
60
61static void perf_callchain_kernel(struct pt_regs *regs,
62 struct perf_callchain_entry *entry)
63{
64 unsigned long sp, next_sp;
65 unsigned long next_ip;
66 unsigned long lr;
67 long level = 0;
68 unsigned long *fp;
69
70 lr = regs->link;
71 sp = regs->gpr[1];
72 callchain_store(entry, PERF_CONTEXT_KERNEL);
73 callchain_store(entry, regs->nip);
74
75 if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
76 return;
77
78 for (;;) {
79 fp = (unsigned long *) sp;
80 next_sp = fp[0];
81
82 if (next_sp == sp + STACK_INT_FRAME_SIZE &&
83 fp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
84 /*
85 * This looks like an interrupt frame for an
86 * interrupt that occurred in the kernel
87 */
88 regs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD);
89 next_ip = regs->nip;
90 lr = regs->link;
91 level = 0;
92 callchain_store(entry, PERF_CONTEXT_KERNEL);
93
94 } else {
95 if (level == 0)
96 next_ip = lr;
97 else
98 next_ip = fp[STACK_FRAME_LR_SAVE];
99
100 /*
101 * We can't tell which of the first two addresses
102 * we get are valid, but we can filter out the
103 * obviously bogus ones here. We replace them
104 * with 0 rather than removing them entirely so
105 * that userspace can tell which is which.
106 */
107 if ((level == 1 && next_ip == lr) ||
108 (level <= 1 && !kernel_text_address(next_ip)))
109 next_ip = 0;
110
111 ++level;
112 }
113
114 callchain_store(entry, next_ip);
115 if (!valid_next_sp(next_sp, sp))
116 return;
117 sp = next_sp;
118 }
119}
120
121#ifdef CONFIG_PPC64
122
123#ifdef CONFIG_HUGETLB_PAGE
124#define is_huge_psize(pagesize) (HPAGE_SHIFT && mmu_huge_psizes[pagesize])
125#else
126#define is_huge_psize(pagesize) 0
127#endif
128
129/*
130 * On 64-bit we don't want to invoke hash_page on user addresses from
131 * interrupt context, so if the access faults, we read the page tables
132 * to find which page (if any) is mapped and access it directly.
133 */
134static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
135{
136 pgd_t *pgdir;
137 pte_t *ptep, pte;
138 int pagesize;
139 unsigned long addr = (unsigned long) ptr;
140 unsigned long offset;
141 unsigned long pfn;
142 void *kaddr;
143
144 pgdir = current->mm->pgd;
145 if (!pgdir)
146 return -EFAULT;
147
148 pagesize = get_slice_psize(current->mm, addr);
149
150 /* align address to page boundary */
151 offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1);
152 addr -= offset;
153
154 if (is_huge_psize(pagesize))
155 ptep = huge_pte_offset(current->mm, addr);
156 else
157 ptep = find_linux_pte(pgdir, addr);
158
159 if (ptep == NULL)
160 return -EFAULT;
161 pte = *ptep;
162 if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
163 return -EFAULT;
164 pfn = pte_pfn(pte);
165 if (!page_is_ram(pfn))
166 return -EFAULT;
167
168 /* no highmem to worry about here */
169 kaddr = pfn_to_kaddr(pfn);
170 memcpy(ret, kaddr + offset, nb);
171 return 0;
172}
173
174static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
175{
176 if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
177 ((unsigned long)ptr & 7))
178 return -EFAULT;
179
180 if (!__get_user_inatomic(*ret, ptr))
181 return 0;
182
183 return read_user_stack_slow(ptr, ret, 8);
184}
185
186static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
187{
188 if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
189 ((unsigned long)ptr & 3))
190 return -EFAULT;
191
192 if (!__get_user_inatomic(*ret, ptr))
193 return 0;
194
195 return read_user_stack_slow(ptr, ret, 4);
196}
197
198static inline int valid_user_sp(unsigned long sp, int is_64)
199{
200 if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32)
201 return 0;
202 return 1;
203}
204
205/*
206 * 64-bit user processes use the same stack frame for RT and non-RT signals.
207 */
208struct signal_frame_64 {
209 char dummy[__SIGNAL_FRAMESIZE];
210 struct ucontext uc;
211 unsigned long unused[2];
212 unsigned int tramp[6];
213 struct siginfo *pinfo;
214 void *puc;
215 struct siginfo info;
216 char abigap[288];
217};
218
219static int is_sigreturn_64_address(unsigned long nip, unsigned long fp)
220{
221 if (nip == fp + offsetof(struct signal_frame_64, tramp))
222 return 1;
223 if (vdso64_rt_sigtramp && current->mm->context.vdso_base &&
224 nip == current->mm->context.vdso_base + vdso64_rt_sigtramp)
225 return 1;
226 return 0;
227}
228
229/*
230 * Do some sanity checking on the signal frame pointed to by sp.
231 * We check the pinfo and puc pointers in the frame.
232 */
233static int sane_signal_64_frame(unsigned long sp)
234{
235 struct signal_frame_64 __user *sf;
236 unsigned long pinfo, puc;
237
238 sf = (struct signal_frame_64 __user *) sp;
239 if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) ||
240 read_user_stack_64((unsigned long __user *) &sf->puc, &puc))
241 return 0;
242 return pinfo == (unsigned long) &sf->info &&
243 puc == (unsigned long) &sf->uc;
244}
245
246static void perf_callchain_user_64(struct pt_regs *regs,
247 struct perf_callchain_entry *entry)
248{
249 unsigned long sp, next_sp;
250 unsigned long next_ip;
251 unsigned long lr;
252 long level = 0;
253 struct signal_frame_64 __user *sigframe;
254 unsigned long __user *fp, *uregs;
255
256 next_ip = regs->nip;
257 lr = regs->link;
258 sp = regs->gpr[1];
259 callchain_store(entry, PERF_CONTEXT_USER);
260 callchain_store(entry, next_ip);
261
262 for (;;) {
263 fp = (unsigned long __user *) sp;
264 if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
265 return;
266 if (level > 0 && read_user_stack_64(&fp[2], &next_ip))
267 return;
268
269 /*
270 * Note: the next_sp - sp >= signal frame size check
271 * is true when next_sp < sp, which can happen when
272 * transitioning from an alternate signal stack to the
273 * normal stack.
274 */
275 if (next_sp - sp >= sizeof(struct signal_frame_64) &&
276 (is_sigreturn_64_address(next_ip, sp) ||
277 (level <= 1 && is_sigreturn_64_address(lr, sp))) &&
278 sane_signal_64_frame(sp)) {
279 /*
280			 * This looks like a signal frame
281 */
282 sigframe = (struct signal_frame_64 __user *) sp;
283 uregs = sigframe->uc.uc_mcontext.gp_regs;
284 if (read_user_stack_64(&uregs[PT_NIP], &next_ip) ||
285 read_user_stack_64(&uregs[PT_LNK], &lr) ||
286 read_user_stack_64(&uregs[PT_R1], &sp))
287 return;
288 level = 0;
289 callchain_store(entry, PERF_CONTEXT_USER);
290 callchain_store(entry, next_ip);
291 continue;
292 }
293
294 if (level == 0)
295 next_ip = lr;
296 callchain_store(entry, next_ip);
297 ++level;
298 sp = next_sp;
299 }
300}
301
302static inline int current_is_64bit(void)
303{
304 /*
305 * We can't use test_thread_flag() here because we may be on an
306 * interrupt stack, and the thread flags don't get copied over
307 * from the thread_info on the main stack to the interrupt stack.
308 */
309 return !test_ti_thread_flag(task_thread_info(current), TIF_32BIT);
310}
311
312#else /* CONFIG_PPC64 */
313/*
314 * On 32-bit we just access the address and let hash_page create an
315 * HPTE if necessary, so there is no need to fall back to reading
316 * the page tables. Since this is called at interrupt level,
317 * do_page_fault() won't treat a DSI as a page fault.
318 */
319static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
320{
321 if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
322 ((unsigned long)ptr & 3))
323 return -EFAULT;
324
325 return __get_user_inatomic(*ret, ptr);
326}
327
328static inline void perf_callchain_user_64(struct pt_regs *regs,
329 struct perf_callchain_entry *entry)
330{
331}
332
333static inline int current_is_64bit(void)
334{
335 return 0;
336}
337
338static inline int valid_user_sp(unsigned long sp, int is_64)
339{
340 if (!sp || (sp & 7) || sp > TASK_SIZE - 32)
341 return 0;
342 return 1;
343}
344
345#define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE
346#define sigcontext32 sigcontext
347#define mcontext32 mcontext
348#define ucontext32 ucontext
349#define compat_siginfo_t struct siginfo
350
351#endif /* CONFIG_PPC64 */
352
353/*
354 * Layout for non-RT signal frames
355 */
356struct signal_frame_32 {
357 char dummy[__SIGNAL_FRAMESIZE32];
358 struct sigcontext32 sctx;
359 struct mcontext32 mctx;
360 int abigap[56];
361};
362
363/*
364 * Layout for RT signal frames
365 */
366struct rt_signal_frame_32 {
367 char dummy[__SIGNAL_FRAMESIZE32 + 16];
368 compat_siginfo_t info;
369 struct ucontext32 uc;
370 int abigap[56];
371};
372
373static int is_sigreturn_32_address(unsigned int nip, unsigned int fp)
374{
375 if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad))
376 return 1;
377 if (vdso32_sigtramp && current->mm->context.vdso_base &&
378 nip == current->mm->context.vdso_base + vdso32_sigtramp)
379 return 1;
380 return 0;
381}
382
383static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp)
384{
385 if (nip == fp + offsetof(struct rt_signal_frame_32,
386 uc.uc_mcontext.mc_pad))
387 return 1;
388 if (vdso32_rt_sigtramp && current->mm->context.vdso_base &&
389 nip == current->mm->context.vdso_base + vdso32_rt_sigtramp)
390 return 1;
391 return 0;
392}
393
394static int sane_signal_32_frame(unsigned int sp)
395{
396 struct signal_frame_32 __user *sf;
397 unsigned int regs;
398
399 sf = (struct signal_frame_32 __user *) (unsigned long) sp;
400 if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, &regs))
401 return 0;
402 return regs == (unsigned long) &sf->mctx;
403}
404
405static int sane_rt_signal_32_frame(unsigned int sp)
406{
407 struct rt_signal_frame_32 __user *sf;
408 unsigned int regs;
409
410 sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
411 if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, &regs))
412 return 0;
413 return regs == (unsigned long) &sf->uc.uc_mcontext;
414}
415
416static unsigned int __user *signal_frame_32_regs(unsigned int sp,
417 unsigned int next_sp, unsigned int next_ip)
418{
419 struct mcontext32 __user *mctx = NULL;
420 struct signal_frame_32 __user *sf;
421 struct rt_signal_frame_32 __user *rt_sf;
422
423 /*
424 * Note: the next_sp - sp >= signal frame size check
425 * is true when next_sp < sp, for example, when
426 * transitioning from an alternate signal stack to the
427 * normal stack.
428 */
429 if (next_sp - sp >= sizeof(struct signal_frame_32) &&
430 is_sigreturn_32_address(next_ip, sp) &&
431 sane_signal_32_frame(sp)) {
432 sf = (struct signal_frame_32 __user *) (unsigned long) sp;
433 mctx = &sf->mctx;
434 }
435
436 if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) &&
437 is_rt_sigreturn_32_address(next_ip, sp) &&
438 sane_rt_signal_32_frame(sp)) {
439 rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
440 mctx = &rt_sf->uc.uc_mcontext;
441 }
442
443 if (!mctx)
444 return NULL;
445 return mctx->mc_gregs;
446}
447
448static void perf_callchain_user_32(struct pt_regs *regs,
449 struct perf_callchain_entry *entry)
450{
451 unsigned int sp, next_sp;
452 unsigned int next_ip;
453 unsigned int lr;
454 long level = 0;
455 unsigned int __user *fp, *uregs;
456
457 next_ip = regs->nip;
458 lr = regs->link;
459 sp = regs->gpr[1];
460 callchain_store(entry, PERF_CONTEXT_USER);
461 callchain_store(entry, next_ip);
462
463 while (entry->nr < PERF_MAX_STACK_DEPTH) {
464 fp = (unsigned int __user *) (unsigned long) sp;
465 if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
466 return;
467 if (level > 0 && read_user_stack_32(&fp[1], &next_ip))
468 return;
469
470 uregs = signal_frame_32_regs(sp, next_sp, next_ip);
471 if (!uregs && level <= 1)
472 uregs = signal_frame_32_regs(sp, next_sp, lr);
473 if (uregs) {
474 /*
475			 * This looks like a signal frame, so restart
476 * the stack trace with the values in it.
477 */
478 if (read_user_stack_32(&uregs[PT_NIP], &next_ip) ||
479 read_user_stack_32(&uregs[PT_LNK], &lr) ||
480 read_user_stack_32(&uregs[PT_R1], &sp))
481 return;
482 level = 0;
483 callchain_store(entry, PERF_CONTEXT_USER);
484 callchain_store(entry, next_ip);
485 continue;
486 }
487
488 if (level == 0)
489 next_ip = lr;
490 callchain_store(entry, next_ip);
491 ++level;
492 sp = next_sp;
493 }
494}
495
496/*
497 * Since we can't get PMU interrupts inside a PMU interrupt handler,
498 * we don't need separate irq and nmi entries here.
499 */
500static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
501
502struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
503{
504 struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
505
506 entry->nr = 0;
507
508 if (current->pid == 0) /* idle task? */
509 return entry;
510
511 if (!user_mode(regs)) {
512 perf_callchain_kernel(regs, entry);
513 if (current->mm)
514 regs = task_pt_regs(current);
515 else
516 regs = NULL;
517 }
518
519 if (regs) {
520 if (current_is_64bit())
521 perf_callchain_user_64(regs, entry);
522 else
523 perf_callchain_user_32(regs, entry);
524 }
525
526 return entry;
527}
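
Note on the unwinders above: they read raw words at fixed offsets from each frame pointer. Word 0 is the ABI back chain (the caller's stack pointer, used as next_sp) and the saved return address sits at word 2 on 64-bit or word 1 on 32-bit (used as next_ip). A minimal sketch of that layout, with struct and field names invented here purely for illustration (the kernel reads the raw words rather than declaring such structs):

/* Illustrative only; these types are not part of the patch. */
struct ppc64_user_frame {
	unsigned long back_chain;	/* fp[0]: caller's SP, becomes next_sp */
	unsigned long cr_save;		/* fp[1]: saved condition register */
	unsigned long lr_save;		/* fp[2]: saved return address, becomes next_ip */
};

struct ppc32_user_frame {
	unsigned int back_chain;	/* fp[0]: caller's SP */
	unsigned int lr_save;		/* fp[1]: saved return address */
};

Level 0 is special-cased to use regs->link because the innermost function may not have saved LR into its frame yet.
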
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 5b7038f248b6..a685652effeb 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -92,15 +92,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
92 : "memory" ); 92 : "memory" );
93} 93}
94 94
95void slb_flush_and_rebolt(void) 95static void __slb_flush_and_rebolt(void)
96{ 96{
97 /* If you change this make sure you change SLB_NUM_BOLTED 97 /* If you change this make sure you change SLB_NUM_BOLTED
98 * appropriately too. */ 98 * appropriately too. */
99 unsigned long linear_llp, vmalloc_llp, lflags, vflags; 99 unsigned long linear_llp, vmalloc_llp, lflags, vflags;
100 unsigned long ksp_esid_data, ksp_vsid_data; 100 unsigned long ksp_esid_data, ksp_vsid_data;
101 101
102 WARN_ON(!irqs_disabled());
103
104 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; 102 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
105 vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; 103 vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
106 lflags = SLB_VSID_KERNEL | linear_llp; 104 lflags = SLB_VSID_KERNEL | linear_llp;
@@ -117,12 +115,6 @@ void slb_flush_and_rebolt(void)
117 ksp_vsid_data = get_slb_shadow()->save_area[2].vsid; 115 ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
118 } 116 }
119 117
120 /*
121 * We can't take a PMU exception in the following code, so hard
122 * disable interrupts.
123 */
124 hard_irq_disable();
125
126 /* We need to do this all in asm, so we're sure we don't touch 118 /* We need to do this all in asm, so we're sure we don't touch
127 * the stack between the slbia and rebolting it. */ 119 * the stack between the slbia and rebolting it. */
128 asm volatile("isync\n" 120 asm volatile("isync\n"
@@ -139,6 +131,21 @@ void slb_flush_and_rebolt(void)
139 : "memory"); 131 : "memory");
140} 132}
141 133
134void slb_flush_and_rebolt(void)
135{
136
137 WARN_ON(!irqs_disabled());
138
139 /*
140 * We can't take a PMU exception in the following code, so hard
141 * disable interrupts.
142 */
143 hard_irq_disable();
144
145 __slb_flush_and_rebolt();
146 get_paca()->slb_cache_ptr = 0;
147}
148
142void slb_vmalloc_update(void) 149void slb_vmalloc_update(void)
143{ 150{
144 unsigned long vflags; 151 unsigned long vflags;
@@ -180,12 +187,20 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2)
180/* Flush all user entries from the segment table of the current processor. */ 187/* Flush all user entries from the segment table of the current processor. */
181void switch_slb(struct task_struct *tsk, struct mm_struct *mm) 188void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
182{ 189{
183 unsigned long offset = get_paca()->slb_cache_ptr; 190 unsigned long offset;
184 unsigned long slbie_data = 0; 191 unsigned long slbie_data = 0;
185 unsigned long pc = KSTK_EIP(tsk); 192 unsigned long pc = KSTK_EIP(tsk);
186 unsigned long stack = KSTK_ESP(tsk); 193 unsigned long stack = KSTK_ESP(tsk);
187 unsigned long unmapped_base; 194 unsigned long unmapped_base;
188 195
196 /*
197 * We need interrupts hard-disabled here, not just soft-disabled,
198 * so that a PMU interrupt can't occur, which might try to access
199	 * user memory (to get a stack trace) and possibly cause an SLB miss
200 * which would update the slb_cache/slb_cache_ptr fields in the PACA.
201 */
202 hard_irq_disable();
203 offset = get_paca()->slb_cache_ptr;
189 if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) && 204 if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) &&
190 offset <= SLB_CACHE_ENTRIES) { 205 offset <= SLB_CACHE_ENTRIES) {
191 int i; 206 int i;
@@ -200,7 +215,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
200 } 215 }
201 asm volatile("isync" : : : "memory"); 216 asm volatile("isync" : : : "memory");
202 } else { 217 } else {
203 slb_flush_and_rebolt(); 218 __slb_flush_and_rebolt();
204 } 219 }
205 220
206 /* Workaround POWER5 < DD2.1 issue */ 221 /* Workaround POWER5 < DD2.1 issue */
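
Note the ordering in switch_slb(): slb_cache_ptr is now read only after hard_irq_disable(), since a PMU interrupt taken in between could unwind a user stack, fault on it, and refill the cache behind the reader's back. A sketch of the window being closed, using the real helpers but a hypothetical function name (not code from the patch):

/* Sketch only. */
static void flush_user_slb_cache(void)
{
	unsigned long offset;

	hard_irq_disable();			/* PMU exceptions can no longer arrive */
	offset = get_paca()->slb_cache_ptr;	/* snapshot is stable from here on */

	/* ... slbie the cached user entries up to offset, or fall back to
	 *     a full flush-and-rebolt when the cache overflowed ... */

	get_paca()->slb_cache_ptr = 0;		/* nothing can have refilled it */
}

The stab.c hunk below applies the same fix to the per-cpu stab_cache_ptr on segment-table machines.
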
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
index 98cd1dc2ae75..ab5fb48b3e90 100644
--- a/arch/powerpc/mm/stab.c
+++ b/arch/powerpc/mm/stab.c
@@ -164,7 +164,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
164{ 164{
165 struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr; 165 struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
166 struct stab_entry *ste; 166 struct stab_entry *ste;
167 unsigned long offset = __get_cpu_var(stab_cache_ptr); 167 unsigned long offset;
168 unsigned long pc = KSTK_EIP(tsk); 168 unsigned long pc = KSTK_EIP(tsk);
169 unsigned long stack = KSTK_ESP(tsk); 169 unsigned long stack = KSTK_ESP(tsk);
170 unsigned long unmapped_base; 170 unsigned long unmapped_base;
@@ -172,6 +172,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
172 /* Force previous translations to complete. DRENG */ 172 /* Force previous translations to complete. DRENG */
173 asm volatile("isync" : : : "memory"); 173 asm volatile("isync" : : : "memory");
174 174
175 /*
176 * We need interrupts hard-disabled here, not just soft-disabled,
177 * so that a PMU interrupt can't occur, which might try to access
178	 * user memory (to get a stack trace) and possibly cause an STAB miss
179 * which would update the stab_cache/stab_cache_ptr per-cpu variables.
180 */
181 hard_irq_disable();
182
183 offset = __get_cpu_var(stab_cache_ptr);
175 if (offset <= NR_STAB_CACHE_ENTRIES) { 184 if (offset <= NR_STAB_CACHE_ENTRIES) {
176 int i; 185 int i;
177 186
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index e030e86ff6a3..1c866efd217d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -84,7 +84,7 @@ config S390
84 select HAVE_FUNCTION_TRACER 84 select HAVE_FUNCTION_TRACER
85 select HAVE_FUNCTION_TRACE_MCOUNT_TEST 85 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
86 select HAVE_FTRACE_MCOUNT_RECORD 86 select HAVE_FTRACE_MCOUNT_RECORD
87 select HAVE_FTRACE_SYSCALLS 87 select HAVE_SYSCALL_TRACEPOINTS
88 select HAVE_DYNAMIC_FTRACE 88 select HAVE_DYNAMIC_FTRACE
89 select HAVE_FUNCTION_GRAPH_TRACER 89 select HAVE_FUNCTION_GRAPH_TRACER
90 select HAVE_DEFAULT_NO_SPIN_MUTEXES 90 select HAVE_DEFAULT_NO_SPIN_MUTEXES
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index fcba206529f3..4e91a2573cc4 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -900,7 +900,7 @@ CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
900CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y 900CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
901CONFIG_HAVE_DYNAMIC_FTRACE=y 901CONFIG_HAVE_DYNAMIC_FTRACE=y
902CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 902CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
903CONFIG_HAVE_FTRACE_SYSCALLS=y 903CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
904CONFIG_TRACING_SUPPORT=y 904CONFIG_TRACING_SUPPORT=y
905CONFIG_FTRACE=y 905CONFIG_FTRACE=y
906# CONFIG_FUNCTION_TRACER is not set 906# CONFIG_FUNCTION_TRACER is not set
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index c9af0d19c7ab..41ce6861174e 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -191,4 +191,33 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
191#define _raw_read_relax(lock) cpu_relax() 191#define _raw_read_relax(lock) cpu_relax()
192#define _raw_write_relax(lock) cpu_relax() 192#define _raw_write_relax(lock) cpu_relax()
193 193
194#define __always_inline__spin_lock
195#define __always_inline__read_lock
196#define __always_inline__write_lock
197#define __always_inline__spin_lock_bh
198#define __always_inline__read_lock_bh
199#define __always_inline__write_lock_bh
200#define __always_inline__spin_lock_irq
201#define __always_inline__read_lock_irq
202#define __always_inline__write_lock_irq
203#define __always_inline__spin_lock_irqsave
204#define __always_inline__read_lock_irqsave
205#define __always_inline__write_lock_irqsave
206#define __always_inline__spin_trylock
207#define __always_inline__read_trylock
208#define __always_inline__write_trylock
209#define __always_inline__spin_trylock_bh
210#define __always_inline__spin_unlock
211#define __always_inline__read_unlock
212#define __always_inline__write_unlock
213#define __always_inline__spin_unlock_bh
214#define __always_inline__read_unlock_bh
215#define __always_inline__write_unlock_bh
216#define __always_inline__spin_unlock_irq
217#define __always_inline__read_unlock_irq
218#define __always_inline__write_unlock_irq
219#define __always_inline__spin_unlock_irqrestore
220#define __always_inline__read_unlock_irqrestore
221#define __always_inline__write_unlock_irqrestore
222
194#endif /* __ASM_SPINLOCK_H */ 223#endif /* __ASM_SPINLOCK_H */
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index ba1cab9fc1f9..07eb61b2fb3a 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -92,7 +92,7 @@ static inline struct thread_info *current_thread_info(void)
92#define TIF_SYSCALL_TRACE 8 /* syscall trace active */ 92#define TIF_SYSCALL_TRACE 8 /* syscall trace active */
93#define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */ 93#define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */
94#define TIF_SECCOMP 10 /* secure computing */ 94#define TIF_SECCOMP 10 /* secure computing */
95#define TIF_SYSCALL_FTRACE 11 /* ftrace syscall instrumentation */ 95#define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint instrumentation */
96#define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ 96#define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */
97#define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling 97#define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling
98 TIF_NEED_RESCHED */ 98 TIF_NEED_RESCHED */
@@ -111,7 +111,7 @@ static inline struct thread_info *current_thread_info(void)
111#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) 111#define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
112#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) 112#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
113#define _TIF_SECCOMP (1<<TIF_SECCOMP) 113#define _TIF_SECCOMP (1<<TIF_SECCOMP)
114#define _TIF_SYSCALL_FTRACE (1<<TIF_SYSCALL_FTRACE) 114#define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
115#define _TIF_USEDFPU (1<<TIF_USEDFPU) 115#define _TIF_USEDFPU (1<<TIF_USEDFPU)
116#define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) 116#define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
117#define _TIF_31BIT (1<<TIF_31BIT) 117#define _TIF_31BIT (1<<TIF_31BIT)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index f78580a74039..f43d2ee54464 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -54,7 +54,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
54_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ 54_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
55 _TIF_MCCK_PENDING) 55 _TIF_MCCK_PENDING)
56_TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \ 56_TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \
57 _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8) 57 _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8)
58 58
59STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER 59STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER
60STACK_SIZE = 1 << STACK_SHIFT 60STACK_SIZE = 1 << STACK_SHIFT
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index 009ca6175db9..a6f7b20df616 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -57,7 +57,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
57_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ 57_TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
58 _TIF_MCCK_PENDING) 58 _TIF_MCCK_PENDING)
59_TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \ 59_TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \
60 _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8) 60 _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8)
61 61
62#define BASED(name) name-system_call(%r13) 62#define BASED(name) name-system_call(%r13)
63 63
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 3e298e64f0db..57bdcb1e3cdf 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -220,6 +220,29 @@ struct syscall_metadata *syscall_nr_to_meta(int nr)
220 return syscalls_metadata[nr]; 220 return syscalls_metadata[nr];
221} 221}
222 222
223int syscall_name_to_nr(char *name)
224{
225 int i;
226
227 if (!syscalls_metadata)
228 return -1;
229 for (i = 0; i < NR_syscalls; i++)
230 if (syscalls_metadata[i])
231 if (!strcmp(syscalls_metadata[i]->name, name))
232 return i;
233 return -1;
234}
235
236void set_syscall_enter_id(int num, int id)
237{
238 syscalls_metadata[num]->enter_id = id;
239}
240
241void set_syscall_exit_id(int num, int id)
242{
243 syscalls_metadata[num]->exit_id = id;
244}
245
223static struct syscall_metadata *find_syscall_meta(unsigned long syscall) 246static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
224{ 247{
225 struct syscall_metadata *start; 248 struct syscall_metadata *start;
@@ -237,24 +260,19 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
237 return NULL; 260 return NULL;
238} 261}
239 262
240void arch_init_ftrace_syscalls(void) 263static int __init arch_init_ftrace_syscalls(void)
241{ 264{
242 struct syscall_metadata *meta; 265 struct syscall_metadata *meta;
243 int i; 266 int i;
244 static atomic_t refs;
245
246 if (atomic_inc_return(&refs) != 1)
247 goto out;
248 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * NR_syscalls, 267 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * NR_syscalls,
249 GFP_KERNEL); 268 GFP_KERNEL);
250 if (!syscalls_metadata) 269 if (!syscalls_metadata)
251 goto out; 270 return -ENOMEM;
252 for (i = 0; i < NR_syscalls; i++) { 271 for (i = 0; i < NR_syscalls; i++) {
253 meta = find_syscall_meta((unsigned long)sys_call_table[i]); 272 meta = find_syscall_meta((unsigned long)sys_call_table[i]);
254 syscalls_metadata[i] = meta; 273 syscalls_metadata[i] = meta;
255 } 274 }
256 return; 275 return 0;
257out:
258 atomic_dec(&refs);
259} 276}
277arch_initcall(arch_init_ftrace_syscalls);
260#endif 278#endif
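
Converting arch_init_ftrace_syscalls() to an arch_initcall() replaces the old lazy, refcount-guarded setup with a single pass at boot, which is why the static atomic_t disappears and a failed allocation can simply return -ENOMEM. The same pattern in isolation, with hypothetical names (my_entry, my_table and my_feature_init are not real symbols):

#include <linux/init.h>
#include <linux/slab.h>

struct my_entry {
	const char *name;
};

static struct my_entry **my_table;

static int __init my_feature_init(void)
{
	my_table = kcalloc(64, sizeof(*my_table), GFP_KERNEL);
	if (!my_table)
		return -ENOMEM;		/* the feature stays off; boot continues */
	return 0;
}
arch_initcall(my_feature_init);		/* runs once at boot, before device initcalls */
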
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 43acd73105b7..f3ddd7ac06c5 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -51,6 +51,9 @@
51#include "compat_ptrace.h" 51#include "compat_ptrace.h"
52#endif 52#endif
53 53
54#define CREATE_TRACE_POINTS
55#include <trace/events/syscalls.h>
56
54enum s390_regset { 57enum s390_regset {
55 REGSET_GENERAL, 58 REGSET_GENERAL,
56 REGSET_FP, 59 REGSET_FP,
@@ -661,8 +664,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
661 ret = -1; 664 ret = -1;
662 } 665 }
663 666
664 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 667 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
665 ftrace_syscall_enter(regs); 668 trace_sys_enter(regs, regs->gprs[2]);
666 669
667 if (unlikely(current->audit_context)) 670 if (unlikely(current->audit_context))
668 audit_syscall_entry(is_compat_task() ? 671 audit_syscall_entry(is_compat_task() ?
@@ -679,8 +682,8 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
679 audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), 682 audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]),
680 regs->gprs[2]); 683 regs->gprs[2]);
681 684
682 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 685 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
683 ftrace_syscall_exit(regs); 686 trace_sys_exit(regs, regs->gprs[2]);
684 687
685 if (test_thread_flag(TIF_SYSCALL_TRACE)) 688 if (test_thread_flag(TIF_SYSCALL_TRACE))
686 tracehook_report_syscall_exit(regs, 0); 689 tracehook_report_syscall_exit(regs, 0);
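
Here ftrace_syscall_enter()/exit() give way to the generic sys_enter/sys_exit tracepoints. A tracepoint declared with TRACE_EVENT() in an events header only gets a body in the one translation unit that defines CREATE_TRACE_POINTS before including that header, which is why the define lands in ptrace.c; every other caller just includes the header. The general shape, shown with a hypothetical event and with the header boilerplate (TRACE_SYSTEM, multi-include guards) omitted:

/* In foo_trace.h (hypothetical), abridged:
 *
 *	TRACE_EVENT(foo_work,
 *		TP_PROTO(int unit),
 *		TP_ARGS(unit),
 *		TP_STRUCT__entry(__field(int, unit)),
 *		TP_fast_assign(__entry->unit = unit;),
 *		TP_printk("unit=%d", __entry->unit));
 */

/* In exactly one foo.c: */
#define CREATE_TRACE_POINTS
#include "foo_trace.h"

void foo_do_work(int unit)
{
	trace_foo_work(unit);	/* effectively free unless the event is enabled at runtime */
}
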
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 3f8b6a92eabd..233cff53a623 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -25,6 +25,8 @@ config SPARC
25 select ARCH_WANT_OPTIONAL_GPIOLIB 25 select ARCH_WANT_OPTIONAL_GPIOLIB
26 select RTC_CLASS 26 select RTC_CLASS
27 select RTC_DRV_M48T59 27 select RTC_DRV_M48T59
28 select HAVE_DMA_ATTRS
29 select HAVE_DMA_API_DEBUG
28 30
29config SPARC32 31config SPARC32
30 def_bool !64BIT 32 def_bool !64BIT
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 204e4bf64438..5a8c308e2b5c 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -3,6 +3,7 @@
3 3
4#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/dma-debug.h>
6 7
7#define DMA_ERROR_CODE (~(dma_addr_t)0x0) 8#define DMA_ERROR_CODE (~(dma_addr_t)0x0)
8 9
@@ -13,142 +14,40 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask);
13#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) 14#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
14#define dma_is_consistent(d, h) (1) 15#define dma_is_consistent(d, h) (1)
15 16
16struct dma_ops { 17extern struct dma_map_ops *dma_ops, pci32_dma_ops;
17 void *(*alloc_coherent)(struct device *dev, size_t size, 18extern struct bus_type pci_bus_type;
18 dma_addr_t *dma_handle, gfp_t flag);
19 void (*free_coherent)(struct device *dev, size_t size,
20 void *cpu_addr, dma_addr_t dma_handle);
21 dma_addr_t (*map_page)(struct device *dev, struct page *page,
22 unsigned long offset, size_t size,
23 enum dma_data_direction direction);
24 void (*unmap_page)(struct device *dev, dma_addr_t dma_addr,
25 size_t size,
26 enum dma_data_direction direction);
27 int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents,
28 enum dma_data_direction direction);
29 void (*unmap_sg)(struct device *dev, struct scatterlist *sg,
30 int nhwentries,
31 enum dma_data_direction direction);
32 void (*sync_single_for_cpu)(struct device *dev,
33 dma_addr_t dma_handle, size_t size,
34 enum dma_data_direction direction);
35 void (*sync_single_for_device)(struct device *dev,
36 dma_addr_t dma_handle, size_t size,
37 enum dma_data_direction direction);
38 void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg,
39 int nelems,
40 enum dma_data_direction direction);
41 void (*sync_sg_for_device)(struct device *dev,
42 struct scatterlist *sg, int nents,
43 enum dma_data_direction dir);
44};
45extern const struct dma_ops *dma_ops;
46 19
47static inline void *dma_alloc_coherent(struct device *dev, size_t size, 20static inline struct dma_map_ops *get_dma_ops(struct device *dev)
48 dma_addr_t *dma_handle, gfp_t flag)
49{
50 return dma_ops->alloc_coherent(dev, size, dma_handle, flag);
51}
52
53static inline void dma_free_coherent(struct device *dev, size_t size,
54 void *cpu_addr, dma_addr_t dma_handle)
55{
56 dma_ops->free_coherent(dev, size, cpu_addr, dma_handle);
57}
58
59static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
60 size_t size,
61 enum dma_data_direction direction)
62{
63 return dma_ops->map_page(dev, virt_to_page(cpu_addr),
64 (unsigned long)cpu_addr & ~PAGE_MASK, size,
65 direction);
66}
67
68static inline void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
69 size_t size,
70 enum dma_data_direction direction)
71{
72 dma_ops->unmap_page(dev, dma_addr, size, direction);
73}
74
75static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
76 unsigned long offset, size_t size,
77 enum dma_data_direction direction)
78{
79 return dma_ops->map_page(dev, page, offset, size, direction);
80}
81
82static inline void dma_unmap_page(struct device *dev, dma_addr_t dma_address,
83 size_t size,
84 enum dma_data_direction direction)
85{
86 dma_ops->unmap_page(dev, dma_address, size, direction);
87}
88
89static inline int dma_map_sg(struct device *dev, struct scatterlist *sg,
90 int nents, enum dma_data_direction direction)
91{
92 return dma_ops->map_sg(dev, sg, nents, direction);
93}
94
95static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
96 int nents, enum dma_data_direction direction)
97{ 21{
98 dma_ops->unmap_sg(dev, sg, nents, direction); 22#if defined(CONFIG_SPARC32) && defined(CONFIG_PCI)
99} 23 if (dev->bus == &pci_bus_type)
100 24 return &pci32_dma_ops;
101static inline void dma_sync_single_for_cpu(struct device *dev, 25#endif
102 dma_addr_t dma_handle, size_t size, 26 return dma_ops;
103 enum dma_data_direction direction)
104{
105 dma_ops->sync_single_for_cpu(dev, dma_handle, size, direction);
106} 27}
107 28
108static inline void dma_sync_single_for_device(struct device *dev, 29#include <asm-generic/dma-mapping-common.h>
109 dma_addr_t dma_handle,
110 size_t size,
111 enum dma_data_direction direction)
112{
113 if (dma_ops->sync_single_for_device)
114 dma_ops->sync_single_for_device(dev, dma_handle, size,
115 direction);
116}
117 30
118static inline void dma_sync_sg_for_cpu(struct device *dev, 31static inline void *dma_alloc_coherent(struct device *dev, size_t size,
119 struct scatterlist *sg, int nelems, 32 dma_addr_t *dma_handle, gfp_t flag)
120 enum dma_data_direction direction)
121{ 33{
122 dma_ops->sync_sg_for_cpu(dev, sg, nelems, direction); 34 struct dma_map_ops *ops = get_dma_ops(dev);
123} 35 void *cpu_addr;
124 36
125static inline void dma_sync_sg_for_device(struct device *dev, 37 cpu_addr = ops->alloc_coherent(dev, size, dma_handle, flag);
126 struct scatterlist *sg, int nelems, 38 debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
127 enum dma_data_direction direction) 39 return cpu_addr;
128{
129 if (dma_ops->sync_sg_for_device)
130 dma_ops->sync_sg_for_device(dev, sg, nelems, direction);
131} 40}
132 41
133static inline void dma_sync_single_range_for_cpu(struct device *dev, 42static inline void dma_free_coherent(struct device *dev, size_t size,
134 dma_addr_t dma_handle, 43 void *cpu_addr, dma_addr_t dma_handle)
135 unsigned long offset,
136 size_t size,
137 enum dma_data_direction dir)
138{ 44{
139 dma_sync_single_for_cpu(dev, dma_handle+offset, size, dir); 45 struct dma_map_ops *ops = get_dma_ops(dev);
140}
141 46
142static inline void dma_sync_single_range_for_device(struct device *dev, 47 debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
143 dma_addr_t dma_handle, 48 ops->free_coherent(dev, size, cpu_addr, dma_handle);
144 unsigned long offset,
145 size_t size,
146 enum dma_data_direction dir)
147{
148 dma_sync_single_for_device(dev, dma_handle+offset, size, dir);
149} 49}
150 50
151
152static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) 51static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
153{ 52{
154 return (dma_addr == DMA_ERROR_CODE); 53 return (dma_addr == DMA_ERROR_CODE);
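
Everything sparc used to open-code here (dma_map_single(), dma_map_page(), the sync helpers) now comes from asm-generic/dma-mapping-common.h, which dispatches through get_dma_ops() and feeds the DMA-debug checker enabled by HAVE_DMA_API_DEBUG. Roughly how the common wrapper looks (abridged sketch of the generic header, not sparc code):

static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
					      size_t size,
					      enum dma_data_direction dir,
					      struct dma_attrs *attrs)
{
	struct dma_map_ops *ops = get_dma_ops(dev);	/* pci32_dma_ops or dma_ops here */
	dma_addr_t addr;

	addr = ops->map_page(dev, virt_to_page(ptr),
			     (unsigned long)ptr & ~PAGE_MASK,
			     size, dir, attrs);
	debug_dma_map_page(dev, virt_to_page(ptr),
			   (unsigned long)ptr & ~PAGE_MASK,
			   size, dir, addr, true);
	return addr;
}
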
diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h
index 1934f2cbf513..a0b443cb3c1f 100644
--- a/arch/sparc/include/asm/irq_64.h
+++ b/arch/sparc/include/asm/irq_64.h
@@ -89,8 +89,8 @@ static inline unsigned long get_softint(void)
89 return retval; 89 return retval;
90} 90}
91 91
92void __trigger_all_cpu_backtrace(void); 92void arch_trigger_all_cpu_backtrace(void);
93#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() 93#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
94 94
95extern void *hardirq_stack[NR_CPUS]; 95extern void *hardirq_stack[NR_CPUS];
96extern void *softirq_stack[NR_CPUS]; 96extern void *softirq_stack[NR_CPUS];
diff --git a/arch/sparc/include/asm/pci.h b/arch/sparc/include/asm/pci.h
index 6e14fd179335..d9c031f9910f 100644
--- a/arch/sparc/include/asm/pci.h
+++ b/arch/sparc/include/asm/pci.h
@@ -5,4 +5,7 @@
5#else 5#else
6#include <asm/pci_32.h> 6#include <asm/pci_32.h>
7#endif 7#endif
8
9#include <asm-generic/pci-dma-compat.h>
10
8#endif 11#endif
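
The new asm-generic/pci-dma-compat.h include is what lets the hand-written pci_* DMA wrappers in pci_32.h and pci_64.h below be deleted: the compat header defines each of them as a thin inline over the dma_* API applied to &pdev->dev. Representative shape of that header (paraphrased, not sparc code):

static inline dma_addr_t
pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size, int direction)
{
	return dma_map_single(hwdev == NULL ? NULL : &hwdev->dev, ptr, size,
			      (enum dma_data_direction)direction);
}

static inline int
pci_dma_mapping_error(struct pci_dev *pdev, dma_addr_t dma_addr)
{
	return dma_mapping_error(&pdev->dev, dma_addr);
}
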
diff --git a/arch/sparc/include/asm/pci_32.h b/arch/sparc/include/asm/pci_32.h
index b41c4c198159..ac0e8369fd97 100644
--- a/arch/sparc/include/asm/pci_32.h
+++ b/arch/sparc/include/asm/pci_32.h
@@ -31,42 +31,8 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
31 */ 31 */
32#define PCI_DMA_BUS_IS_PHYS (0) 32#define PCI_DMA_BUS_IS_PHYS (0)
33 33
34#include <asm/scatterlist.h>
35
36struct pci_dev; 34struct pci_dev;
37 35
38/* Allocate and map kernel buffer using consistent mode DMA for a device.
39 * hwdev should be valid struct pci_dev pointer for PCI devices.
40 */
41extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle);
42
43/* Free and unmap a consistent DMA buffer.
44 * cpu_addr is what was returned from pci_alloc_consistent,
45 * size must be the same as what as passed into pci_alloc_consistent,
46 * and likewise dma_addr must be the same as what *dma_addrp was set to.
47 *
48 * References to the memory and mappings assosciated with cpu_addr/dma_addr
49 * past this call are illegal.
50 */
51extern void pci_free_consistent(struct pci_dev *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle);
52
53/* Map a single buffer of the indicated size for DMA in streaming mode.
54 * The 32-bit bus address to use is returned.
55 *
56 * Once the device is given the dma address, the device owns this memory
57 * until either pci_unmap_single or pci_dma_sync_single_for_cpu is performed.
58 */
59extern dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size, int direction);
60
61/* Unmap a single streaming mode DMA translation. The dma_addr and size
62 * must match what was provided for in a previous pci_map_single call. All
63 * other usages are undefined.
64 *
65 * After this call, reads by the cpu to the buffer are guaranteed to see
66 * whatever the device wrote there.
67 */
68extern void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t size, int direction);
69
70/* pci_unmap_{single,page} is not a nop, thus... */ 36/* pci_unmap_{single,page} is not a nop, thus... */
71#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ 37#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
72 dma_addr_t ADDR_NAME; 38 dma_addr_t ADDR_NAME;
@@ -81,69 +47,6 @@ extern void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t
81#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ 47#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
82 (((PTR)->LEN_NAME) = (VAL)) 48 (((PTR)->LEN_NAME) = (VAL))
83 49
84/*
85 * Same as above, only with pages instead of mapped addresses.
86 */
87extern dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
88 unsigned long offset, size_t size, int direction);
89extern void pci_unmap_page(struct pci_dev *hwdev,
90 dma_addr_t dma_address, size_t size, int direction);
91
92/* Map a set of buffers described by scatterlist in streaming
93 * mode for DMA. This is the scather-gather version of the
94 * above pci_map_single interface. Here the scatter gather list
95 * elements are each tagged with the appropriate dma address
96 * and length. They are obtained via sg_dma_{address,length}(SG).
97 *
98 * NOTE: An implementation may be able to use a smaller number of
99 * DMA address/length pairs than there are SG table elements.
100 * (for example via virtual mapping capabilities)
101 * The routine returns the number of addr/length pairs actually
102 * used, at most nents.
103 *
104 * Device ownership issues as mentioned above for pci_map_single are
105 * the same here.
106 */
107extern int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction);
108
109/* Unmap a set of streaming mode DMA translations.
110 * Again, cpu read rules concerning calls here are the same as for
111 * pci_unmap_single() above.
112 */
113extern void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nhwents, int direction);
114
115/* Make physical memory consistent for a single
116 * streaming mode DMA translation after a transfer.
117 *
118 * If you perform a pci_map_single() but wish to interrogate the
119 * buffer using the cpu, yet do not wish to teardown the PCI dma
120 * mapping, you must call this function before doing so. At the
121 * next point you give the PCI dma address back to the card, you
122 * must first perform a pci_dma_sync_for_device, and then the device
123 * again owns the buffer.
124 */
125extern void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction);
126extern void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction);
127
128/* Make physical memory consistent for a set of streaming
129 * mode DMA translations after a transfer.
130 *
131 * The same as pci_dma_sync_single_* but for a scatter-gather list,
132 * same rules and usage.
133 */
134extern void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction);
135extern void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction);
136
137/* Return whether the given PCI device DMA address mask can
138 * be supported properly. For example, if your device can
139 * only drive the low 24-bits during PCI bus mastering, then
140 * you would pass 0x00ffffff as the mask to this function.
141 */
142static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
143{
144 return 1;
145}
146
147#ifdef CONFIG_PCI 50#ifdef CONFIG_PCI
148static inline void pci_dma_burst_advice(struct pci_dev *pdev, 51static inline void pci_dma_burst_advice(struct pci_dev *pdev,
149 enum pci_dma_burst_strategy *strat, 52 enum pci_dma_burst_strategy *strat,
@@ -154,14 +57,6 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
154} 57}
155#endif 58#endif
156 59
157#define PCI_DMA_ERROR_CODE (~(dma_addr_t)0x0)
158
159static inline int pci_dma_mapping_error(struct pci_dev *pdev,
160 dma_addr_t dma_addr)
161{
162 return (dma_addr == PCI_DMA_ERROR_CODE);
163}
164
165struct device_node; 60struct device_node;
166extern struct device_node *pci_device_to_OF_node(struct pci_dev *pdev); 61extern struct device_node *pci_device_to_OF_node(struct pci_dev *pdev);
167 62
diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h
index 7a1e3566e59c..5cc9f6aa5494 100644
--- a/arch/sparc/include/asm/pci_64.h
+++ b/arch/sparc/include/asm/pci_64.h
@@ -35,37 +35,6 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
35 */ 35 */
36#define PCI_DMA_BUS_IS_PHYS (0) 36#define PCI_DMA_BUS_IS_PHYS (0)
37 37
38static inline void *pci_alloc_consistent(struct pci_dev *pdev, size_t size,
39 dma_addr_t *dma_handle)
40{
41 return dma_alloc_coherent(&pdev->dev, size, dma_handle, GFP_ATOMIC);
42}
43
44static inline void pci_free_consistent(struct pci_dev *pdev, size_t size,
45 void *vaddr, dma_addr_t dma_handle)
46{
47 return dma_free_coherent(&pdev->dev, size, vaddr, dma_handle);
48}
49
50static inline dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr,
51 size_t size, int direction)
52{
53 return dma_map_single(&pdev->dev, ptr, size,
54 (enum dma_data_direction) direction);
55}
56
57static inline void pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr,
58 size_t size, int direction)
59{
60 dma_unmap_single(&pdev->dev, dma_addr, size,
61 (enum dma_data_direction) direction);
62}
63
64#define pci_map_page(dev, page, off, size, dir) \
65 pci_map_single(dev, (page_address(page) + (off)), size, dir)
66#define pci_unmap_page(dev,addr,sz,dir) \
67 pci_unmap_single(dev,addr,sz,dir)
68
69/* pci_unmap_{single,page} is not a nop, thus... */ 38/* pci_unmap_{single,page} is not a nop, thus... */
70#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ 39#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
71 dma_addr_t ADDR_NAME; 40 dma_addr_t ADDR_NAME;
@@ -80,57 +49,6 @@ static inline void pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr,
80#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ 49#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
81 (((PTR)->LEN_NAME) = (VAL)) 50 (((PTR)->LEN_NAME) = (VAL))
82 51
83static inline int pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg,
84 int nents, int direction)
85{
86 return dma_map_sg(&pdev->dev, sg, nents,
87 (enum dma_data_direction) direction);
88}
89
90static inline void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sg,
91 int nents, int direction)
92{
93 dma_unmap_sg(&pdev->dev, sg, nents,
94 (enum dma_data_direction) direction);
95}
96
97static inline void pci_dma_sync_single_for_cpu(struct pci_dev *pdev,
98 dma_addr_t dma_handle,
99 size_t size, int direction)
100{
101 dma_sync_single_for_cpu(&pdev->dev, dma_handle, size,
102 (enum dma_data_direction) direction);
103}
104
105static inline void pci_dma_sync_single_for_device(struct pci_dev *pdev,
106 dma_addr_t dma_handle,
107 size_t size, int direction)
108{
109 /* No flushing needed to sync cpu writes to the device. */
110}
111
112static inline void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev,
113 struct scatterlist *sg,
114 int nents, int direction)
115{
116 dma_sync_sg_for_cpu(&pdev->dev, sg, nents,
117 (enum dma_data_direction) direction);
118}
119
120static inline void pci_dma_sync_sg_for_device(struct pci_dev *pdev,
121 struct scatterlist *sg,
122 int nelems, int direction)
123{
124 /* No flushing needed to sync cpu writes to the device. */
125}
126
127/* Return whether the given PCI device DMA address mask can
128 * be supported properly. For example, if your device can
129 * only drive the low 24-bits during PCI bus mastering, then
130 * you would pass 0x00ffffff as the mask to this function.
131 */
132extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask);
133
134/* PCI IOMMU mapping bypass support. */ 52/* PCI IOMMU mapping bypass support. */
135 53
136/* PCI 64-bit addressing works for all slots on all controller 54/* PCI 64-bit addressing works for all slots on all controller
@@ -140,12 +58,6 @@ extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask);
140#define PCI64_REQUIRED_MASK (~(dma64_addr_t)0) 58#define PCI64_REQUIRED_MASK (~(dma64_addr_t)0)
141#define PCI64_ADDR_BASE 0xfffc000000000000UL 59#define PCI64_ADDR_BASE 0xfffc000000000000UL
142 60
143static inline int pci_dma_mapping_error(struct pci_dev *pdev,
144 dma_addr_t dma_addr)
145{
146 return dma_mapping_error(&pdev->dev, dma_addr);
147}
148
149#ifdef CONFIG_PCI 61#ifdef CONFIG_PCI
150static inline void pci_dma_burst_advice(struct pci_dev *pdev, 62static inline void pci_dma_burst_advice(struct pci_dev *pdev,
151 enum pci_dma_burst_strategy *strat, 63 enum pci_dma_burst_strategy *strat,
diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h
index 46f91ab66a50..857630cff636 100644
--- a/arch/sparc/include/asm/spinlock_32.h
+++ b/arch/sparc/include/asm/spinlock_32.h
@@ -76,7 +76,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
76 * 76 *
77 * Unfortunately this scheme limits us to ~16,000,000 cpus. 77 * Unfortunately this scheme limits us to ~16,000,000 cpus.
78 */ 78 */
79static inline void __read_lock(raw_rwlock_t *rw) 79static inline void arch_read_lock(raw_rwlock_t *rw)
80{ 80{
81 register raw_rwlock_t *lp asm("g1"); 81 register raw_rwlock_t *lp asm("g1");
82 lp = rw; 82 lp = rw;
@@ -92,11 +92,11 @@ static inline void __read_lock(raw_rwlock_t *rw)
92#define __raw_read_lock(lock) \ 92#define __raw_read_lock(lock) \
93do { unsigned long flags; \ 93do { unsigned long flags; \
94 local_irq_save(flags); \ 94 local_irq_save(flags); \
95 __read_lock(lock); \ 95 arch_read_lock(lock); \
96 local_irq_restore(flags); \ 96 local_irq_restore(flags); \
97} while(0) 97} while(0)
98 98
99static inline void __read_unlock(raw_rwlock_t *rw) 99static inline void arch_read_unlock(raw_rwlock_t *rw)
100{ 100{
101 register raw_rwlock_t *lp asm("g1"); 101 register raw_rwlock_t *lp asm("g1");
102 lp = rw; 102 lp = rw;
@@ -112,7 +112,7 @@ static inline void __read_unlock(raw_rwlock_t *rw)
112#define __raw_read_unlock(lock) \ 112#define __raw_read_unlock(lock) \
113do { unsigned long flags; \ 113do { unsigned long flags; \
114 local_irq_save(flags); \ 114 local_irq_save(flags); \
115 __read_unlock(lock); \ 115 arch_read_unlock(lock); \
116 local_irq_restore(flags); \ 116 local_irq_restore(flags); \
117} while(0) 117} while(0)
118 118
@@ -150,7 +150,7 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
150 return (val == 0); 150 return (val == 0);
151} 151}
152 152
153static inline int __read_trylock(raw_rwlock_t *rw) 153static inline int arch_read_trylock(raw_rwlock_t *rw)
154{ 154{
155 register raw_rwlock_t *lp asm("g1"); 155 register raw_rwlock_t *lp asm("g1");
156 register int res asm("o0"); 156 register int res asm("o0");
@@ -169,7 +169,7 @@ static inline int __read_trylock(raw_rwlock_t *rw)
169({ unsigned long flags; \ 169({ unsigned long flags; \
170 int res; \ 170 int res; \
171 local_irq_save(flags); \ 171 local_irq_save(flags); \
172 res = __read_trylock(lock); \ 172 res = arch_read_trylock(lock); \
173 local_irq_restore(flags); \ 173 local_irq_restore(flags); \
174 res; \ 174 res; \
175}) 175})
diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h
index f6b2b92ad8d2..43e514783582 100644
--- a/arch/sparc/include/asm/spinlock_64.h
+++ b/arch/sparc/include/asm/spinlock_64.h
@@ -92,7 +92,7 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long fla
92 92
93/* Multi-reader locks, these are much saner than the 32-bit Sparc ones... */ 93/* Multi-reader locks, these are much saner than the 32-bit Sparc ones... */
94 94
95static void inline __read_lock(raw_rwlock_t *lock) 95static void inline arch_read_lock(raw_rwlock_t *lock)
96{ 96{
97 unsigned long tmp1, tmp2; 97 unsigned long tmp1, tmp2;
98 98
@@ -115,7 +115,7 @@ static void inline __read_lock(raw_rwlock_t *lock)
115 : "memory"); 115 : "memory");
116} 116}
117 117
118static int inline __read_trylock(raw_rwlock_t *lock) 118static int inline arch_read_trylock(raw_rwlock_t *lock)
119{ 119{
120 int tmp1, tmp2; 120 int tmp1, tmp2;
121 121
@@ -136,7 +136,7 @@ static int inline __read_trylock(raw_rwlock_t *lock)
136 return tmp1; 136 return tmp1;
137} 137}
138 138
139static void inline __read_unlock(raw_rwlock_t *lock) 139static void inline arch_read_unlock(raw_rwlock_t *lock)
140{ 140{
141 unsigned long tmp1, tmp2; 141 unsigned long tmp1, tmp2;
142 142
@@ -152,7 +152,7 @@ static void inline __read_unlock(raw_rwlock_t *lock)
152 : "memory"); 152 : "memory");
153} 153}
154 154
155static void inline __write_lock(raw_rwlock_t *lock) 155static void inline arch_write_lock(raw_rwlock_t *lock)
156{ 156{
157 unsigned long mask, tmp1, tmp2; 157 unsigned long mask, tmp1, tmp2;
158 158
@@ -177,7 +177,7 @@ static void inline __write_lock(raw_rwlock_t *lock)
177 : "memory"); 177 : "memory");
178} 178}
179 179
180static void inline __write_unlock(raw_rwlock_t *lock) 180static void inline arch_write_unlock(raw_rwlock_t *lock)
181{ 181{
182 __asm__ __volatile__( 182 __asm__ __volatile__(
183" stw %%g0, [%0]" 183" stw %%g0, [%0]"
@@ -186,7 +186,7 @@ static void inline __write_unlock(raw_rwlock_t *lock)
186 : "memory"); 186 : "memory");
187} 187}
188 188
189static int inline __write_trylock(raw_rwlock_t *lock) 189static int inline arch_write_trylock(raw_rwlock_t *lock)
190{ 190{
191 unsigned long mask, tmp1, tmp2, result; 191 unsigned long mask, tmp1, tmp2, result;
192 192
@@ -210,14 +210,14 @@ static int inline __write_trylock(raw_rwlock_t *lock)
210 return result; 210 return result;
211} 211}
212 212
213#define __raw_read_lock(p) __read_lock(p) 213#define __raw_read_lock(p) arch_read_lock(p)
214#define __raw_read_lock_flags(p, f) __read_lock(p) 214#define __raw_read_lock_flags(p, f) arch_read_lock(p)
215#define __raw_read_trylock(p) __read_trylock(p) 215#define __raw_read_trylock(p) arch_read_trylock(p)
216#define __raw_read_unlock(p) __read_unlock(p) 216#define __raw_read_unlock(p) arch_read_unlock(p)
217#define __raw_write_lock(p) __write_lock(p) 217#define __raw_write_lock(p) arch_write_lock(p)
218#define __raw_write_lock_flags(p, f) __write_lock(p) 218#define __raw_write_lock_flags(p, f) arch_write_lock(p)
219#define __raw_write_unlock(p) __write_unlock(p) 219#define __raw_write_unlock(p) arch_write_unlock(p)
220#define __raw_write_trylock(p) __write_trylock(p) 220#define __raw_write_trylock(p) arch_write_trylock(p)
221 221
222#define __raw_read_can_lock(rw) (!((rw)->lock & 0x80000000UL)) 222#define __raw_read_can_lock(rw) (!((rw)->lock & 0x80000000UL))
223#define __raw_write_can_lock(rw) (!(rw)->lock) 223#define __raw_write_can_lock(rw) (!(rw)->lock)
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 475ce4696acd..29b88a580661 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -61,7 +61,7 @@ obj-$(CONFIG_SPARC64_SMP) += cpumap.o
61obj-$(CONFIG_SPARC32) += devres.o 61obj-$(CONFIG_SPARC32) += devres.o
62devres-y := ../../../kernel/irq/devres.o 62devres-y := ../../../kernel/irq/devres.o
63 63
64obj-$(CONFIG_SPARC32) += dma.o 64obj-y += dma.o
65 65
66obj-$(CONFIG_SPARC32_PCI) += pcic.o 66obj-$(CONFIG_SPARC32_PCI) += pcic.o
67 67
diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c
index 524c32f97c55..e1ba8ee21b9a 100644
--- a/arch/sparc/kernel/dma.c
+++ b/arch/sparc/kernel/dma.c
@@ -1,178 +1,13 @@
1/* dma.c: PCI and SBUS DMA accessors for 32-bit sparc.
2 *
3 * Copyright (C) 2008 David S. Miller <davem@davemloft.net>
4 */
5
6#include <linux/kernel.h> 1#include <linux/kernel.h>
7#include <linux/module.h> 2#include <linux/module.h>
8#include <linux/dma-mapping.h> 3#include <linux/dma-mapping.h>
9#include <linux/scatterlist.h> 4#include <linux/dma-debug.h>
10#include <linux/mm.h>
11
12#ifdef CONFIG_PCI
13#include <linux/pci.h>
14#endif
15 5
16#include "dma.h" 6#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 15)
17 7
18int dma_supported(struct device *dev, u64 mask) 8static int __init dma_init(void)
19{ 9{
20#ifdef CONFIG_PCI 10 dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
21 if (dev->bus == &pci_bus_type)
22 return pci_dma_supported(to_pci_dev(dev), mask);
23#endif
24 return 0; 11 return 0;
25} 12}
26EXPORT_SYMBOL(dma_supported); 13fs_initcall(dma_init);
27
28int dma_set_mask(struct device *dev, u64 dma_mask)
29{
30#ifdef CONFIG_PCI
31 if (dev->bus == &pci_bus_type)
32 return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
33#endif
34 return -EOPNOTSUPP;
35}
36EXPORT_SYMBOL(dma_set_mask);
37
38static void *dma32_alloc_coherent(struct device *dev, size_t size,
39 dma_addr_t *dma_handle, gfp_t flag)
40{
41#ifdef CONFIG_PCI
42 if (dev->bus == &pci_bus_type)
43 return pci_alloc_consistent(to_pci_dev(dev), size, dma_handle);
44#endif
45 return sbus_alloc_consistent(dev, size, dma_handle);
46}
47
48static void dma32_free_coherent(struct device *dev, size_t size,
49 void *cpu_addr, dma_addr_t dma_handle)
50{
51#ifdef CONFIG_PCI
52 if (dev->bus == &pci_bus_type) {
53 pci_free_consistent(to_pci_dev(dev), size,
54 cpu_addr, dma_handle);
55 return;
56 }
57#endif
58 sbus_free_consistent(dev, size, cpu_addr, dma_handle);
59}
60
61static dma_addr_t dma32_map_page(struct device *dev, struct page *page,
62 unsigned long offset, size_t size,
63 enum dma_data_direction direction)
64{
65#ifdef CONFIG_PCI
66 if (dev->bus == &pci_bus_type)
67 return pci_map_page(to_pci_dev(dev), page, offset,
68 size, (int)direction);
69#endif
70 return sbus_map_single(dev, page_address(page) + offset,
71 size, (int)direction);
72}
73
74static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address,
75 size_t size, enum dma_data_direction direction)
76{
77#ifdef CONFIG_PCI
78 if (dev->bus == &pci_bus_type) {
79 pci_unmap_page(to_pci_dev(dev), dma_address,
80 size, (int)direction);
81 return;
82 }
83#endif
84 sbus_unmap_single(dev, dma_address, size, (int)direction);
85}
86
87static int dma32_map_sg(struct device *dev, struct scatterlist *sg,
88 int nents, enum dma_data_direction direction)
89{
90#ifdef CONFIG_PCI
91 if (dev->bus == &pci_bus_type)
92 return pci_map_sg(to_pci_dev(dev), sg, nents, (int)direction);
93#endif
94 return sbus_map_sg(dev, sg, nents, direction);
95}
96
97void dma32_unmap_sg(struct device *dev, struct scatterlist *sg,
98 int nents, enum dma_data_direction direction)
99{
100#ifdef CONFIG_PCI
101 if (dev->bus == &pci_bus_type) {
102 pci_unmap_sg(to_pci_dev(dev), sg, nents, (int)direction);
103 return;
104 }
105#endif
106 sbus_unmap_sg(dev, sg, nents, (int)direction);
107}
108
109static void dma32_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
110 size_t size,
111 enum dma_data_direction direction)
112{
113#ifdef CONFIG_PCI
114 if (dev->bus == &pci_bus_type) {
115 pci_dma_sync_single_for_cpu(to_pci_dev(dev), dma_handle,
116 size, (int)direction);
117 return;
118 }
119#endif
120 sbus_dma_sync_single_for_cpu(dev, dma_handle, size, (int) direction);
121}
122
123static void dma32_sync_single_for_device(struct device *dev,
124 dma_addr_t dma_handle, size_t size,
125 enum dma_data_direction direction)
126{
127#ifdef CONFIG_PCI
128 if (dev->bus == &pci_bus_type) {
129 pci_dma_sync_single_for_device(to_pci_dev(dev), dma_handle,
130 size, (int)direction);
131 return;
132 }
133#endif
134 sbus_dma_sync_single_for_device(dev, dma_handle, size, (int) direction);
135}
136
137static void dma32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
138 int nelems, enum dma_data_direction direction)
139{
140#ifdef CONFIG_PCI
141 if (dev->bus == &pci_bus_type) {
142 pci_dma_sync_sg_for_cpu(to_pci_dev(dev), sg,
143 nelems, (int)direction);
144 return;
145 }
146#endif
147 BUG();
148}
149
150static void dma32_sync_sg_for_device(struct device *dev,
151 struct scatterlist *sg, int nelems,
152 enum dma_data_direction direction)
153{
154#ifdef CONFIG_PCI
155 if (dev->bus == &pci_bus_type) {
156 pci_dma_sync_sg_for_device(to_pci_dev(dev), sg,
157 nelems, (int)direction);
158 return;
159 }
160#endif
161 BUG();
162}
163
164static const struct dma_ops dma32_dma_ops = {
165 .alloc_coherent = dma32_alloc_coherent,
166 .free_coherent = dma32_free_coherent,
167 .map_page = dma32_map_page,
168 .unmap_page = dma32_unmap_page,
169 .map_sg = dma32_map_sg,
170 .unmap_sg = dma32_unmap_sg,
171 .sync_single_for_cpu = dma32_sync_single_for_cpu,
172 .sync_single_for_device = dma32_sync_single_for_device,
173 .sync_sg_for_cpu = dma32_sync_sg_for_cpu,
174 .sync_sg_for_device = dma32_sync_sg_for_device,
175};
176
177const struct dma_ops *dma_ops = &dma32_dma_ops;
178EXPORT_SYMBOL(dma_ops);
diff --git a/arch/sparc/kernel/dma.h b/arch/sparc/kernel/dma.h
deleted file mode 100644
index f8d8951adb53..000000000000
--- a/arch/sparc/kernel/dma.h
+++ /dev/null
@@ -1,14 +0,0 @@
1void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp);
2void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba);
3dma_addr_t sbus_map_single(struct device *dev, void *va,
4 size_t len, int direction);
5void sbus_unmap_single(struct device *dev, dma_addr_t ba,
6 size_t n, int direction);
7int sbus_map_sg(struct device *dev, struct scatterlist *sg,
8 int n, int direction);
9void sbus_unmap_sg(struct device *dev, struct scatterlist *sg,
10 int n, int direction);
11void sbus_dma_sync_single_for_cpu(struct device *dev, dma_addr_t ba,
12 size_t size, int direction);
13void sbus_dma_sync_single_for_device(struct device *dev, dma_addr_t ba,
14 size_t size, int direction);
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 0aeaefe696b9..7690cc219ecc 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -353,7 +353,8 @@ static void dma_4u_free_coherent(struct device *dev, size_t size,
353 353
354static dma_addr_t dma_4u_map_page(struct device *dev, struct page *page, 354static dma_addr_t dma_4u_map_page(struct device *dev, struct page *page,
355 unsigned long offset, size_t sz, 355 unsigned long offset, size_t sz,
356 enum dma_data_direction direction) 356 enum dma_data_direction direction,
357 struct dma_attrs *attrs)
357{ 358{
358 struct iommu *iommu; 359 struct iommu *iommu;
359 struct strbuf *strbuf; 360 struct strbuf *strbuf;
@@ -474,7 +475,8 @@ do_flush_sync:
474} 475}
475 476
476static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr, 477static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
477 size_t sz, enum dma_data_direction direction) 478 size_t sz, enum dma_data_direction direction,
479 struct dma_attrs *attrs)
478{ 480{
479 struct iommu *iommu; 481 struct iommu *iommu;
480 struct strbuf *strbuf; 482 struct strbuf *strbuf;
@@ -520,7 +522,8 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
520} 522}
521 523
522static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist, 524static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist,
523 int nelems, enum dma_data_direction direction) 525 int nelems, enum dma_data_direction direction,
526 struct dma_attrs *attrs)
524{ 527{
525 struct scatterlist *s, *outs, *segstart; 528 struct scatterlist *s, *outs, *segstart;
526 unsigned long flags, handle, prot, ctx; 529 unsigned long flags, handle, prot, ctx;
@@ -691,7 +694,8 @@ static unsigned long fetch_sg_ctx(struct iommu *iommu, struct scatterlist *sg)
691} 694}
692 695
693static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist, 696static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist,
694 int nelems, enum dma_data_direction direction) 697 int nelems, enum dma_data_direction direction,
698 struct dma_attrs *attrs)
695{ 699{
696 unsigned long flags, ctx; 700 unsigned long flags, ctx;
697 struct scatterlist *sg; 701 struct scatterlist *sg;
@@ -822,7 +826,7 @@ static void dma_4u_sync_sg_for_cpu(struct device *dev,
822 spin_unlock_irqrestore(&iommu->lock, flags); 826 spin_unlock_irqrestore(&iommu->lock, flags);
823} 827}
824 828
825static const struct dma_ops sun4u_dma_ops = { 829static struct dma_map_ops sun4u_dma_ops = {
826 .alloc_coherent = dma_4u_alloc_coherent, 830 .alloc_coherent = dma_4u_alloc_coherent,
827 .free_coherent = dma_4u_free_coherent, 831 .free_coherent = dma_4u_free_coherent,
828 .map_page = dma_4u_map_page, 832 .map_page = dma_4u_map_page,
@@ -833,9 +837,11 @@ static const struct dma_ops sun4u_dma_ops = {
833 .sync_sg_for_cpu = dma_4u_sync_sg_for_cpu, 837 .sync_sg_for_cpu = dma_4u_sync_sg_for_cpu,
834}; 838};
835 839
836const struct dma_ops *dma_ops = &sun4u_dma_ops; 840struct dma_map_ops *dma_ops = &sun4u_dma_ops;
837EXPORT_SYMBOL(dma_ops); 841EXPORT_SYMBOL(dma_ops);
838 842
843extern int pci64_dma_supported(struct pci_dev *pdev, u64 device_mask);
844
839int dma_supported(struct device *dev, u64 device_mask) 845int dma_supported(struct device *dev, u64 device_mask)
840{ 846{
841 struct iommu *iommu = dev->archdata.iommu; 847 struct iommu *iommu = dev->archdata.iommu;
@@ -849,7 +855,7 @@ int dma_supported(struct device *dev, u64 device_mask)
849 855
850#ifdef CONFIG_PCI 856#ifdef CONFIG_PCI
851 if (dev->bus == &pci_bus_type) 857 if (dev->bus == &pci_bus_type)
852 return pci_dma_supported(to_pci_dev(dev), device_mask); 858 return pci64_dma_supported(to_pci_dev(dev), device_mask);
853#endif 859#endif
854 860
855 return 0; 861 return 0;
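
Note on the hunks above: every map/unmap callback gains a trailing struct dma_attrs * argument and the ops table becomes struct dma_map_ops, which is what lets sparc64 be driven through the generic dma-mapping helpers. A minimal sketch of that dispatch path, assuming the asm-generic dma-mapping-common style wrapper (the helper name is illustrative, not part of this patch):

/* Sketch: how a dma_map_single()-style wrapper reaches dma_4u_map_page()
 * once dma_ops is a struct dma_map_ops table (illustrative helper name). */
static inline dma_addr_t sketch_dma_map_single(struct device *dev, void *ptr,
					       size_t size,
					       enum dma_data_direction dir)
{
	struct dma_map_ops *ops = get_dma_ops(dev);

	/* Generic callers pass NULL attrs; the new argument only matters
	 * to implementations that honour DMA attributes. */
	return ops->map_page(dev, virt_to_page(ptr),
			     (unsigned long)ptr & ~PAGE_MASK,
			     size, dir, NULL);
}
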
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 87ea0d03d975..edbea232c617 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -48,8 +48,6 @@
48#include <asm/iommu.h> 48#include <asm/iommu.h>
49#include <asm/io-unit.h> 49#include <asm/io-unit.h>
50 50
51#include "dma.h"
52
53#define mmu_inval_dma_area(p, l) /* Anton pulled it out for 2.4.0-xx */ 51#define mmu_inval_dma_area(p, l) /* Anton pulled it out for 2.4.0-xx */
54 52
55static struct resource *_sparc_find_resource(struct resource *r, 53static struct resource *_sparc_find_resource(struct resource *r,
@@ -246,7 +244,8 @@ EXPORT_SYMBOL(sbus_set_sbus64);
246 * Typically devices use them for control blocks. 244 * Typically devices use them for control blocks.
247 * CPU may access them without any explicit flushing. 245 * CPU may access them without any explicit flushing.
248 */ 246 */
249void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp) 247static void *sbus_alloc_coherent(struct device *dev, size_t len,
248 dma_addr_t *dma_addrp, gfp_t gfp)
250{ 249{
251 struct of_device *op = to_of_device(dev); 250 struct of_device *op = to_of_device(dev);
252 unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK; 251 unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK;
@@ -299,7 +298,8 @@ err_nopages:
299 return NULL; 298 return NULL;
300} 299}
301 300
302void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba) 301static void sbus_free_coherent(struct device *dev, size_t n, void *p,
302 dma_addr_t ba)
303{ 303{
304 struct resource *res; 304 struct resource *res;
305 struct page *pgv; 305 struct page *pgv;
@@ -317,7 +317,7 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
317 317
318 n = (n + PAGE_SIZE-1) & PAGE_MASK; 318 n = (n + PAGE_SIZE-1) & PAGE_MASK;
319 if ((res->end-res->start)+1 != n) { 319 if ((res->end-res->start)+1 != n) {
320 printk("sbus_free_consistent: region 0x%lx asked 0x%lx\n", 320 printk("sbus_free_consistent: region 0x%lx asked 0x%zx\n",
321 (long)((res->end-res->start)+1), n); 321 (long)((res->end-res->start)+1), n);
322 return; 322 return;
323 } 323 }
@@ -337,8 +337,13 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
337 * CPU view of this memory may be inconsistent with 337 * CPU view of this memory may be inconsistent with
338 * a device view and explicit flushing is necessary. 338 * a device view and explicit flushing is necessary.
339 */ 339 */
340dma_addr_t sbus_map_single(struct device *dev, void *va, size_t len, int direction) 340static dma_addr_t sbus_map_page(struct device *dev, struct page *page,
341 unsigned long offset, size_t len,
342 enum dma_data_direction dir,
343 struct dma_attrs *attrs)
341{ 344{
345 void *va = page_address(page) + offset;
346
342 /* XXX why are some lengths signed, others unsigned? */ 347 /* XXX why are some lengths signed, others unsigned? */
343 if (len <= 0) { 348 if (len <= 0) {
344 return 0; 349 return 0;
@@ -350,12 +355,14 @@ dma_addr_t sbus_map_single(struct device *dev, void *va, size_t len, int directi
350 return mmu_get_scsi_one(dev, va, len); 355 return mmu_get_scsi_one(dev, va, len);
351} 356}
352 357
353void sbus_unmap_single(struct device *dev, dma_addr_t ba, size_t n, int direction) 358static void sbus_unmap_page(struct device *dev, dma_addr_t ba, size_t n,
359 enum dma_data_direction dir, struct dma_attrs *attrs)
354{ 360{
355 mmu_release_scsi_one(dev, ba, n); 361 mmu_release_scsi_one(dev, ba, n);
356} 362}
357 363
358int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, int direction) 364static int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n,
365 enum dma_data_direction dir, struct dma_attrs *attrs)
359{ 366{
360 mmu_get_scsi_sgl(dev, sg, n); 367 mmu_get_scsi_sgl(dev, sg, n);
361 368
@@ -366,19 +373,38 @@ int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, int direction
366 return n; 373 return n;
367} 374}
368 375
369void sbus_unmap_sg(struct device *dev, struct scatterlist *sg, int n, int direction) 376static void sbus_unmap_sg(struct device *dev, struct scatterlist *sg, int n,
377 enum dma_data_direction dir, struct dma_attrs *attrs)
370{ 378{
371 mmu_release_scsi_sgl(dev, sg, n); 379 mmu_release_scsi_sgl(dev, sg, n);
372} 380}
373 381
374void sbus_dma_sync_single_for_cpu(struct device *dev, dma_addr_t ba, size_t size, int direction) 382static void sbus_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
383 int n, enum dma_data_direction dir)
375{ 384{
385 BUG();
376} 386}
377 387
378void sbus_dma_sync_single_for_device(struct device *dev, dma_addr_t ba, size_t size, int direction) 388static void sbus_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
389 int n, enum dma_data_direction dir)
379{ 390{
391 BUG();
380} 392}
381 393
394struct dma_map_ops sbus_dma_ops = {
395 .alloc_coherent = sbus_alloc_coherent,
396 .free_coherent = sbus_free_coherent,
397 .map_page = sbus_map_page,
398 .unmap_page = sbus_unmap_page,
399 .map_sg = sbus_map_sg,
400 .unmap_sg = sbus_unmap_sg,
401 .sync_sg_for_cpu = sbus_sync_sg_for_cpu,
402 .sync_sg_for_device = sbus_sync_sg_for_device,
403};
404
405struct dma_map_ops *dma_ops = &sbus_dma_ops;
406EXPORT_SYMBOL(dma_ops);
407
382static int __init sparc_register_ioport(void) 408static int __init sparc_register_ioport(void)
383{ 409{
384 register_proc_sparc_ioport(); 410 register_proc_sparc_ioport();
@@ -395,7 +421,8 @@ arch_initcall(sparc_register_ioport);
395/* Allocate and map kernel buffer using consistent mode DMA for a device. 421/* Allocate and map kernel buffer using consistent mode DMA for a device.
396 * hwdev should be valid struct pci_dev pointer for PCI devices. 422 * hwdev should be valid struct pci_dev pointer for PCI devices.
397 */ 423 */
398void *pci_alloc_consistent(struct pci_dev *pdev, size_t len, dma_addr_t *pba) 424static void *pci32_alloc_coherent(struct device *dev, size_t len,
425 dma_addr_t *pba, gfp_t gfp)
399{ 426{
400 unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK; 427 unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK;
401 unsigned long va; 428 unsigned long va;
@@ -439,7 +466,6 @@ void *pci_alloc_consistent(struct pci_dev *pdev, size_t len, dma_addr_t *pba)
439 *pba = virt_to_phys(va); /* equals virt_to_bus (R.I.P.) for us. */ 466 *pba = virt_to_phys(va); /* equals virt_to_bus (R.I.P.) for us. */
440 return (void *) res->start; 467 return (void *) res->start;
441} 468}
442EXPORT_SYMBOL(pci_alloc_consistent);
443 469
444/* Free and unmap a consistent DMA buffer. 470/* Free and unmap a consistent DMA buffer.
445 * cpu_addr is what was returned from pci_alloc_consistent, 471 * cpu_addr is what was returned from pci_alloc_consistent,
@@ -449,7 +475,8 @@ EXPORT_SYMBOL(pci_alloc_consistent);
449 * References to the memory and mappings associated with cpu_addr/dma_addr 475 * References to the memory and mappings associated with cpu_addr/dma_addr
450 * past this call are illegal. 476 * past this call are illegal.
451 */ 477 */
452void pci_free_consistent(struct pci_dev *pdev, size_t n, void *p, dma_addr_t ba) 478static void pci32_free_coherent(struct device *dev, size_t n, void *p,
479 dma_addr_t ba)
453{ 480{
454 struct resource *res; 481 struct resource *res;
455 unsigned long pgp; 482 unsigned long pgp;
@@ -481,60 +508,18 @@ void pci_free_consistent(struct pci_dev *pdev, size_t n, void *p, dma_addr_t ba)
481 508
482 free_pages(pgp, get_order(n)); 509 free_pages(pgp, get_order(n));
483} 510}
484EXPORT_SYMBOL(pci_free_consistent);
485
486/* Map a single buffer of the indicated size for DMA in streaming mode.
487 * The 32-bit bus address to use is returned.
488 *
489 * Once the device is given the dma address, the device owns this memory
490 * until either pci_unmap_single or pci_dma_sync_single_* is performed.
491 */
492dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
493 int direction)
494{
495 BUG_ON(direction == PCI_DMA_NONE);
496 /* IIep is write-through, not flushing. */
497 return virt_to_phys(ptr);
498}
499EXPORT_SYMBOL(pci_map_single);
500
501/* Unmap a single streaming mode DMA translation. The dma_addr and size
502 * must match what was provided for in a previous pci_map_single call. All
503 * other usages are undefined.
504 *
505 * After this call, reads by the cpu to the buffer are guaranteed to see
506 * whatever the device wrote there.
507 */
508void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t ba, size_t size,
509 int direction)
510{
511 BUG_ON(direction == PCI_DMA_NONE);
512 if (direction != PCI_DMA_TODEVICE) {
513 mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
514 (size + PAGE_SIZE-1) & PAGE_MASK);
515 }
516}
517EXPORT_SYMBOL(pci_unmap_single);
518 511
519/* 512/*
520 * Same as pci_map_single, but with pages. 513 * Same as pci_map_single, but with pages.
521 */ 514 */
522dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page, 515static dma_addr_t pci32_map_page(struct device *dev, struct page *page,
523 unsigned long offset, size_t size, int direction) 516 unsigned long offset, size_t size,
517 enum dma_data_direction dir,
518 struct dma_attrs *attrs)
524{ 519{
525 BUG_ON(direction == PCI_DMA_NONE);
526 /* IIep is write-through, not flushing. */ 520 /* IIep is write-through, not flushing. */
527 return page_to_phys(page) + offset; 521 return page_to_phys(page) + offset;
528} 522}
529EXPORT_SYMBOL(pci_map_page);
530
531void pci_unmap_page(struct pci_dev *hwdev,
532 dma_addr_t dma_address, size_t size, int direction)
533{
534 BUG_ON(direction == PCI_DMA_NONE);
535 /* mmu_inval_dma_area XXX */
536}
537EXPORT_SYMBOL(pci_unmap_page);
538 523
539/* Map a set of buffers described by scatterlist in streaming 524/* Map a set of buffers described by scatterlist in streaming
540 * mode for DMA. This is the scather-gather version of the 525 * mode for DMA. This is the scather-gather version of the
@@ -551,13 +536,13 @@ EXPORT_SYMBOL(pci_unmap_page);
551 * Device ownership issues as mentioned above for pci_map_single are 536 * Device ownership issues as mentioned above for pci_map_single are
552 * the same here. 537 * the same here.
553 */ 538 */
554int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, 539static int pci32_map_sg(struct device *device, struct scatterlist *sgl,
555 int direction) 540 int nents, enum dma_data_direction dir,
541 struct dma_attrs *attrs)
556{ 542{
557 struct scatterlist *sg; 543 struct scatterlist *sg;
558 int n; 544 int n;
559 545
560 BUG_ON(direction == PCI_DMA_NONE);
561 /* IIep is write-through, not flushing. */ 546 /* IIep is write-through, not flushing. */
562 for_each_sg(sgl, sg, nents, n) { 547 for_each_sg(sgl, sg, nents, n) {
563 BUG_ON(page_address(sg_page(sg)) == NULL); 548 BUG_ON(page_address(sg_page(sg)) == NULL);
@@ -566,20 +551,19 @@ int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
566 } 551 }
567 return nents; 552 return nents;
568} 553}
569EXPORT_SYMBOL(pci_map_sg);
570 554
571/* Unmap a set of streaming mode DMA translations. 555/* Unmap a set of streaming mode DMA translations.
572 * Again, cpu read rules concerning calls here are the same as for 556 * Again, cpu read rules concerning calls here are the same as for
573 * pci_unmap_single() above. 557 * pci_unmap_single() above.
574 */ 558 */
575void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, 559static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl,
576 int direction) 560 int nents, enum dma_data_direction dir,
561 struct dma_attrs *attrs)
577{ 562{
578 struct scatterlist *sg; 563 struct scatterlist *sg;
579 int n; 564 int n;
580 565
581 BUG_ON(direction == PCI_DMA_NONE); 566 if (dir != PCI_DMA_TODEVICE) {
582 if (direction != PCI_DMA_TODEVICE) {
583 for_each_sg(sgl, sg, nents, n) { 567 for_each_sg(sgl, sg, nents, n) {
584 BUG_ON(page_address(sg_page(sg)) == NULL); 568 BUG_ON(page_address(sg_page(sg)) == NULL);
585 mmu_inval_dma_area( 569 mmu_inval_dma_area(
@@ -588,7 +572,6 @@ void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
588 } 572 }
589 } 573 }
590} 574}
591EXPORT_SYMBOL(pci_unmap_sg);
592 575
593/* Make physical memory consistent for a single 576/* Make physical memory consistent for a single
594 * streaming mode DMA translation before or after a transfer. 577 * streaming mode DMA translation before or after a transfer.
@@ -600,25 +583,23 @@ EXPORT_SYMBOL(pci_unmap_sg);
600 * must first perform a pci_dma_sync_for_device, and then the 583 * must first perform a pci_dma_sync_for_device, and then the
601 * device again owns the buffer. 584 * device again owns the buffer.
602 */ 585 */
603void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction) 586static void pci32_sync_single_for_cpu(struct device *dev, dma_addr_t ba,
587 size_t size, enum dma_data_direction dir)
604{ 588{
605 BUG_ON(direction == PCI_DMA_NONE); 589 if (dir != PCI_DMA_TODEVICE) {
606 if (direction != PCI_DMA_TODEVICE) {
607 mmu_inval_dma_area((unsigned long)phys_to_virt(ba), 590 mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
608 (size + PAGE_SIZE-1) & PAGE_MASK); 591 (size + PAGE_SIZE-1) & PAGE_MASK);
609 } 592 }
610} 593}
611EXPORT_SYMBOL(pci_dma_sync_single_for_cpu);
612 594
613void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction) 595static void pci32_sync_single_for_device(struct device *dev, dma_addr_t ba,
596 size_t size, enum dma_data_direction dir)
614{ 597{
615 BUG_ON(direction == PCI_DMA_NONE); 598 if (dir != PCI_DMA_TODEVICE) {
616 if (direction != PCI_DMA_TODEVICE) {
617 mmu_inval_dma_area((unsigned long)phys_to_virt(ba), 599 mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
618 (size + PAGE_SIZE-1) & PAGE_MASK); 600 (size + PAGE_SIZE-1) & PAGE_MASK);
619 } 601 }
620} 602}
621EXPORT_SYMBOL(pci_dma_sync_single_for_device);
622 603
623/* Make physical memory consistent for a set of streaming 604/* Make physical memory consistent for a set of streaming
624 * mode DMA translations after a transfer. 605 * mode DMA translations after a transfer.
@@ -626,13 +607,13 @@ EXPORT_SYMBOL(pci_dma_sync_single_for_device);
626 * The same as pci_dma_sync_single_* but for a scatter-gather list, 607 * The same as pci_dma_sync_single_* but for a scatter-gather list,
627 * same rules and usage. 608 * same rules and usage.
628 */ 609 */
629void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction) 610static void pci32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
611 int nents, enum dma_data_direction dir)
630{ 612{
631 struct scatterlist *sg; 613 struct scatterlist *sg;
632 int n; 614 int n;
633 615
634 BUG_ON(direction == PCI_DMA_NONE); 616 if (dir != PCI_DMA_TODEVICE) {
635 if (direction != PCI_DMA_TODEVICE) {
636 for_each_sg(sgl, sg, nents, n) { 617 for_each_sg(sgl, sg, nents, n) {
637 BUG_ON(page_address(sg_page(sg)) == NULL); 618 BUG_ON(page_address(sg_page(sg)) == NULL);
638 mmu_inval_dma_area( 619 mmu_inval_dma_area(
@@ -641,15 +622,14 @@ void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sgl, int
641 } 622 }
642 } 623 }
643} 624}
644EXPORT_SYMBOL(pci_dma_sync_sg_for_cpu);
645 625
646void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction) 626static void pci32_sync_sg_for_device(struct device *device, struct scatterlist *sgl,
627 int nents, enum dma_data_direction dir)
647{ 628{
648 struct scatterlist *sg; 629 struct scatterlist *sg;
649 int n; 630 int n;
650 631
651 BUG_ON(direction == PCI_DMA_NONE); 632 if (dir != PCI_DMA_TODEVICE) {
652 if (direction != PCI_DMA_TODEVICE) {
653 for_each_sg(sgl, sg, nents, n) { 633 for_each_sg(sgl, sg, nents, n) {
654 BUG_ON(page_address(sg_page(sg)) == NULL); 634 BUG_ON(page_address(sg_page(sg)) == NULL);
655 mmu_inval_dma_area( 635 mmu_inval_dma_area(
@@ -658,9 +638,49 @@ void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sgl,
658 } 638 }
659 } 639 }
660} 640}
661EXPORT_SYMBOL(pci_dma_sync_sg_for_device); 641
642struct dma_map_ops pci32_dma_ops = {
643 .alloc_coherent = pci32_alloc_coherent,
644 .free_coherent = pci32_free_coherent,
645 .map_page = pci32_map_page,
646 .map_sg = pci32_map_sg,
647 .unmap_sg = pci32_unmap_sg,
648 .sync_single_for_cpu = pci32_sync_single_for_cpu,
649 .sync_single_for_device = pci32_sync_single_for_device,
650 .sync_sg_for_cpu = pci32_sync_sg_for_cpu,
651 .sync_sg_for_device = pci32_sync_sg_for_device,
652};
653EXPORT_SYMBOL(pci32_dma_ops);
654
662#endif /* CONFIG_PCI */ 655#endif /* CONFIG_PCI */
663 656
657/*
658 * Return whether the given PCI device DMA address mask can be
659 * supported properly. For example, if your device can only drive the
660 * low 24-bits during PCI bus mastering, then you would pass
661 * 0x00ffffff as the mask to this function.
662 */
663int dma_supported(struct device *dev, u64 mask)
664{
665#ifdef CONFIG_PCI
666 if (dev->bus == &pci_bus_type)
667 return 1;
668#endif
669 return 0;
670}
671EXPORT_SYMBOL(dma_supported);
672
673int dma_set_mask(struct device *dev, u64 dma_mask)
674{
675#ifdef CONFIG_PCI
676 if (dev->bus == &pci_bus_type)
677 return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
678#endif
679 return -EOPNOTSUPP;
680}
681EXPORT_SYMBOL(dma_set_mask);
682
683
664#ifdef CONFIG_PROC_FS 684#ifdef CONFIG_PROC_FS
665 685
666static int 686static int
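
Note on the removals above: the arch-private pci_alloc_consistent()/pci_map_*() exports disappear, and 32-bit sparc PCI DMA is routed through pci32_dma_ops instead. Existing drivers keep working because the old entry points are expected to come from the generic pci-dma-compat wrappers layered on top of the dma_map_ops table. A sketch of that shim pattern, assuming it mirrors include/asm-generic/pci-dma-compat.h (not part of this file):

/* Sketch: the compat wrapper that replaces the removed export, reduced to
 * a thin call into the generic DMA API. */
static inline void *
sketch_pci_alloc_consistent(struct pci_dev *hwdev, size_t size,
			    dma_addr_t *dma_handle)
{
	return dma_alloc_coherent(hwdev == NULL ? NULL : &hwdev->dev,
				  size, dma_handle, GFP_ATOMIC);
}
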
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index 57859ad23547..c68648662802 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -1039,7 +1039,7 @@ static void ali_sound_dma_hack(struct pci_dev *pdev, int set_bit)
1039 pci_dev_put(ali_isa_bridge); 1039 pci_dev_put(ali_isa_bridge);
1040} 1040}
1041 1041
1042int pci_dma_supported(struct pci_dev *pdev, u64 device_mask) 1042int pci64_dma_supported(struct pci_dev *pdev, u64 device_mask)
1043{ 1043{
1044 u64 dma_addr_mask; 1044 u64 dma_addr_mask;
1045 1045
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 2485eaa23101..23c33ff9c31e 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -232,7 +232,8 @@ static void dma_4v_free_coherent(struct device *dev, size_t size, void *cpu,
232 232
233static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page, 233static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page,
234 unsigned long offset, size_t sz, 234 unsigned long offset, size_t sz,
235 enum dma_data_direction direction) 235 enum dma_data_direction direction,
236 struct dma_attrs *attrs)
236{ 237{
237 struct iommu *iommu; 238 struct iommu *iommu;
238 unsigned long flags, npages, oaddr; 239 unsigned long flags, npages, oaddr;
@@ -296,7 +297,8 @@ iommu_map_fail:
296} 297}
297 298
298static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr, 299static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr,
299 size_t sz, enum dma_data_direction direction) 300 size_t sz, enum dma_data_direction direction,
301 struct dma_attrs *attrs)
300{ 302{
301 struct pci_pbm_info *pbm; 303 struct pci_pbm_info *pbm;
302 struct iommu *iommu; 304 struct iommu *iommu;
@@ -336,7 +338,8 @@ static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr,
336} 338}
337 339
338static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, 340static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist,
339 int nelems, enum dma_data_direction direction) 341 int nelems, enum dma_data_direction direction,
342 struct dma_attrs *attrs)
340{ 343{
341 struct scatterlist *s, *outs, *segstart; 344 struct scatterlist *s, *outs, *segstart;
342 unsigned long flags, handle, prot; 345 unsigned long flags, handle, prot;
@@ -478,7 +481,8 @@ iommu_map_failed:
478} 481}
479 482
480static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist, 483static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
481 int nelems, enum dma_data_direction direction) 484 int nelems, enum dma_data_direction direction,
485 struct dma_attrs *attrs)
482{ 486{
483 struct pci_pbm_info *pbm; 487 struct pci_pbm_info *pbm;
484 struct scatterlist *sg; 488 struct scatterlist *sg;
@@ -521,29 +525,13 @@ static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
521 spin_unlock_irqrestore(&iommu->lock, flags); 525 spin_unlock_irqrestore(&iommu->lock, flags);
522} 526}
523 527
524static void dma_4v_sync_single_for_cpu(struct device *dev, 528static struct dma_map_ops sun4v_dma_ops = {
525 dma_addr_t bus_addr, size_t sz,
526 enum dma_data_direction direction)
527{
528 /* Nothing to do... */
529}
530
531static void dma_4v_sync_sg_for_cpu(struct device *dev,
532 struct scatterlist *sglist, int nelems,
533 enum dma_data_direction direction)
534{
535 /* Nothing to do... */
536}
537
538static const struct dma_ops sun4v_dma_ops = {
539 .alloc_coherent = dma_4v_alloc_coherent, 529 .alloc_coherent = dma_4v_alloc_coherent,
540 .free_coherent = dma_4v_free_coherent, 530 .free_coherent = dma_4v_free_coherent,
541 .map_page = dma_4v_map_page, 531 .map_page = dma_4v_map_page,
542 .unmap_page = dma_4v_unmap_page, 532 .unmap_page = dma_4v_unmap_page,
543 .map_sg = dma_4v_map_sg, 533 .map_sg = dma_4v_map_sg,
544 .unmap_sg = dma_4v_unmap_sg, 534 .unmap_sg = dma_4v_unmap_sg,
545 .sync_single_for_cpu = dma_4v_sync_single_for_cpu,
546 .sync_sg_for_cpu = dma_4v_sync_sg_for_cpu,
547}; 535};
548 536
549static void __devinit pci_sun4v_scan_bus(struct pci_pbm_info *pbm, 537static void __devinit pci_sun4v_scan_bus(struct pci_pbm_info *pbm,
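
Note on dropping the empty dma_4v_sync_* stubs above: with struct dma_map_ops, a hook that is left NULL simply means "nothing to sync", so no-op callbacks are unnecessary. A sketch of the generic check, assuming the dma-mapping-common behaviour (illustrative helper name):

/* Sketch: the generic sync helper skips ops that leave the hook NULL,
 * which is why the empty sun4v stubs can be removed. */
static inline void sketch_sync_single_for_cpu(struct device *dev,
					      dma_addr_t addr, size_t size,
					      enum dma_data_direction dir)
{
	struct dma_map_ops *ops = get_dma_ops(dev);

	if (ops->sync_single_for_cpu)
		ops->sync_single_for_cpu(dev, addr, size, dir);
}
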
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 4041f94e7724..18d67854a1b8 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -251,7 +251,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp)
251 } 251 }
252} 252}
253 253
254void __trigger_all_cpu_backtrace(void) 254void arch_trigger_all_cpu_backtrace(void)
255{ 255{
256 struct thread_info *tp = current_thread_info(); 256 struct thread_info *tp = current_thread_info();
257 struct pt_regs *regs = get_irq_regs(); 257 struct pt_regs *regs = get_irq_regs();
@@ -304,7 +304,7 @@ void __trigger_all_cpu_backtrace(void)
304 304
305static void sysrq_handle_globreg(int key, struct tty_struct *tty) 305static void sysrq_handle_globreg(int key, struct tty_struct *tty)
306{ 306{
307 __trigger_all_cpu_backtrace(); 307 arch_trigger_all_cpu_backtrace();
308} 308}
309 309
310static struct sysrq_key_op sparc_globalreg_op = { 310static struct sysrq_key_op sparc_globalreg_op = {
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 13ffa5df37d7..fc20fdc0f7f2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -38,7 +38,7 @@ config X86
38 select HAVE_FUNCTION_GRAPH_FP_TEST 38 select HAVE_FUNCTION_GRAPH_FP_TEST
39 select HAVE_FUNCTION_TRACE_MCOUNT_TEST 39 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
40 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE 40 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
41 select HAVE_FTRACE_SYSCALLS 41 select HAVE_SYSCALL_TRACEPOINTS
42 select HAVE_KVM 42 select HAVE_KVM
43 select HAVE_ARCH_KGDB 43 select HAVE_ARCH_KGDB
44 select HAVE_ARCH_TRACEHOOK 44 select HAVE_ARCH_TRACEHOOK
@@ -586,7 +586,6 @@ config GART_IOMMU
586 bool "GART IOMMU support" if EMBEDDED 586 bool "GART IOMMU support" if EMBEDDED
587 default y 587 default y
588 select SWIOTLB 588 select SWIOTLB
589 select AGP
590 depends on X86_64 && PCI 589 depends on X86_64 && PCI
591 ---help--- 590 ---help---
592 Support for full DMA access of devices with 32bit memory access only 591 Support for full DMA access of devices with 32bit memory access only
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index edb992ebef92..d28fad19654a 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -2355,7 +2355,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2355CONFIG_HAVE_DYNAMIC_FTRACE=y 2355CONFIG_HAVE_DYNAMIC_FTRACE=y
2356CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2356CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2357CONFIG_HAVE_HW_BRANCH_TRACER=y 2357CONFIG_HAVE_HW_BRANCH_TRACER=y
2358CONFIG_HAVE_FTRACE_SYSCALLS=y 2358CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
2359CONFIG_RING_BUFFER=y 2359CONFIG_RING_BUFFER=y
2360CONFIG_TRACING=y 2360CONFIG_TRACING=y
2361CONFIG_TRACING_SUPPORT=y 2361CONFIG_TRACING_SUPPORT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index cee1dd2e69b2..6c86acd847a4 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -2329,7 +2329,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2329CONFIG_HAVE_DYNAMIC_FTRACE=y 2329CONFIG_HAVE_DYNAMIC_FTRACE=y
2330CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2330CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2331CONFIG_HAVE_HW_BRANCH_TRACER=y 2331CONFIG_HAVE_HW_BRANCH_TRACER=y
2332CONFIG_HAVE_FTRACE_SYSCALLS=y 2332CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
2333CONFIG_RING_BUFFER=y 2333CONFIG_RING_BUFFER=y
2334CONFIG_TRACING=y 2334CONFIG_TRACING=y
2335CONFIG_TRACING_SUPPORT=y 2335CONFIG_TRACING_SUPPORT=y
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index bdf96f119f06..ac95995b7bad 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -25,6 +25,7 @@
25#ifdef CONFIG_AMD_IOMMU 25#ifdef CONFIG_AMD_IOMMU
26extern int amd_iommu_init(void); 26extern int amd_iommu_init(void);
27extern int amd_iommu_init_dma_ops(void); 27extern int amd_iommu_init_dma_ops(void);
28extern int amd_iommu_init_passthrough(void);
28extern void amd_iommu_detect(void); 29extern void amd_iommu_detect(void);
29extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 30extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
30extern void amd_iommu_flush_all_domains(void); 31extern void amd_iommu_flush_all_domains(void);
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 0c878caaa0a2..2a2cc7a78a81 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -143,22 +143,29 @@
143#define EVT_BUFFER_SIZE 8192 /* 512 entries */ 143#define EVT_BUFFER_SIZE 8192 /* 512 entries */
144#define EVT_LEN_MASK (0x9ULL << 56) 144#define EVT_LEN_MASK (0x9ULL << 56)
145 145
146#define PAGE_MODE_NONE 0x00
146#define PAGE_MODE_1_LEVEL 0x01 147#define PAGE_MODE_1_LEVEL 0x01
147#define PAGE_MODE_2_LEVEL 0x02 148#define PAGE_MODE_2_LEVEL 0x02
148#define PAGE_MODE_3_LEVEL 0x03 149#define PAGE_MODE_3_LEVEL 0x03
149 150#define PAGE_MODE_4_LEVEL 0x04
150#define IOMMU_PDE_NL_0 0x000ULL 151#define PAGE_MODE_5_LEVEL 0x05
151#define IOMMU_PDE_NL_1 0x200ULL 152#define PAGE_MODE_6_LEVEL 0x06
152#define IOMMU_PDE_NL_2 0x400ULL 153
153#define IOMMU_PDE_NL_3 0x600ULL 154#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
154 155#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
155#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL) 156 ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
156#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL) 157 (0xffffffffffffffffULL))
157#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL) 158#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
158 159#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
159#define IOMMU_MAP_SIZE_L1 (1ULL << 21) 160#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
160#define IOMMU_MAP_SIZE_L2 (1ULL << 30) 161 IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
161#define IOMMU_MAP_SIZE_L3 (1ULL << 39) 162#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
163
164#define PM_MAP_4k 0
165#define PM_ADDR_MASK 0x000ffffffffff000ULL
166#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
167 (~((1ULL << (12 + ((lvl) * 9))) - 1)))
168#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
162 169
163#define IOMMU_PTE_P (1ULL << 0) 170#define IOMMU_PTE_P (1ULL << 0)
164#define IOMMU_PTE_TV (1ULL << 1) 171#define IOMMU_PTE_TV (1ULL << 1)
@@ -167,11 +174,6 @@
167#define IOMMU_PTE_IR (1ULL << 61) 174#define IOMMU_PTE_IR (1ULL << 61)
168#define IOMMU_PTE_IW (1ULL << 62) 175#define IOMMU_PTE_IW (1ULL << 62)
169 176
170#define IOMMU_L1_PDE(address) \
171 ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
172#define IOMMU_L2_PDE(address) \
173 ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
174
175#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) 177#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
176#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) 178#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
177#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) 179#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
@@ -194,11 +196,14 @@
194#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ 196#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
195#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops 197#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
196 domain for an IOMMU */ 198 domain for an IOMMU */
199#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page
200 translation */
201
197extern bool amd_iommu_dump; 202extern bool amd_iommu_dump;
198#define DUMP_printk(format, arg...) \ 203#define DUMP_printk(format, arg...) \
199 do { \ 204 do { \
200 if (amd_iommu_dump) \ 205 if (amd_iommu_dump) \
201 printk(KERN_INFO "AMD IOMMU: " format, ## arg); \ 206 printk(KERN_INFO "AMD-Vi: " format, ## arg); \
202 } while(0); 207 } while(0);
203 208
204/* 209/*
@@ -226,6 +231,7 @@ struct protection_domain {
226 int mode; /* paging mode (0-6 levels) */ 231 int mode; /* paging mode (0-6 levels) */
227 u64 *pt_root; /* page table root pointer */ 232 u64 *pt_root; /* page table root pointer */
228 unsigned long flags; /* flags to find out type of domain */ 233 unsigned long flags; /* flags to find out type of domain */
234 bool updated; /* complete domain flush required */
229 unsigned dev_cnt; /* devices assigned to this domain */ 235 unsigned dev_cnt; /* devices assigned to this domain */
230 void *priv; /* private data */ 236 void *priv; /* private data */
231}; 237};
@@ -337,6 +343,9 @@ struct amd_iommu {
337 /* if one, we need to send a completion wait command */ 343 /* if one, we need to send a completion wait command */
338 bool need_sync; 344 bool need_sync;
339 345
346 /* becomes true if a command buffer reset is running */
347 bool reset_in_progress;
348
340 /* default dma_ops domain for that IOMMU */ 349 /* default dma_ops domain for that IOMMU */
341 struct dma_ops_domain *default_dom; 350 struct dma_ops_domain *default_dom;
342}; 351};
@@ -457,4 +466,7 @@ static inline void amd_iommu_stats_init(void) { }
457 466
458#endif /* CONFIG_AMD_IOMMU_STATS */ 467#endif /* CONFIG_AMD_IOMMU_STATS */
459 468
469/* some function prototypes */
470extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
471
460#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ 472#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
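
Note on the PM_LEVEL_* macros above: they generalize the old fixed L0/L1/L2 index macros, with level x covering 9 bits of the I/O virtual address starting at bit 12 + 9*x, so PM_LEVEL_SHIFT(0), (1) and (2) reproduce the removed >>12, >>21 and >>30 shifts. A small self-contained check of that arithmetic (the sample address is invented for illustration):

#include <stdio.h>
#include <stdint.h>

#define PM_LEVEL_SHIFT(x)    (12 + ((x) * 9))
#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)

int main(void)
{
	uint64_t iova = 0x12345678000ULL;	/* made-up example address */
	int level;

	for (level = 2; level >= 0; level--)	/* the old 3-level walk */
		printf("shift %2d, index %3llu\n", PM_LEVEL_SHIFT(level),
		       (unsigned long long)PM_LEVEL_INDEX(level, iova));
	return 0;
}
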
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 1c3f9435f1c9..0ee770d23d0e 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -55,6 +55,24 @@ extern int dma_set_mask(struct device *dev, u64 mask);
55extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, 55extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
56 dma_addr_t *dma_addr, gfp_t flag); 56 dma_addr_t *dma_addr, gfp_t flag);
57 57
58static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
59{
60 if (!dev->dma_mask)
61 return 0;
62
63 return addr + size <= *dev->dma_mask;
64}
65
66static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
67{
68 return paddr;
69}
70
71static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
72{
73 return daddr;
74}
75
58static inline void 76static inline void
59dma_cache_sync(struct device *dev, void *vaddr, size_t size, 77dma_cache_sync(struct device *dev, void *vaddr, size_t size,
60 enum dma_data_direction dir) 78 enum dma_data_direction dir)
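
Note on the helpers added above: dma_capable() is the check that swiotlb-style code uses to decide whether a buffer already sits under the device's DMA mask, while phys_to_dma()/dma_to_phys() are identity mappings on bare-metal x86 but give IOMMU or paravirt setups a translation hook. A tiny usage sketch (the helper name is illustrative):

/* Sketch: bounce only when the bus-translated address range does not fit
 * under the device's DMA mask. */
static bool sketch_needs_bounce(struct device *dev, phys_addr_t paddr,
				size_t size)
{
	return !dma_capable(dev, phys_to_dma(dev, paddr), size);
}
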
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index bd2c6511c887..db24c2278be0 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -28,13 +28,6 @@
28 28
29#endif 29#endif
30 30
31/* FIXME: I don't want to stay hardcoded */
32#ifdef CONFIG_X86_64
33# define FTRACE_SYSCALL_MAX 296
34#else
35# define FTRACE_SYSCALL_MAX 333
36#endif
37
38#ifdef CONFIG_FUNCTION_TRACER 31#ifdef CONFIG_FUNCTION_TRACER
39#define MCOUNT_ADDR ((long)(mcount)) 32#define MCOUNT_ADDR ((long)(mcount))
40#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ 33#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c86e5ed4af51..e63cf7d441e1 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -45,8 +45,8 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
45 void __user *, size_t *, loff_t *); 45 void __user *, size_t *, loff_t *);
46extern int unknown_nmi_panic; 46extern int unknown_nmi_panic;
47 47
48void __trigger_all_cpu_backtrace(void); 48void arch_trigger_all_cpu_backtrace(void);
49#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() 49#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
50 50
51static inline void localise_nmi_watchdog(void) 51static inline void localise_nmi_watchdog(void)
52{ 52{
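
Note on the rename above: exposing the hook as arch_trigger_all_cpu_backtrace, together with the self-referencing #define, lets generic code detect at compile time whether the architecture provides a backtrace IPI. A sketch of the consumer side, assuming it mirrors the linux/nmi.h pattern this rename targets:

/* Sketch: generic code keys off the #define to know whether an arch hook
 * exists, falling back to "not supported" otherwise. */
#ifdef arch_trigger_all_cpu_backtrace
static inline bool trigger_all_cpu_backtrace(void)
{
	arch_trigger_all_cpu_backtrace();
	return true;
}
#else
static inline bool trigger_all_cpu_backtrace(void)
{
	return false;
}
#endif
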
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index fa64e401589d..e7b7c938ae27 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,6 +84,16 @@ union cpuid10_edx {
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b 84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) 85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86 86
87/*
88 * We model BTS tracing as another fixed-mode PMC.
89 *
90 * We choose a value in the middle of the fixed counter range, since lower
91 * values are used by actual fixed counters and higher values are used
92 * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
93 */
94#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
95
96
87#ifdef CONFIG_PERF_COUNTERS 97#ifdef CONFIG_PERF_COUNTERS
88extern void init_hw_perf_counters(void); 98extern void init_hw_perf_counters(void);
89extern void perf_counters_lapic_init(void); 99extern void perf_counters_lapic_init(void);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index fad7d40b75f8..6f7786aea4fc 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,7 +95,7 @@ struct thread_info {
95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ 97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
98#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */ 98#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
99 99
100#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 100#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
101#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 101#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -118,17 +118,17 @@ struct thread_info {
118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
120#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) 120#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
121#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) 121#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
122 122
123/* work to do in syscall_trace_enter() */ 123/* work to do in syscall_trace_enter() */
124#define _TIF_WORK_SYSCALL_ENTRY \ 124#define _TIF_WORK_SYSCALL_ENTRY \
125 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \ 125 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
126 _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) 126 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
127 127
128/* work to do in syscall_trace_leave() */ 128/* work to do in syscall_trace_leave() */
129#define _TIF_WORK_SYSCALL_EXIT \ 129#define _TIF_WORK_SYSCALL_EXIT \
130 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ 130 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
131 _TIF_SYSCALL_FTRACE) 131 _TIF_SYSCALL_TRACEPOINT)
132 132
133/* work to do on interrupt/exception return */ 133/* work to do on interrupt/exception return */
134#define _TIF_WORK_MASK \ 134#define _TIF_WORK_MASK \
@@ -137,7 +137,8 @@ struct thread_info {
137 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) 137 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
138 138
139/* work to do on any return to user space */ 139/* work to do on any return to user space */
140#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE) 140#define _TIF_ALLWORK_MASK \
141 ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT)
141 142
142/* Only used for 64 bit */ 143/* Only used for 64 bit */
143#define _TIF_DO_NOTIFY_MASK \ 144#define _TIF_DO_NOTIFY_MASK \
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 066ef590d7e0..26d06e052a18 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -129,25 +129,34 @@ extern unsigned long node_remap_size[];
129#endif 129#endif
130 130
131/* sched_domains SD_NODE_INIT for NUMA machines */ 131/* sched_domains SD_NODE_INIT for NUMA machines */
132#define SD_NODE_INIT (struct sched_domain) { \ 132#define SD_NODE_INIT (struct sched_domain) { \
133 .min_interval = 8, \ 133 .min_interval = 8, \
134 .max_interval = 32, \ 134 .max_interval = 32, \
135 .busy_factor = 32, \ 135 .busy_factor = 32, \
136 .imbalance_pct = 125, \ 136 .imbalance_pct = 125, \
137 .cache_nice_tries = SD_CACHE_NICE_TRIES, \ 137 .cache_nice_tries = SD_CACHE_NICE_TRIES, \
138 .busy_idx = 3, \ 138 .busy_idx = 3, \
139 .idle_idx = SD_IDLE_IDX, \ 139 .idle_idx = SD_IDLE_IDX, \
140 .newidle_idx = SD_NEWIDLE_IDX, \ 140 .newidle_idx = SD_NEWIDLE_IDX, \
141 .wake_idx = 1, \ 141 .wake_idx = 1, \
142 .forkexec_idx = SD_FORKEXEC_IDX, \ 142 .forkexec_idx = SD_FORKEXEC_IDX, \
143 .flags = SD_LOAD_BALANCE \ 143 \
144 | SD_BALANCE_EXEC \ 144 .flags = 1*SD_LOAD_BALANCE \
145 | SD_BALANCE_FORK \ 145 | 1*SD_BALANCE_NEWIDLE \
146 | SD_WAKE_AFFINE \ 146 | 1*SD_BALANCE_EXEC \
147 | SD_WAKE_BALANCE \ 147 | 1*SD_BALANCE_FORK \
148 | SD_SERIALIZE, \ 148 | 0*SD_WAKE_IDLE \
149 .last_balance = jiffies, \ 149 | 1*SD_WAKE_AFFINE \
150 .balance_interval = 1, \ 150 | 1*SD_WAKE_BALANCE \
151 | 0*SD_SHARE_CPUPOWER \
152 | 0*SD_POWERSAVINGS_BALANCE \
153 | 0*SD_SHARE_PKG_RESOURCES \
154 | 1*SD_SERIALIZE \
155 | 1*SD_WAKE_IDLE_FAR \
156 | 0*SD_PREFER_SIBLING \
157 , \
158 .last_balance = jiffies, \
159 .balance_interval = 1, \
151} 160}
152 161
153#ifdef CONFIG_X86_64_ACPI_NUMA 162#ifdef CONFIG_X86_64_ACPI_NUMA
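
Note on the rewritten .flags initializer above: every scheduler-domain flag is listed with an explicit 1* or 0* multiplier, so toggling a flag is a one-character change and the disabled flags stay visible in the table. The pattern in isolation (flag values invented for the example, not the real SD_* bits):

#define FLAG_A (1 << 0)
#define FLAG_B (1 << 1)
#define FLAG_C (1 << 2)

/* 1* keeps a flag, 0* documents that it is deliberately off. */
static const unsigned int flags = 1*FLAG_A
				| 0*FLAG_B
				| 1*FLAG_C;	/* evaluates to FLAG_A | FLAG_C */
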
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 732a30706153..8deaada61bc8 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -345,6 +345,8 @@
345 345
346#ifdef __KERNEL__ 346#ifdef __KERNEL__
347 347
348#define NR_syscalls 337
349
348#define __ARCH_WANT_IPC_PARSE_VERSION 350#define __ARCH_WANT_IPC_PARSE_VERSION
349#define __ARCH_WANT_OLD_READDIR 351#define __ARCH_WANT_OLD_READDIR
350#define __ARCH_WANT_OLD_STAT 352#define __ARCH_WANT_OLD_STAT
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 900e1617e672..b9f3c60de5f7 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -688,6 +688,12 @@ __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
688#endif /* __NO_STUBS */ 688#endif /* __NO_STUBS */
689 689
690#ifdef __KERNEL__ 690#ifdef __KERNEL__
691
692#ifndef COMPILE_OFFSETS
693#include <asm/asm-offsets.h>
694#define NR_syscalls (__NR_syscall_max + 1)
695#endif
696
691/* 697/*
692 * "Conditional" syscalls 698 * "Conditional" syscalls
693 * 699 *
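
Note on the NR_syscalls addition above: on 64-bit it is derived from __NR_syscall_max, which comes out of the asm-offsets machinery (the COMPILE_OFFSETS guard avoids including asm-offsets.h while that header is itself being generated). A sketch of how such a constant can be produced, assuming the usual asm-offsets trick of sizing an array indexed by the __SYSCALL table (simplified, not the literal asm-offsets hunk):

/* Sketch: each __SYSCALL(nr, sym) entry marks slot nr of a char array,
 * so sizeof(array) - 1 is the highest syscall number. */
#define __NO_STUBS 1
#undef __SYSCALL
#define __SYSCALL(nr, sym) [nr] = 1,
static char syscalls[] = {
#include <asm/unistd.h>
};
/* later, in the offsets generator:
 *	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
 */
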
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 6c99f5037801..98f230f6a28d 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -41,9 +41,13 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
41static LIST_HEAD(iommu_pd_list); 41static LIST_HEAD(iommu_pd_list);
42static DEFINE_SPINLOCK(iommu_pd_list_lock); 42static DEFINE_SPINLOCK(iommu_pd_list_lock);
43 43
44#ifdef CONFIG_IOMMU_API 44/*
45 * Domain for untranslated devices - only allocated
46 * if iommu=pt passed on kernel cmd line.
47 */
48static struct protection_domain *pt_domain;
49
45static struct iommu_ops amd_iommu_ops; 50static struct iommu_ops amd_iommu_ops;
46#endif
47 51
48/* 52/*
49 * general struct to manage commands send to an IOMMU 53 * general struct to manage commands send to an IOMMU
@@ -55,16 +59,16 @@ struct iommu_cmd {
55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 59static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
56 struct unity_map_entry *e); 60 struct unity_map_entry *e);
57static struct dma_ops_domain *find_protection_domain(u16 devid); 61static struct dma_ops_domain *find_protection_domain(u16 devid);
58static u64* alloc_pte(struct protection_domain *dom, 62static u64 *alloc_pte(struct protection_domain *domain,
59 unsigned long address, u64 63 unsigned long address, int end_lvl,
60 **pte_page, gfp_t gfp); 64 u64 **pte_page, gfp_t gfp);
61static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, 65static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
62 unsigned long start_page, 66 unsigned long start_page,
63 unsigned int pages); 67 unsigned int pages);
64 68static void reset_iommu_command_buffer(struct amd_iommu *iommu);
65#ifndef BUS_NOTIFY_UNBOUND_DRIVER 69static u64 *fetch_pte(struct protection_domain *domain,
66#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 70 unsigned long address, int map_size);
67#endif 71static void update_domain(struct protection_domain *domain);
68 72
69#ifdef CONFIG_AMD_IOMMU_STATS 73#ifdef CONFIG_AMD_IOMMU_STATS
70 74
@@ -138,7 +142,25 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
138 * 142 *
139 ****************************************************************************/ 143 ****************************************************************************/
140 144
141static void iommu_print_event(void *__evt) 145static void dump_dte_entry(u16 devid)
146{
147 int i;
148
149 for (i = 0; i < 8; ++i)
150 pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
151 amd_iommu_dev_table[devid].data[i]);
152}
153
154static void dump_command(unsigned long phys_addr)
155{
156 struct iommu_cmd *cmd = phys_to_virt(phys_addr);
157 int i;
158
159 for (i = 0; i < 4; ++i)
160 pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
161}
162
163static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
142{ 164{
143 u32 *event = __evt; 165 u32 *event = __evt;
144 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; 166 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
@@ -147,7 +169,7 @@ static void iommu_print_event(void *__evt)
147 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; 169 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
148 u64 address = (u64)(((u64)event[3]) << 32) | event[2]; 170 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
149 171
150 printk(KERN_ERR "AMD IOMMU: Event logged ["); 172 printk(KERN_ERR "AMD-Vi: Event logged [");
151 173
152 switch (type) { 174 switch (type) {
153 case EVENT_TYPE_ILL_DEV: 175 case EVENT_TYPE_ILL_DEV:
@@ -155,6 +177,7 @@ static void iommu_print_event(void *__evt)
155 "address=0x%016llx flags=0x%04x]\n", 177 "address=0x%016llx flags=0x%04x]\n",
156 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), 178 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
157 address, flags); 179 address, flags);
180 dump_dte_entry(devid);
158 break; 181 break;
159 case EVENT_TYPE_IO_FAULT: 182 case EVENT_TYPE_IO_FAULT:
160 printk("IO_PAGE_FAULT device=%02x:%02x.%x " 183 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
@@ -176,6 +199,8 @@ static void iommu_print_event(void *__evt)
176 break; 199 break;
177 case EVENT_TYPE_ILL_CMD: 200 case EVENT_TYPE_ILL_CMD:
178 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 201 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
202 reset_iommu_command_buffer(iommu);
203 dump_command(address);
179 break; 204 break;
180 case EVENT_TYPE_CMD_HARD_ERR: 205 case EVENT_TYPE_CMD_HARD_ERR:
181 printk("COMMAND_HARDWARE_ERROR address=0x%016llx " 206 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
@@ -209,7 +234,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
209 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); 234 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
210 235
211 while (head != tail) { 236 while (head != tail) {
212 iommu_print_event(iommu->evt_buf + head); 237 iommu_print_event(iommu, iommu->evt_buf + head);
213 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; 238 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
214 } 239 }
215 240
@@ -296,8 +321,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
296 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 321 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
297 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 322 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
298 323
299 if (unlikely(i == EXIT_LOOP_COUNT)) 324 if (unlikely(i == EXIT_LOOP_COUNT)) {
300 panic("AMD IOMMU: Completion wait loop failed\n"); 325 spin_unlock(&iommu->lock);
326 reset_iommu_command_buffer(iommu);
327 spin_lock(&iommu->lock);
328 }
301} 329}
302 330
303/* 331/*
@@ -445,47 +473,78 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
445} 473}
446 474
447/* 475/*
476 * This function flushes one domain on one IOMMU
477 */
478static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid)
479{
480 struct iommu_cmd cmd;
481 unsigned long flags;
482
483 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
484 domid, 1, 1);
485
486 spin_lock_irqsave(&iommu->lock, flags);
487 __iommu_queue_command(iommu, &cmd);
488 __iommu_completion_wait(iommu);
489 __iommu_wait_for_completion(iommu);
490 spin_unlock_irqrestore(&iommu->lock, flags);
491}
492
493static void flush_all_domains_on_iommu(struct amd_iommu *iommu)
494{
495 int i;
496
497 for (i = 1; i < MAX_DOMAIN_ID; ++i) {
498 if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
499 continue;
500 flush_domain_on_iommu(iommu, i);
501 }
502
503}
504
505/*
448 * This function is used to flush the IO/TLB for a given protection domain 506 * This function is used to flush the IO/TLB for a given protection domain
449 * on every IOMMU in the system 507 * on every IOMMU in the system
450 */ 508 */
451static void iommu_flush_domain(u16 domid) 509static void iommu_flush_domain(u16 domid)
452{ 510{
453 unsigned long flags;
454 struct amd_iommu *iommu; 511 struct amd_iommu *iommu;
455 struct iommu_cmd cmd;
456 512
457 INC_STATS_COUNTER(domain_flush_all); 513 INC_STATS_COUNTER(domain_flush_all);
458 514
459 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 515 for_each_iommu(iommu)
460 domid, 1, 1); 516 flush_domain_on_iommu(iommu, domid);
461
462 for_each_iommu(iommu) {
463 spin_lock_irqsave(&iommu->lock, flags);
464 __iommu_queue_command(iommu, &cmd);
465 __iommu_completion_wait(iommu);
466 __iommu_wait_for_completion(iommu);
467 spin_unlock_irqrestore(&iommu->lock, flags);
468 }
469} 517}
470 518
471void amd_iommu_flush_all_domains(void) 519void amd_iommu_flush_all_domains(void)
472{ 520{
521 struct amd_iommu *iommu;
522
523 for_each_iommu(iommu)
524 flush_all_domains_on_iommu(iommu);
525}
526
527static void flush_all_devices_for_iommu(struct amd_iommu *iommu)
528{
473 int i; 529 int i;
474 530
475 for (i = 1; i < MAX_DOMAIN_ID; ++i) { 531 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
476 if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) 532 if (iommu != amd_iommu_rlookup_table[i])
477 continue; 533 continue;
478 iommu_flush_domain(i); 534
535 iommu_queue_inv_dev_entry(iommu, i);
536 iommu_completion_wait(iommu);
479 } 537 }
480} 538}
481 539
482void amd_iommu_flush_all_devices(void) 540static void flush_devices_by_domain(struct protection_domain *domain)
483{ 541{
484 struct amd_iommu *iommu; 542 struct amd_iommu *iommu;
485 int i; 543 int i;
486 544
487 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 545 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
488 if (amd_iommu_pd_table[i] == NULL) 546 if ((domain == NULL && amd_iommu_pd_table[i] == NULL) ||
547 (amd_iommu_pd_table[i] != domain))
489 continue; 548 continue;
490 549
491 iommu = amd_iommu_rlookup_table[i]; 550 iommu = amd_iommu_rlookup_table[i];
@@ -497,6 +556,27 @@ void amd_iommu_flush_all_devices(void)
497 } 556 }
498} 557}
499 558
559static void reset_iommu_command_buffer(struct amd_iommu *iommu)
560{
561 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
562
563 if (iommu->reset_in_progress)
564 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
565
566 iommu->reset_in_progress = true;
567
568 amd_iommu_reset_cmd_buffer(iommu);
569 flush_all_devices_for_iommu(iommu);
570 flush_all_domains_on_iommu(iommu);
571
572 iommu->reset_in_progress = false;
573}
574
575void amd_iommu_flush_all_devices(void)
576{
577 flush_devices_by_domain(NULL);
578}
579
500/**************************************************************************** 580/****************************************************************************
501 * 581 *
502 * The functions below are used the create the page table mappings for 582 * The functions below are used the create the page table mappings for
@@ -514,18 +594,21 @@ void amd_iommu_flush_all_devices(void)
514static int iommu_map_page(struct protection_domain *dom, 594static int iommu_map_page(struct protection_domain *dom,
515 unsigned long bus_addr, 595 unsigned long bus_addr,
516 unsigned long phys_addr, 596 unsigned long phys_addr,
517 int prot) 597 int prot,
598 int map_size)
518{ 599{
519 u64 __pte, *pte; 600 u64 __pte, *pte;
520 601
521 bus_addr = PAGE_ALIGN(bus_addr); 602 bus_addr = PAGE_ALIGN(bus_addr);
522 phys_addr = PAGE_ALIGN(phys_addr); 603 phys_addr = PAGE_ALIGN(phys_addr);
523 604
524 /* only support 512GB address spaces for now */ 605 BUG_ON(!PM_ALIGNED(map_size, bus_addr));
525 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) 606 BUG_ON(!PM_ALIGNED(map_size, phys_addr));
607
608 if (!(prot & IOMMU_PROT_MASK))
526 return -EINVAL; 609 return -EINVAL;
527 610
528 pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); 611 pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
529 612
530 if (IOMMU_PTE_PRESENT(*pte)) 613 if (IOMMU_PTE_PRESENT(*pte))
531 return -EBUSY; 614 return -EBUSY;
@@ -538,29 +621,18 @@ static int iommu_map_page(struct protection_domain *dom,
538 621
539 *pte = __pte; 622 *pte = __pte;
540 623
624 update_domain(dom);
625
541 return 0; 626 return 0;
542} 627}
543 628
544static void iommu_unmap_page(struct protection_domain *dom, 629static void iommu_unmap_page(struct protection_domain *dom,
545 unsigned long bus_addr) 630 unsigned long bus_addr, int map_size)
546{ 631{
547 u64 *pte; 632 u64 *pte = fetch_pte(dom, bus_addr, map_size);
548
549 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
550
551 if (!IOMMU_PTE_PRESENT(*pte))
552 return;
553
554 pte = IOMMU_PTE_PAGE(*pte);
555 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
556 633
557 if (!IOMMU_PTE_PRESENT(*pte)) 634 if (pte)
558 return; 635 *pte = 0;
559
560 pte = IOMMU_PTE_PAGE(*pte);
561 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
562
563 *pte = 0;
564} 636}
565 637
566/* 638/*
@@ -615,7 +687,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
615 687
616 for (addr = e->address_start; addr < e->address_end; 688 for (addr = e->address_start; addr < e->address_end;
617 addr += PAGE_SIZE) { 689 addr += PAGE_SIZE) {
618 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); 690 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
691 PM_MAP_4k);
619 if (ret) 692 if (ret)
620 return ret; 693 return ret;
621 /* 694 /*
@@ -670,24 +743,29 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
670 * This function checks if there is a PTE for a given dma address. If 743 * This function checks if there is a PTE for a given dma address. If
671 * there is one, it returns the pointer to it. 744 * there is one, it returns the pointer to it.
672 */ 745 */
673static u64* fetch_pte(struct protection_domain *domain, 746static u64 *fetch_pte(struct protection_domain *domain,
674 unsigned long address) 747 unsigned long address, int map_size)
675{ 748{
749 int level;
676 u64 *pte; 750 u64 *pte;
677 751
678 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; 752 level = domain->mode - 1;
753 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
679 754
680 if (!IOMMU_PTE_PRESENT(*pte)) 755 while (level > map_size) {
681 return NULL; 756 if (!IOMMU_PTE_PRESENT(*pte))
757 return NULL;
682 758
683 pte = IOMMU_PTE_PAGE(*pte); 759 level -= 1;
684 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
685 760
686 if (!IOMMU_PTE_PRESENT(*pte)) 761 pte = IOMMU_PTE_PAGE(*pte);
687 return NULL; 762 pte = &pte[PM_LEVEL_INDEX(level, address)];
688 763
689 pte = IOMMU_PTE_PAGE(*pte); 764 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
690 pte = &pte[IOMMU_PTE_L0_INDEX(address)]; 765 pte = NULL;
766 break;
767 }
768 }
691 769
692 return pte; 770 return pte;
693} 771}
@@ -727,7 +805,7 @@ static int alloc_new_range(struct amd_iommu *iommu,
727 u64 *pte, *pte_page; 805 u64 *pte, *pte_page;
728 806
729 for (i = 0; i < num_ptes; ++i) { 807 for (i = 0; i < num_ptes; ++i) {
730 pte = alloc_pte(&dma_dom->domain, address, 808 pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
731 &pte_page, gfp); 809 &pte_page, gfp);
732 if (!pte) 810 if (!pte)
733 goto out_free; 811 goto out_free;
@@ -760,16 +838,20 @@ static int alloc_new_range(struct amd_iommu *iommu,
760 for (i = dma_dom->aperture[index]->offset; 838 for (i = dma_dom->aperture[index]->offset;
761 i < dma_dom->aperture_size; 839 i < dma_dom->aperture_size;
762 i += PAGE_SIZE) { 840 i += PAGE_SIZE) {
763 u64 *pte = fetch_pte(&dma_dom->domain, i); 841 u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
764 if (!pte || !IOMMU_PTE_PRESENT(*pte)) 842 if (!pte || !IOMMU_PTE_PRESENT(*pte))
765 continue; 843 continue;
766 844
767 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); 845 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
768 } 846 }
769 847
848 update_domain(&dma_dom->domain);
849
770 return 0; 850 return 0;
771 851
772out_free: 852out_free:
853 update_domain(&dma_dom->domain);
854
773 free_page((unsigned long)dma_dom->aperture[index]->bitmap); 855 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
774 856
775 kfree(dma_dom->aperture[index]); 857 kfree(dma_dom->aperture[index]);
@@ -1009,7 +1091,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1009 dma_dom->domain.id = domain_id_alloc(); 1091 dma_dom->domain.id = domain_id_alloc();
1010 if (dma_dom->domain.id == 0) 1092 if (dma_dom->domain.id == 0)
1011 goto free_dma_dom; 1093 goto free_dma_dom;
1012 dma_dom->domain.mode = PAGE_MODE_3_LEVEL; 1094 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1013 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1095 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1014 dma_dom->domain.flags = PD_DMA_OPS_MASK; 1096 dma_dom->domain.flags = PD_DMA_OPS_MASK;
1015 dma_dom->domain.priv = dma_dom; 1097 dma_dom->domain.priv = dma_dom;
@@ -1063,6 +1145,41 @@ static struct protection_domain *domain_for_device(u16 devid)
1063 return dom; 1145 return dom;
1064} 1146}
1065 1147
1148static void set_dte_entry(u16 devid, struct protection_domain *domain)
1149{
1150 u64 pte_root = virt_to_phys(domain->pt_root);
1151
1152 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1153 << DEV_ENTRY_MODE_SHIFT;
1154 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1155
1156 amd_iommu_dev_table[devid].data[2] = domain->id;
1157 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1158 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1159
1160 amd_iommu_pd_table[devid] = domain;
1161}
1162
1163/*
 1164 * If a device is not yet associated with a domain, this function
 1165 * assigns it to the domain and makes it visible to the hardware
1166 */
1167static void __attach_device(struct amd_iommu *iommu,
1168 struct protection_domain *domain,
1169 u16 devid)
1170{
1171 /* lock domain */
1172 spin_lock(&domain->lock);
1173
1174 /* update DTE entry */
1175 set_dte_entry(devid, domain);
1176
1177 domain->dev_cnt += 1;
1178
1179 /* ready */
1180 spin_unlock(&domain->lock);
1181}
1182
1066/* 1183/*
1067 * If a device is not yet associated with a domain, this function 1184 * If a device is not yet associated with a domain, this function
1068 * assigns it to the domain and makes it visible to the hardware 1185 * assigns it to the domain and makes it visible to the hardware
@@ -1072,27 +1189,16 @@ static void attach_device(struct amd_iommu *iommu,
1072 u16 devid) 1189 u16 devid)
1073{ 1190{
1074 unsigned long flags; 1191 unsigned long flags;
1075 u64 pte_root = virt_to_phys(domain->pt_root);
1076
1077 domain->dev_cnt += 1;
1078
1079 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1080 << DEV_ENTRY_MODE_SHIFT;
1081 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1082 1192
1083 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1193 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1084 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1194 __attach_device(iommu, domain, devid);
1085 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1086 amd_iommu_dev_table[devid].data[2] = domain->id;
1087
1088 amd_iommu_pd_table[devid] = domain;
1089 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1195 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1090 1196
1091 /* 1197 /*
1092 * We might boot into a crash-kernel here. The crashed kernel 1198 * We might boot into a crash-kernel here. The crashed kernel
1093 * left the caches in the IOMMU dirty. So we have to flush 1199 * left the caches in the IOMMU dirty. So we have to flush
1094 * here to evict all dirty stuff. 1200 * here to evict all dirty stuff.
1095 */ 1201 */
1096 iommu_queue_inv_dev_entry(iommu, devid); 1202 iommu_queue_inv_dev_entry(iommu, devid);
1097 iommu_flush_tlb_pde(iommu, domain->id); 1203 iommu_flush_tlb_pde(iommu, domain->id);
1098} 1204}
@@ -1119,6 +1225,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid)
1119 1225
1120 /* ready */ 1226 /* ready */
1121 spin_unlock(&domain->lock); 1227 spin_unlock(&domain->lock);
1228
1229 /*
1230 * If we run in passthrough mode the device must be assigned to the
1231 * passthrough domain if it is detached from any other domain
1232 */
1233 if (iommu_pass_through) {
1234 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
1235 __attach_device(iommu, pt_domain, devid);
1236 }
1122} 1237}
1123 1238
1124/* 1239/*
@@ -1164,6 +1279,8 @@ static int device_change_notifier(struct notifier_block *nb,
1164 case BUS_NOTIFY_UNBOUND_DRIVER: 1279 case BUS_NOTIFY_UNBOUND_DRIVER:
1165 if (!domain) 1280 if (!domain)
1166 goto out; 1281 goto out;
1282 if (iommu_pass_through)
1283 break;
1167 detach_device(domain, devid); 1284 detach_device(domain, devid);
1168 break; 1285 break;
1169 case BUS_NOTIFY_ADD_DEVICE: 1286 case BUS_NOTIFY_ADD_DEVICE:
@@ -1292,39 +1409,91 @@ static int get_device_resources(struct device *dev,
1292 return 1; 1409 return 1;
1293} 1410}
1294 1411
1412static void update_device_table(struct protection_domain *domain)
1413{
1414 unsigned long flags;
1415 int i;
1416
1417 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
1418 if (amd_iommu_pd_table[i] != domain)
1419 continue;
1420 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1421 set_dte_entry(i, domain);
1422 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1423 }
1424}
1425
1426static void update_domain(struct protection_domain *domain)
1427{
1428 if (!domain->updated)
1429 return;
1430
1431 update_device_table(domain);
1432 flush_devices_by_domain(domain);
1433 iommu_flush_domain(domain->id);
1434
1435 domain->updated = false;
1436}
1437
1295/* 1438/*
1296 * If the pte_page is not yet allocated this function is called 1439 * This function is used to add another level to an IO page table. Adding
1440 * another level increases the size of the address space by 9 bits to a size up
1441 * to 64 bits.
1297 */ 1442 */
1298static u64* alloc_pte(struct protection_domain *dom, 1443static bool increase_address_space(struct protection_domain *domain,
1299 unsigned long address, u64 **pte_page, gfp_t gfp) 1444 gfp_t gfp)
1445{
1446 u64 *pte;
1447
1448 if (domain->mode == PAGE_MODE_6_LEVEL)
 1449 /* address space is already 64 bits wide */
1450 return false;
1451
1452 pte = (void *)get_zeroed_page(gfp);
1453 if (!pte)
1454 return false;
1455
1456 *pte = PM_LEVEL_PDE(domain->mode,
1457 virt_to_phys(domain->pt_root));
1458 domain->pt_root = pte;
1459 domain->mode += 1;
1460 domain->updated = true;
1461
1462 return true;
1463}
1464
1465static u64 *alloc_pte(struct protection_domain *domain,
1466 unsigned long address,
1467 int end_lvl,
1468 u64 **pte_page,
1469 gfp_t gfp)
1300{ 1470{
1301 u64 *pte, *page; 1471 u64 *pte, *page;
1472 int level;
1302 1473
1303 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; 1474 while (address > PM_LEVEL_SIZE(domain->mode))
1475 increase_address_space(domain, gfp);
1304 1476
1305 if (!IOMMU_PTE_PRESENT(*pte)) { 1477 level = domain->mode - 1;
1306 page = (u64 *)get_zeroed_page(gfp); 1478 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1307 if (!page)
1308 return NULL;
1309 *pte = IOMMU_L2_PDE(virt_to_phys(page));
1310 }
1311 1479
1312 pte = IOMMU_PTE_PAGE(*pte); 1480 while (level > end_lvl) {
1313 pte = &pte[IOMMU_PTE_L1_INDEX(address)]; 1481 if (!IOMMU_PTE_PRESENT(*pte)) {
1482 page = (u64 *)get_zeroed_page(gfp);
1483 if (!page)
1484 return NULL;
1485 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1486 }
1314 1487
1315 if (!IOMMU_PTE_PRESENT(*pte)) { 1488 level -= 1;
1316 page = (u64 *)get_zeroed_page(gfp);
1317 if (!page)
1318 return NULL;
1319 *pte = IOMMU_L1_PDE(virt_to_phys(page));
1320 }
1321 1489
1322 pte = IOMMU_PTE_PAGE(*pte); 1490 pte = IOMMU_PTE_PAGE(*pte);
1323 1491
1324 if (pte_page) 1492 if (pte_page && level == end_lvl)
1325 *pte_page = pte; 1493 *pte_page = pte;
1326 1494
1327 pte = &pte[IOMMU_PTE_L0_INDEX(address)]; 1495 pte = &pte[PM_LEVEL_INDEX(level, address)];
1496 }
1328 1497
1329 return pte; 1498 return pte;
1330} 1499}
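
To make the comment above increase_address_space() concrete, here is the level-to-address-width arithmetic it implies (a back-of-the-envelope sketch; the exact bound checked by PM_LEVEL_SIZE() in the real headers is assumed to match):

	/* levels (mode)   address bits        covered space
	 *      3          3 * 9 + 12 = 39     512 GB
	 *      4          4 * 9 + 12 = 48     256 TB
	 *      5          5 * 9 + 12 = 57     128 PB
	 *      6          6 * 9 + 12 = 66     capped at 64 bits
	 */
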
@@ -1344,10 +1513,13 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1344 1513
1345 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; 1514 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1346 if (!pte) { 1515 if (!pte) {
1347 pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); 1516 pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
1517 GFP_ATOMIC);
1348 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; 1518 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1349 } else 1519 } else
1350 pte += IOMMU_PTE_L0_INDEX(address); 1520 pte += PM_LEVEL_INDEX(0, address);
1521
1522 update_domain(&dom->domain);
1351 1523
1352 return pte; 1524 return pte;
1353} 1525}
@@ -1409,7 +1581,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1409 if (!pte) 1581 if (!pte)
1410 return; 1582 return;
1411 1583
1412 pte += IOMMU_PTE_L0_INDEX(address); 1584 pte += PM_LEVEL_INDEX(0, address);
1413 1585
1414 WARN_ON(!*pte); 1586 WARN_ON(!*pte);
1415 1587
@@ -1988,19 +2160,47 @@ static void cleanup_domain(struct protection_domain *domain)
1988 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2160 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1989} 2161}
1990 2162
1991static int amd_iommu_domain_init(struct iommu_domain *dom) 2163static void protection_domain_free(struct protection_domain *domain)
2164{
2165 if (!domain)
2166 return;
2167
2168 if (domain->id)
2169 domain_id_free(domain->id);
2170
2171 kfree(domain);
2172}
2173
2174static struct protection_domain *protection_domain_alloc(void)
1992{ 2175{
1993 struct protection_domain *domain; 2176 struct protection_domain *domain;
1994 2177
1995 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 2178 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1996 if (!domain) 2179 if (!domain)
1997 return -ENOMEM; 2180 return NULL;
1998 2181
1999 spin_lock_init(&domain->lock); 2182 spin_lock_init(&domain->lock);
2000 domain->mode = PAGE_MODE_3_LEVEL;
2001 domain->id = domain_id_alloc(); 2183 domain->id = domain_id_alloc();
2002 if (!domain->id) 2184 if (!domain->id)
2185 goto out_err;
2186
2187 return domain;
2188
2189out_err:
2190 kfree(domain);
2191
2192 return NULL;
2193}
2194
2195static int amd_iommu_domain_init(struct iommu_domain *dom)
2196{
2197 struct protection_domain *domain;
2198
2199 domain = protection_domain_alloc();
2200 if (!domain)
2003 goto out_free; 2201 goto out_free;
2202
2203 domain->mode = PAGE_MODE_3_LEVEL;
2004 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); 2204 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2005 if (!domain->pt_root) 2205 if (!domain->pt_root)
2006 goto out_free; 2206 goto out_free;
@@ -2010,7 +2210,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom)
2010 return 0; 2210 return 0;
2011 2211
2012out_free: 2212out_free:
2013 kfree(domain); 2213 protection_domain_free(domain);
2014 2214
2015 return -ENOMEM; 2215 return -ENOMEM;
2016} 2216}
@@ -2115,7 +2315,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
2115 paddr &= PAGE_MASK; 2315 paddr &= PAGE_MASK;
2116 2316
2117 for (i = 0; i < npages; ++i) { 2317 for (i = 0; i < npages; ++i) {
2118 ret = iommu_map_page(domain, iova, paddr, prot); 2318 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
2119 if (ret) 2319 if (ret)
2120 return ret; 2320 return ret;
2121 2321
@@ -2136,7 +2336,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2136 iova &= PAGE_MASK; 2336 iova &= PAGE_MASK;
2137 2337
2138 for (i = 0; i < npages; ++i) { 2338 for (i = 0; i < npages; ++i) {
2139 iommu_unmap_page(domain, iova); 2339 iommu_unmap_page(domain, iova, PM_MAP_4k);
2140 iova += PAGE_SIZE; 2340 iova += PAGE_SIZE;
2141 } 2341 }
2142 2342
@@ -2151,21 +2351,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2151 phys_addr_t paddr; 2351 phys_addr_t paddr;
2152 u64 *pte; 2352 u64 *pte;
2153 2353
2154 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; 2354 pte = fetch_pte(domain, iova, PM_MAP_4k);
2155
2156 if (!IOMMU_PTE_PRESENT(*pte))
2157 return 0;
2158
2159 pte = IOMMU_PTE_PAGE(*pte);
2160 pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
2161
2162 if (!IOMMU_PTE_PRESENT(*pte))
2163 return 0;
2164
2165 pte = IOMMU_PTE_PAGE(*pte);
2166 pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
2167 2355
2168 if (!IOMMU_PTE_PRESENT(*pte)) 2356 if (!pte || !IOMMU_PTE_PRESENT(*pte))
2169 return 0; 2357 return 0;
2170 2358
2171 paddr = *pte & IOMMU_PAGE_MASK; 2359 paddr = *pte & IOMMU_PAGE_MASK;
@@ -2191,3 +2379,46 @@ static struct iommu_ops amd_iommu_ops = {
2191 .domain_has_cap = amd_iommu_domain_has_cap, 2379 .domain_has_cap = amd_iommu_domain_has_cap,
2192}; 2380};
2193 2381
2382/*****************************************************************************
2383 *
 2384 * The next functions do a basic initialization of the IOMMU for
 2385 * passthrough mode
2386 *
2387 * In passthrough mode the IOMMU is initialized and enabled but not used for
2388 * DMA-API translation.
2389 *
2390 *****************************************************************************/
2391
2392int __init amd_iommu_init_passthrough(void)
2393{
2394 struct pci_dev *dev = NULL;
2395 u16 devid, devid2;
2396
 2397 /* allocate passthrough domain */
2398 pt_domain = protection_domain_alloc();
2399 if (!pt_domain)
2400 return -ENOMEM;
2401
2402 pt_domain->mode |= PAGE_MODE_NONE;
2403
2404 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2405 struct amd_iommu *iommu;
2406
2407 devid = calc_devid(dev->bus->number, dev->devfn);
2408 if (devid > amd_iommu_last_bdf)
2409 continue;
2410
2411 devid2 = amd_iommu_alias_table[devid];
2412
2413 iommu = amd_iommu_rlookup_table[devid2];
2414 if (!iommu)
2415 continue;
2416
2417 __attach_device(iommu, pt_domain, devid);
2418 __attach_device(iommu, pt_domain, devid2);
2419 }
2420
2421 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
2422
2423 return 0;
2424}
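
amd_iommu_init_passthrough() above walks every PCI device and attaches both the device and its alias to pt_domain. The devid it computes is the 16-bit PCI requester ID; a hypothetical helper showing the packing calc_devid() is assumed to perform (sketch only, not the real definition from the AMD IOMMU headers):

	static inline u16 sketch_calc_devid(unsigned int bus, unsigned int devfn)
	{
		/* bus number in the high byte, device/function in the low byte */
		return (u16)((bus << 8) | devfn);
	}
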
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c1b17e97252e..b4b61d462dcc 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -252,7 +252,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
252/* Function to enable the hardware */ 252/* Function to enable the hardware */
253static void iommu_enable(struct amd_iommu *iommu) 253static void iommu_enable(struct amd_iommu *iommu)
254{ 254{
255 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", 255 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
256 dev_name(&iommu->dev->dev), iommu->cap_ptr); 256 dev_name(&iommu->dev->dev), iommu->cap_ptr);
257 257
258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
@@ -435,6 +435,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
435} 435}
436 436
437/* 437/*
438 * This function resets the command buffer if the IOMMU stopped fetching
439 * commands from it.
440 */
441void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
442{
443 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
444
445 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
446 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
447
448 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
449}
450
451/*
438 * This function writes the command buffer address to the hardware and 452 * This function writes the command buffer address to the hardware and
439 * enables it. 453 * enables it.
440 */ 454 */
@@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
450 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 464 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
451 &entry, sizeof(entry)); 465 &entry, sizeof(entry));
452 466
453 /* set head and tail to zero manually */ 467 amd_iommu_reset_cmd_buffer(iommu);
454 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
455 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
456
457 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
458} 468}
459 469
460static void __init free_command_buffer(struct amd_iommu *iommu) 470static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -858,7 +868,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
858 switch (*p) { 868 switch (*p) {
859 case ACPI_IVHD_TYPE: 869 case ACPI_IVHD_TYPE:
860 870
861 DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " 871 DUMP_printk("device: %02x:%02x.%01x cap: %04x "
862 "seg: %d flags: %01x info %04x\n", 872 "seg: %d flags: %01x info %04x\n",
863 PCI_BUS(h->devid), PCI_SLOT(h->devid), 873 PCI_BUS(h->devid), PCI_SLOT(h->devid),
864 PCI_FUNC(h->devid), h->cap_ptr, 874 PCI_FUNC(h->devid), h->cap_ptr,
@@ -902,7 +912,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
902 912
903 r = request_irq(iommu->dev->irq, amd_iommu_int_handler, 913 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
904 IRQF_SAMPLE_RANDOM, 914 IRQF_SAMPLE_RANDOM,
905 "AMD IOMMU", 915 "AMD-Vi",
906 NULL); 916 NULL);
907 917
908 if (r) { 918 if (r) {
@@ -1150,7 +1160,7 @@ int __init amd_iommu_init(void)
1150 1160
1151 1161
1152 if (no_iommu) { 1162 if (no_iommu) {
1153 printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); 1163 printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
1154 return 0; 1164 return 0;
1155 } 1165 }
1156 1166
@@ -1242,22 +1252,28 @@ int __init amd_iommu_init(void)
1242 if (ret) 1252 if (ret)
1243 goto free; 1253 goto free;
1244 1254
1245 ret = amd_iommu_init_dma_ops(); 1255 if (iommu_pass_through)
1256 ret = amd_iommu_init_passthrough();
1257 else
1258 ret = amd_iommu_init_dma_ops();
1246 if (ret) 1259 if (ret)
1247 goto free; 1260 goto free;
1248 1261
1249 enable_iommus(); 1262 enable_iommus();
1250 1263
1251 printk(KERN_INFO "AMD IOMMU: device isolation "); 1264 if (iommu_pass_through)
1265 goto out;
1266
1267 printk(KERN_INFO "AMD-Vi: device isolation ");
1252 if (amd_iommu_isolate) 1268 if (amd_iommu_isolate)
1253 printk("enabled\n"); 1269 printk("enabled\n");
1254 else 1270 else
1255 printk("disabled\n"); 1271 printk("disabled\n");
1256 1272
1257 if (amd_iommu_unmap_flush) 1273 if (amd_iommu_unmap_flush)
1258 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); 1274 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1259 else 1275 else
1260 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); 1276 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1261 1277
1262out: 1278out:
1263 return ret; 1279 return ret;
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index b3025b43b63a..db7220220d09 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,7 @@
39int unknown_nmi_panic; 39int unknown_nmi_panic;
40int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
41 41
42static cpumask_var_t backtrace_mask; 42static cpumask_t backtrace_mask __read_mostly;
43 43
44/* nmi_active: 44/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 45 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void)
138 if (!prev_nmi_count) 138 if (!prev_nmi_count)
139 goto error; 139 goto error;
140 140
141 alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
142 printk(KERN_INFO "Testing NMI watchdog ... "); 141 printk(KERN_INFO "Testing NMI watchdog ... ");
143 142
144#ifdef CONFIG_SMP 143#ifdef CONFIG_SMP
@@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
415 } 414 }
416 415
417 /* We can be called before check_nmi_watchdog, hence NULL check. */ 416 /* We can be called before check_nmi_watchdog, hence NULL check. */
418 if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { 417 if (cpumask_test_cpu(cpu, &backtrace_mask)) {
419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
420 419
421 spin_lock(&lock); 420 spin_lock(&lock);
422 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 421 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
422 show_regs(regs);
423 dump_stack(); 423 dump_stack();
424 spin_unlock(&lock); 424 spin_unlock(&lock);
425 cpumask_clear_cpu(cpu, backtrace_mask); 425 cpumask_clear_cpu(cpu, &backtrace_mask);
426
427 rc = 1;
426 } 428 }
427 429
428 /* Could check oops_in_progress here too, but it's safer not to */ 430 /* Could check oops_in_progress here too, but it's safer not to */
@@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu)
552 return 0; 554 return 0;
553} 555}
554 556
555void __trigger_all_cpu_backtrace(void) 557void arch_trigger_all_cpu_backtrace(void)
556{ 558{
557 int i; 559 int i;
558 560
559 cpumask_copy(backtrace_mask, cpu_online_mask); 561 cpumask_copy(&backtrace_mask, cpu_online_mask);
562
563 printk(KERN_INFO "sending NMI to all CPUs:\n");
564 apic->send_IPI_all(NMI_VECTOR);
565
560 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 566 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
561 for (i = 0; i < 10 * 1000; i++) { 567 for (i = 0; i < 10 * 1000; i++) {
562 if (cpumask_empty(backtrace_mask)) 568 if (cpumask_empty(&backtrace_mask))
563 break; 569 break;
564 mdelay(1); 570 mdelay(1);
565 } 571 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 898ecc47e129..4a6aeedcd965 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -3,6 +3,7 @@
3 * This code generates raw asm output which is post-processed to extract 3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data. 4 * and format the required data.
5 */ 5 */
6#define COMPILE_OFFSETS
6 7
7#include <linux/crypto.h> 8#include <linux/crypto.h>
8#include <linux/sched.h> 9#include <linux/sched.h>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 900332b800f8..f9cd0849bd42 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2009 Jaswinder Singh Rajput 6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
9 * 10 *
10 * For licencing details see kernel-base/COPYING 11 * For licencing details see kernel-base/COPYING
11 */ 12 */
@@ -20,6 +21,7 @@
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/uaccess.h> 22#include <linux/uaccess.h>
22#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/cpu.h>
23 25
24#include <asm/apic.h> 26#include <asm/apic.h>
25#include <asm/stacktrace.h> 27#include <asm/stacktrace.h>
@@ -27,12 +29,52 @@
27 29
28static u64 perf_counter_mask __read_mostly; 30static u64 perf_counter_mask __read_mostly;
29 31
32/* The maximal number of PEBS counters: */
33#define MAX_PEBS_COUNTERS 4
34
35/* The size of a BTS record in bytes: */
36#define BTS_RECORD_SIZE 24
37
38/* The size of a per-cpu BTS buffer in bytes: */
39#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024)
40
41/* The BTS overflow threshold in bytes from the end of the buffer: */
42#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64)
43
44
45/*
46 * Bits in the debugctlmsr controlling branch tracing.
47 */
48#define X86_DEBUGCTL_TR (1 << 6)
49#define X86_DEBUGCTL_BTS (1 << 7)
50#define X86_DEBUGCTL_BTINT (1 << 8)
51#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
52#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
53
54/*
55 * A debug store configuration.
56 *
57 * We only support architectures that use 64bit fields.
58 */
59struct debug_store {
60 u64 bts_buffer_base;
61 u64 bts_index;
62 u64 bts_absolute_maximum;
63 u64 bts_interrupt_threshold;
64 u64 pebs_buffer_base;
65 u64 pebs_index;
66 u64 pebs_absolute_maximum;
67 u64 pebs_interrupt_threshold;
68 u64 pebs_counter_reset[MAX_PEBS_COUNTERS];
69};
70
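
For orientation, the BTS constants above work out as follows (plain arithmetic on the definitions in this hunk): each branch record is 24 bytes, the per-cpu buffer holds 1024 records, and the interrupt threshold is placed 64 records before the end of the buffer.

	/* BTS_BUFFER_SIZE = 24 * 1024 = 24576 bytes  (1024 records per cpu)
	 * BTS_OVFL_TH     = 24 *   64 =  1536 bytes
	 * => bts_interrupt_threshold = bts_absolute_maximum - 1536,
	 *    i.e. the hardware signals roughly 64 records before the buffer fills.
	 */
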
30struct cpu_hw_counters { 71struct cpu_hw_counters {
31 struct perf_counter *counters[X86_PMC_IDX_MAX]; 72 struct perf_counter *counters[X86_PMC_IDX_MAX];
32 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
33 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
34 unsigned long interrupts; 75 unsigned long interrupts;
35 int enabled; 76 int enabled;
77 struct debug_store *ds;
36}; 78};
37 79
38/* 80/*
@@ -58,6 +100,8 @@ struct x86_pmu {
58 int apic; 100 int apic;
59 u64 max_period; 101 u64 max_period;
60 u64 intel_ctrl; 102 u64 intel_ctrl;
103 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void);
61}; 105};
62 106
63static struct x86_pmu x86_pmu __read_mostly; 107static struct x86_pmu x86_pmu __read_mostly;
@@ -577,6 +621,9 @@ x86_perf_counter_update(struct perf_counter *counter,
577 u64 prev_raw_count, new_raw_count; 621 u64 prev_raw_count, new_raw_count;
578 s64 delta; 622 s64 delta;
579 623
624 if (idx == X86_PMC_IDX_FIXED_BTS)
625 return 0;
626
580 /* 627 /*
581 * Careful: an NMI might modify the previous counter value. 628 * Careful: an NMI might modify the previous counter value.
582 * 629 *
@@ -666,10 +713,110 @@ static void release_pmc_hardware(void)
666#endif 713#endif
667} 714}
668 715
716static inline bool bts_available(void)
717{
718 return x86_pmu.enable_bts != NULL;
719}
720
721static inline void init_debug_store_on_cpu(int cpu)
722{
723 struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
724
725 if (!ds)
726 return;
727
728 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
729 (u32)((u64)(unsigned long)ds),
730 (u32)((u64)(unsigned long)ds >> 32));
731}
732
733static inline void fini_debug_store_on_cpu(int cpu)
734{
735 if (!per_cpu(cpu_hw_counters, cpu).ds)
736 return;
737
738 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
739}
740
741static void release_bts_hardware(void)
742{
743 int cpu;
744
745 if (!bts_available())
746 return;
747
748 get_online_cpus();
749
750 for_each_online_cpu(cpu)
751 fini_debug_store_on_cpu(cpu);
752
753 for_each_possible_cpu(cpu) {
754 struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
755
756 if (!ds)
757 continue;
758
759 per_cpu(cpu_hw_counters, cpu).ds = NULL;
760
761 kfree((void *)(unsigned long)ds->bts_buffer_base);
762 kfree(ds);
763 }
764
765 put_online_cpus();
766}
767
768static int reserve_bts_hardware(void)
769{
770 int cpu, err = 0;
771
772 if (!bts_available())
773 return 0;
774
775 get_online_cpus();
776
777 for_each_possible_cpu(cpu) {
778 struct debug_store *ds;
779 void *buffer;
780
781 err = -ENOMEM;
782 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
783 if (unlikely(!buffer))
784 break;
785
786 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
787 if (unlikely(!ds)) {
788 kfree(buffer);
789 break;
790 }
791
792 ds->bts_buffer_base = (u64)(unsigned long)buffer;
793 ds->bts_index = ds->bts_buffer_base;
794 ds->bts_absolute_maximum =
795 ds->bts_buffer_base + BTS_BUFFER_SIZE;
796 ds->bts_interrupt_threshold =
797 ds->bts_absolute_maximum - BTS_OVFL_TH;
798
799 per_cpu(cpu_hw_counters, cpu).ds = ds;
800 err = 0;
801 }
802
803 if (err)
804 release_bts_hardware();
805 else {
806 for_each_online_cpu(cpu)
807 init_debug_store_on_cpu(cpu);
808 }
809
810 put_online_cpus();
811
812 return err;
813}
814
669static void hw_perf_counter_destroy(struct perf_counter *counter) 815static void hw_perf_counter_destroy(struct perf_counter *counter)
670{ 816{
671 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { 817 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
672 release_pmc_hardware(); 818 release_pmc_hardware();
819 release_bts_hardware();
673 mutex_unlock(&pmc_reserve_mutex); 820 mutex_unlock(&pmc_reserve_mutex);
674 } 821 }
675} 822}
@@ -712,6 +859,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
712 return 0; 859 return 0;
713} 860}
714 861
862static void intel_pmu_enable_bts(u64 config)
863{
864 unsigned long debugctlmsr;
865
866 debugctlmsr = get_debugctlmsr();
867
868 debugctlmsr |= X86_DEBUGCTL_TR;
869 debugctlmsr |= X86_DEBUGCTL_BTS;
870 debugctlmsr |= X86_DEBUGCTL_BTINT;
871
872 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
873 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
874
875 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
876 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
877
878 update_debugctlmsr(debugctlmsr);
879}
880
881static void intel_pmu_disable_bts(void)
882{
883 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
884 unsigned long debugctlmsr;
885
886 if (!cpuc->ds)
887 return;
888
889 debugctlmsr = get_debugctlmsr();
890
891 debugctlmsr &=
892 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
893 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
894
895 update_debugctlmsr(debugctlmsr);
896}
897
715/* 898/*
716 * Setup the hardware configuration for a given attr_type 899 * Setup the hardware configuration for a given attr_type
717 */ 900 */
@@ -728,9 +911,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
728 err = 0; 911 err = 0;
729 if (!atomic_inc_not_zero(&active_counters)) { 912 if (!atomic_inc_not_zero(&active_counters)) {
730 mutex_lock(&pmc_reserve_mutex); 913 mutex_lock(&pmc_reserve_mutex);
731 if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) 914 if (atomic_read(&active_counters) == 0) {
732 err = -EBUSY; 915 if (!reserve_pmc_hardware())
733 else 916 err = -EBUSY;
917 else
918 err = reserve_bts_hardware();
919 }
920 if (!err)
734 atomic_inc(&active_counters); 921 atomic_inc(&active_counters);
735 mutex_unlock(&pmc_reserve_mutex); 922 mutex_unlock(&pmc_reserve_mutex);
736 } 923 }
@@ -793,6 +980,20 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
793 if (config == -1LL) 980 if (config == -1LL)
794 return -EINVAL; 981 return -EINVAL;
795 982
983 /*
984 * Branch tracing:
985 */
986 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
987 (hwc->sample_period == 1)) {
988 /* BTS is not supported by this architecture. */
989 if (!bts_available())
990 return -EOPNOTSUPP;
991
992 /* BTS is currently only allowed for user-mode. */
993 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
994 return -EOPNOTSUPP;
995 }
996
796 hwc->config |= config; 997 hwc->config |= config;
797 998
798 return 0; 999 return 0;
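
The check above routes a counter to BTS only when it counts branch instructions with a sample period of exactly 1 and excludes kernel mode. A hedged sketch of the attribute setup a perf_counter user would pass to hit this path (field names as assumed from the perf_counter API of this series):

	struct perf_counter_attr attr = {
		.type           = PERF_TYPE_HARDWARE,
		.config         = PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
		.sample_period  = 1,	/* one sample per branch -> BTS */
		.exclude_kernel = 1,	/* kernel-mode BTS is rejected above */
	};
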
@@ -817,7 +1018,18 @@ static void p6_pmu_disable_all(void)
817 1018
818static void intel_pmu_disable_all(void) 1019static void intel_pmu_disable_all(void)
819{ 1020{
1021 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1022
1023 if (!cpuc->enabled)
1024 return;
1025
1026 cpuc->enabled = 0;
1027 barrier();
1028
820 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 1029 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
1030
1031 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
1032 intel_pmu_disable_bts();
821} 1033}
822 1034
823static void amd_pmu_disable_all(void) 1035static void amd_pmu_disable_all(void)
@@ -875,7 +1087,25 @@ static void p6_pmu_enable_all(void)
875 1087
876static void intel_pmu_enable_all(void) 1088static void intel_pmu_enable_all(void)
877{ 1089{
1090 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1091
1092 if (cpuc->enabled)
1093 return;
1094
1095 cpuc->enabled = 1;
1096 barrier();
1097
878 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 1098 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1099
1100 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
1101 struct perf_counter *counter =
1102 cpuc->counters[X86_PMC_IDX_FIXED_BTS];
1103
1104 if (WARN_ON_ONCE(!counter))
1105 return;
1106
1107 intel_pmu_enable_bts(counter->hw.config);
1108 }
879} 1109}
880 1110
881static void amd_pmu_enable_all(void) 1111static void amd_pmu_enable_all(void)
@@ -962,6 +1192,11 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
962static inline void 1192static inline void
963intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1193intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
964{ 1194{
1195 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1196 intel_pmu_disable_bts();
1197 return;
1198 }
1199
965 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1200 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
966 intel_pmu_disable_fixed(hwc, idx); 1201 intel_pmu_disable_fixed(hwc, idx);
967 return; 1202 return;
@@ -990,6 +1225,9 @@ x86_perf_counter_set_period(struct perf_counter *counter,
990 s64 period = hwc->sample_period; 1225 s64 period = hwc->sample_period;
991 int err, ret = 0; 1226 int err, ret = 0;
992 1227
1228 if (idx == X86_PMC_IDX_FIXED_BTS)
1229 return 0;
1230
993 /* 1231 /*
 994 * If we are way outside a reasonable range then just skip forward: 1232
995 */ 1233 */
@@ -1072,6 +1310,14 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1072 1310
1073static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1311static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1074{ 1312{
1313 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1314 if (!__get_cpu_var(cpu_hw_counters).enabled)
1315 return;
1316
1317 intel_pmu_enable_bts(hwc->config);
1318 return;
1319 }
1320
1075 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1321 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
1076 intel_pmu_enable_fixed(hwc, idx); 1322 intel_pmu_enable_fixed(hwc, idx);
1077 return; 1323 return;
@@ -1093,11 +1339,16 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
1093{ 1339{
1094 unsigned int event; 1340 unsigned int event;
1095 1341
1342 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1343
1344 if (unlikely((event ==
1345 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
1346 (hwc->sample_period == 1)))
1347 return X86_PMC_IDX_FIXED_BTS;
1348
1096 if (!x86_pmu.num_counters_fixed) 1349 if (!x86_pmu.num_counters_fixed)
1097 return -1; 1350 return -1;
1098 1351
1099 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1100
1101 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1352 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1102 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1103 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) 1354 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1118,7 +1369,15 @@ static int x86_pmu_enable(struct perf_counter *counter)
1118 int idx; 1369 int idx;
1119 1370
1120 idx = fixed_mode_idx(counter, hwc); 1371 idx = fixed_mode_idx(counter, hwc);
1121 if (idx >= 0) { 1372 if (idx == X86_PMC_IDX_FIXED_BTS) {
1373 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN;
1376
1377 hwc->config_base = 0;
1378 hwc->counter_base = 0;
1379 hwc->idx = idx;
1380 } else if (idx >= 0) {
1122 /* 1381 /*
1123 * Try to get the fixed counter, if that is already taken 1382 * Try to get the fixed counter, if that is already taken
1124 * then try to get a generic counter: 1383 * then try to get a generic counter:
@@ -1229,6 +1488,44 @@ void perf_counter_print_debug(void)
1229 local_irq_restore(flags); 1488 local_irq_restore(flags);
1230} 1489}
1231 1490
1491static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
1492 struct perf_sample_data *data)
1493{
1494 struct debug_store *ds = cpuc->ds;
1495 struct bts_record {
1496 u64 from;
1497 u64 to;
1498 u64 flags;
1499 };
1500 struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
1501 unsigned long orig_ip = data->regs->ip;
1502 struct bts_record *at, *top;
1503
1504 if (!counter)
1505 return;
1506
1507 if (!ds)
1508 return;
1509
1510 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1511 top = (struct bts_record *)(unsigned long)ds->bts_index;
1512
1513 ds->bts_index = ds->bts_buffer_base;
1514
1515 for (; at < top; at++) {
1516 data->regs->ip = at->from;
1517 data->addr = at->to;
1518
1519 perf_counter_output(counter, 1, data);
1520 }
1521
1522 data->regs->ip = orig_ip;
1523 data->addr = 0;
1524
1525 /* There's new data available. */
1526 counter->pending_kill = POLL_IN;
1527}
1528
1232static void x86_pmu_disable(struct perf_counter *counter) 1529static void x86_pmu_disable(struct perf_counter *counter)
1233{ 1530{
1234 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1531 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -1253,6 +1550,15 @@ static void x86_pmu_disable(struct perf_counter *counter)
1253 * that we are disabling: 1550 * that we are disabling:
1254 */ 1551 */
1255 x86_perf_counter_update(counter, hwc, idx); 1552 x86_perf_counter_update(counter, hwc, idx);
1553
1554 /* Drain the remaining BTS records. */
1555 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1556 struct perf_sample_data data;
1557 struct pt_regs regs;
1558
1559 data.regs = &regs;
1560 intel_pmu_drain_bts_buffer(cpuc, &data);
1561 }
1256 cpuc->counters[idx] = NULL; 1562 cpuc->counters[idx] = NULL;
1257 clear_bit(idx, cpuc->used_mask); 1563 clear_bit(idx, cpuc->used_mask);
1258 1564
@@ -1280,6 +1586,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter)
1280 1586
1281static void intel_pmu_reset(void) 1587static void intel_pmu_reset(void)
1282{ 1588{
1589 struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds;
1283 unsigned long flags; 1590 unsigned long flags;
1284 int idx; 1591 int idx;
1285 1592
@@ -1297,6 +1604,8 @@ static void intel_pmu_reset(void)
1297 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { 1604 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1298 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 1605 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1299 } 1606 }
1607 if (ds)
1608 ds->bts_index = ds->bts_buffer_base;
1300 1609
1301 local_irq_restore(flags); 1610 local_irq_restore(flags);
1302} 1611}
@@ -1362,6 +1671,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1362 cpuc = &__get_cpu_var(cpu_hw_counters); 1671 cpuc = &__get_cpu_var(cpu_hw_counters);
1363 1672
1364 perf_disable(); 1673 perf_disable();
1674 intel_pmu_drain_bts_buffer(cpuc, &data);
1365 status = intel_pmu_get_status(); 1675 status = intel_pmu_get_status();
1366 if (!status) { 1676 if (!status) {
1367 perf_enable(); 1677 perf_enable();
@@ -1571,6 +1881,8 @@ static struct x86_pmu intel_pmu = {
1571 * the generic counter period: 1881 * the generic counter period:
1572 */ 1882 */
1573 .max_period = (1ULL << 31) - 1, 1883 .max_period = (1ULL << 31) - 1,
1884 .enable_bts = intel_pmu_enable_bts,
1885 .disable_bts = intel_pmu_disable_bts,
1574}; 1886};
1575 1887
1576static struct x86_pmu amd_pmu = { 1888static struct x86_pmu amd_pmu = {
@@ -1962,3 +2274,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1962 2274
1963 return entry; 2275 return entry;
1964} 2276}
2277
2278void hw_perf_counter_setup_online(int cpu)
2279{
2280 init_debug_store_on_cpu(cpu);
2281}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d94e1ea3b9fe..9dbb527e1652 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
417 unsigned long return_hooker = (unsigned long) 417 unsigned long return_hooker = (unsigned long)
418 &return_to_handler; 418 &return_to_handler;
419 419
420 /* Nmi's are currently unsupported */
421 if (unlikely(in_nmi()))
422 return;
423
424 if (unlikely(atomic_read(&current->tracing_graph_pause))) 420 if (unlikely(atomic_read(&current->tracing_graph_pause)))
425 return; 421 return;
426 422
@@ -498,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
498 494
499struct syscall_metadata *syscall_nr_to_meta(int nr) 495struct syscall_metadata *syscall_nr_to_meta(int nr)
500{ 496{
501 if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) 497 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
502 return NULL; 498 return NULL;
503 499
504 return syscalls_metadata[nr]; 500 return syscalls_metadata[nr];
505} 501}
506 502
507void arch_init_ftrace_syscalls(void) 503int syscall_name_to_nr(char *name)
504{
505 int i;
506
507 if (!syscalls_metadata)
508 return -1;
509
510 for (i = 0; i < NR_syscalls; i++) {
511 if (syscalls_metadata[i]) {
512 if (!strcmp(syscalls_metadata[i]->name, name))
513 return i;
514 }
515 }
516 return -1;
517}
518
519void set_syscall_enter_id(int num, int id)
520{
521 syscalls_metadata[num]->enter_id = id;
522}
523
524void set_syscall_exit_id(int num, int id)
525{
526 syscalls_metadata[num]->exit_id = id;
527}
528
529static int __init arch_init_ftrace_syscalls(void)
508{ 530{
509 int i; 531 int i;
510 struct syscall_metadata *meta; 532 struct syscall_metadata *meta;
511 unsigned long **psys_syscall_table = &sys_call_table; 533 unsigned long **psys_syscall_table = &sys_call_table;
512 static atomic_t refs;
513
514 if (atomic_inc_return(&refs) != 1)
515 goto end;
516 534
517 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * 535 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
518 FTRACE_SYSCALL_MAX, GFP_KERNEL); 536 NR_syscalls, GFP_KERNEL);
519 if (!syscalls_metadata) { 537 if (!syscalls_metadata) {
520 WARN_ON(1); 538 WARN_ON(1);
521 return; 539 return -ENOMEM;
522 } 540 }
523 541
524 for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { 542 for (i = 0; i < NR_syscalls; i++) {
525 meta = find_syscall_meta(psys_syscall_table[i]); 543 meta = find_syscall_meta(psys_syscall_table[i]);
526 syscalls_metadata[i] = meta; 544 syscalls_metadata[i] = meta;
527 } 545 }
528 return; 546 return 0;
529
530 /* Paranoid: avoid overflow */
531end:
532 atomic_dec(&refs);
533} 547}
548arch_initcall(arch_init_ftrace_syscalls);
534#endif 549#endif
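
syscall_name_to_nr() above is a plain linear scan over the per-syscall metadata. A minimal usage sketch (illustrative caller only; whether the recorded name is "sys_read" or "read" depends on what find_syscall_meta() stored):

	static int sketch_syscall_is_traced(char *name)
	{
		int nr = syscall_name_to_nr(name);

		return nr >= 0;		/* metadata (and hence tracing) available */
	}
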
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index fa80f60e9607..d71c8655905b 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -33,7 +33,14 @@ int no_iommu __read_mostly;
33/* Set this to 1 if there is a HW IOMMU in the system */ 33/* Set this to 1 if there is a HW IOMMU in the system */
34int iommu_detected __read_mostly = 0; 34int iommu_detected __read_mostly = 0;
35 35
36int iommu_pass_through; 36/*
37 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
 38 * If this variable is 1, IOMMU implementations do no DMA translation for
 39 * devices and allow every device to access the whole physical memory. This is
 40 * useful if a user wants to use an IOMMU only for KVM device assignment to
 41 * guests and not for driver DMA translation.
42 */
43int iommu_pass_through __read_mostly;
37 44
38dma_addr_t bad_dma_address __read_mostly = 0; 45dma_addr_t bad_dma_address __read_mostly = 0;
39EXPORT_SYMBOL(bad_dma_address); 46EXPORT_SYMBOL(bad_dma_address);
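
As the comment added above for iommu_pass_through explains, the flag is set by iommu=pt on the kernel command line and makes IOMMU backends skip DMA translation for ordinary drivers. A purely hypothetical sketch of the kind of gate a backend applies (sketch_translate() is a made-up helper used only to illustrate the flag's effect):

	static dma_addr_t sketch_map_page(struct device *dev, phys_addr_t paddr,
					  size_t size)
	{
		if (iommu_pass_through)
			return (dma_addr_t)paddr;	/* 1:1, no IOMMU page tables */

		return sketch_translate(dev, paddr, size);	/* hypothetical */
	}
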
@@ -153,7 +160,7 @@ again:
153 return NULL; 160 return NULL;
154 161
155 addr = page_to_phys(page); 162 addr = page_to_phys(page);
156 if (!is_buffer_dma_capable(dma_mask, addr, size)) { 163 if (addr + size > dma_mask) {
157 __free_pages(page, get_order(size)); 164 __free_pages(page, get_order(size));
158 165
159 if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { 166 if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) {
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index d2e56b8f48e7..98a827ee9ed7 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir)
190static inline int 190static inline int
191need_iommu(struct device *dev, unsigned long addr, size_t size) 191need_iommu(struct device *dev, unsigned long addr, size_t size)
192{ 192{
193 return force_iommu || 193 return force_iommu || !dma_capable(dev, addr, size);
194 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
195} 194}
196 195
197static inline int 196static inline int
198nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 197nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
199{ 198{
200 return !is_buffer_dma_capable(*dev->dma_mask, addr, size); 199 return !dma_capable(dev, addr, size);
201} 200}
202 201
203/* Map a single continuous physical area into the IOMMU. 202/* Map a single continuous physical area into the IOMMU.
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 71d412a09f30..a3933d4330cd 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { 17 if (hwdev && !dma_capable(hwdev, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) 18 if (*hwdev->dma_mask >= DMA_BIT_MASK(32))
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -79,12 +79,29 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
79 free_pages((unsigned long)vaddr, get_order(size)); 79 free_pages((unsigned long)vaddr, get_order(size));
80} 80}
81 81
82static void nommu_sync_single_for_device(struct device *dev,
83 dma_addr_t addr, size_t size,
84 enum dma_data_direction dir)
85{
86 flush_write_buffers();
87}
88
89
90static void nommu_sync_sg_for_device(struct device *dev,
91 struct scatterlist *sg, int nelems,
92 enum dma_data_direction dir)
93{
94 flush_write_buffers();
95}
96
82struct dma_map_ops nommu_dma_ops = { 97struct dma_map_ops nommu_dma_ops = {
83 .alloc_coherent = dma_generic_alloc_coherent, 98 .alloc_coherent = dma_generic_alloc_coherent,
84 .free_coherent = nommu_free_coherent, 99 .free_coherent = nommu_free_coherent,
85 .map_sg = nommu_map_sg, 100 .map_sg = nommu_map_sg,
86 .map_page = nommu_map_page, 101 .map_page = nommu_map_page,
87 .is_phys = 1, 102 .sync_single_for_device = nommu_sync_single_for_device,
103 .sync_sg_for_device = nommu_sync_sg_for_device,
104 .is_phys = 1,
88}; 105};
89 106
90void __init no_iommu_init(void) 107void __init no_iommu_init(void)
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 6af96ee44200..e8a35016115f 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -13,31 +13,6 @@
13 13
14int swiotlb __read_mostly; 14int swiotlb __read_mostly;
15 15
16void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs)
17{
18 return alloc_bootmem_low_pages(size);
19}
20
21void *swiotlb_alloc(unsigned order, unsigned long nslabs)
22{
23 return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
24}
25
26dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
27{
28 return paddr;
29}
30
31phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
32{
33 return baddr;
34}
35
36int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
37{
38 return 0;
39}
40
41static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 16static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
42 dma_addr_t *dma_handle, gfp_t flags) 17 dma_addr_t *dma_handle, gfp_t flags)
43{ 18{
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 09ecbde91c13..8d7d5c9c1be3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -35,10 +35,11 @@
35#include <asm/proto.h> 35#include <asm/proto.h>
36#include <asm/ds.h> 36#include <asm/ds.h>
37 37
38#include <trace/syscall.h>
39
40#include "tls.h" 38#include "tls.h"
41 39
40#define CREATE_TRACE_POINTS
41#include <trace/events/syscalls.h>
42
42enum x86_regset { 43enum x86_regset {
43 REGSET_GENERAL, 44 REGSET_GENERAL,
44 REGSET_FP, 45 REGSET_FP,
@@ -1497,8 +1498,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1497 tracehook_report_syscall_entry(regs)) 1498 tracehook_report_syscall_entry(regs))
1498 ret = -1L; 1499 ret = -1L;
1499 1500
1500 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 1501 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1501 ftrace_syscall_enter(regs); 1502 trace_sys_enter(regs, regs->orig_ax);
1502 1503
1503 if (unlikely(current->audit_context)) { 1504 if (unlikely(current->audit_context)) {
1504 if (IS_IA32) 1505 if (IS_IA32)
@@ -1523,8 +1524,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1523 if (unlikely(current->audit_context)) 1524 if (unlikely(current->audit_context))
1524 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1525 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1525 1526
1526 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) 1527 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1527 ftrace_syscall_exit(regs); 1528 trace_sys_exit(regs, regs->ax);
1528 1529
1529 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1530 if (test_thread_flag(TIF_SYSCALL_TRACE))
1530 tracehook_report_syscall_exit(regs, 0); 1531 tracehook_report_syscall_exit(regs, 0);
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 6bc211accf08..45e00eb09c3a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -18,9 +18,9 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h> 19#include <asm/syscalls.h>
20 20
21asmlinkage long sys_mmap(unsigned long addr, unsigned long len, 21SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
22 unsigned long prot, unsigned long flags, 22 unsigned long, prot, unsigned long, flags,
23 unsigned long fd, unsigned long off) 23 unsigned long, fd, unsigned long, off)
24{ 24{
25 long error; 25 long error;
26 struct file *file; 26 struct file *file;
@@ -226,7 +226,7 @@ bottomup:
226} 226}
227 227
228 228
229asmlinkage long sys_uname(struct new_utsname __user *name) 229SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
230{ 230{
231 int err; 231 int err;
232 down_read(&uts_sem); 232 down_read(&uts_sem);
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 89b9a5cd63da..cb88b1a0bd5f 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -1,11 +1,14 @@
1/** 1/**
2 * @file nmi_int.c 2 * @file nmi_int.c
3 * 3 *
4 * @remark Copyright 2002-2008 OProfile authors 4 * @remark Copyright 2002-2009 OProfile authors
5 * @remark Read the file COPYING 5 * @remark Read the file COPYING
6 * 6 *
7 * @author John Levon <levon@movementarian.org> 7 * @author John Levon <levon@movementarian.org>
8 * @author Robert Richter <robert.richter@amd.com> 8 * @author Robert Richter <robert.richter@amd.com>
9 * @author Barry Kasindorf <barry.kasindorf@amd.com>
10 * @author Jason Yeh <jason.yeh@amd.com>
11 * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
9 */ 12 */
10 13
11#include <linux/init.h> 14#include <linux/init.h>
@@ -24,13 +27,35 @@
24#include "op_counter.h" 27#include "op_counter.h"
25#include "op_x86_model.h" 28#include "op_x86_model.h"
26 29
27static struct op_x86_model_spec const *model; 30static struct op_x86_model_spec *model;
28static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); 31static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
29static DEFINE_PER_CPU(unsigned long, saved_lvtpc); 32static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
30 33
31/* 0 == registered but off, 1 == registered and on */ 34/* 0 == registered but off, 1 == registered and on */
32static int nmi_enabled = 0; 35static int nmi_enabled = 0;
33 36
37struct op_counter_config counter_config[OP_MAX_COUNTER];
38
39/* common functions */
40
41u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
42 struct op_counter_config *counter_config)
43{
44 u64 val = 0;
45 u16 event = (u16)counter_config->event;
46
47 val |= ARCH_PERFMON_EVENTSEL_INT;
48 val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
49 val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
50 val |= (counter_config->unit_mask & 0xFF) << 8;
51 event &= model->event_mask ? model->event_mask : 0xFF;
52 val |= event & 0xFF;
53 val |= (event & 0x0F00) << 24;
54
55 return val;
56}
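
op_x86_get_ctrl() above packs a counter configuration into an EVENTSEL/PERFCTL value: INT is always set, USR/OS follow the user/kernel flags, the unit mask lands in bits 8-15, the low 8 event bits go into bits 0-7, and event bits 8-11 are shifted into bits 32-35 (AMD's extended event select). A small worked example under those assumptions:

	/* event 0x076, unit mask 0, user and kernel counting enabled:
	 *   val = ARCH_PERFMON_EVENTSEL_INT
	 *       | ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_OS
	 *       | (0x00 << 8)                        unit mask
	 *       | (0x076 & 0xFF)                     -> 0x76 in bits 0-7
	 *       | ((0x076 & 0x0F00) << 24)           -> 0 here (no bits 8-11 set)
	 */
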
57
58
34static int profile_exceptions_notify(struct notifier_block *self, 59static int profile_exceptions_notify(struct notifier_block *self,
35 unsigned long val, void *data) 60 unsigned long val, void *data)
36{ 61{
@@ -52,36 +77,214 @@ static int profile_exceptions_notify(struct notifier_block *self,
52 77
53static void nmi_cpu_save_registers(struct op_msrs *msrs) 78static void nmi_cpu_save_registers(struct op_msrs *msrs)
54{ 79{
55 unsigned int const nr_ctrs = model->num_counters;
56 unsigned int const nr_ctrls = model->num_controls;
57 struct op_msr *counters = msrs->counters; 80 struct op_msr *counters = msrs->counters;
58 struct op_msr *controls = msrs->controls; 81 struct op_msr *controls = msrs->controls;
59 unsigned int i; 82 unsigned int i;
60 83
61 for (i = 0; i < nr_ctrs; ++i) { 84 for (i = 0; i < model->num_counters; ++i) {
62 if (counters[i].addr) { 85 if (counters[i].addr)
63 rdmsr(counters[i].addr, 86 rdmsrl(counters[i].addr, counters[i].saved);
64 counters[i].saved.low, 87 }
65 counters[i].saved.high); 88
66 } 89 for (i = 0; i < model->num_controls; ++i) {
90 if (controls[i].addr)
91 rdmsrl(controls[i].addr, controls[i].saved);
92 }
93}
94
95static void nmi_cpu_start(void *dummy)
96{
97 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
98 model->start(msrs);
99}
100
101static int nmi_start(void)
102{
103 on_each_cpu(nmi_cpu_start, NULL, 1);
104 return 0;
105}
106
107static void nmi_cpu_stop(void *dummy)
108{
109 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
110 model->stop(msrs);
111}
112
113static void nmi_stop(void)
114{
115 on_each_cpu(nmi_cpu_stop, NULL, 1);
116}
117
118#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
119
120static DEFINE_PER_CPU(int, switch_index);
121
122static inline int has_mux(void)
123{
124 return !!model->switch_ctrl;
125}
126
127inline int op_x86_phys_to_virt(int phys)
128{
129 return __get_cpu_var(switch_index) + phys;
130}
131
132inline int op_x86_virt_to_phys(int virt)
133{
134 return virt % model->num_counters;
135}
136
137static void nmi_shutdown_mux(void)
138{
139 int i;
140
141 if (!has_mux())
142 return;
143
144 for_each_possible_cpu(i) {
145 kfree(per_cpu(cpu_msrs, i).multiplex);
146 per_cpu(cpu_msrs, i).multiplex = NULL;
147 per_cpu(switch_index, i) = 0;
67 } 148 }
149}
150
151static int nmi_setup_mux(void)
152{
153 size_t multiplex_size =
154 sizeof(struct op_msr) * model->num_virt_counters;
155 int i;
156
157 if (!has_mux())
158 return 1;
159
160 for_each_possible_cpu(i) {
161 per_cpu(cpu_msrs, i).multiplex =
162 kmalloc(multiplex_size, GFP_KERNEL);
163 if (!per_cpu(cpu_msrs, i).multiplex)
164 return 0;
165 }
166
167 return 1;
168}
169
170static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
171{
172 int i;
173 struct op_msr *multiplex = msrs->multiplex;
174
175 if (!has_mux())
176 return;
68 177
69 for (i = 0; i < nr_ctrls; ++i) { 178 for (i = 0; i < model->num_virt_counters; ++i) {
70 if (controls[i].addr) { 179 if (counter_config[i].enabled) {
71 rdmsr(controls[i].addr, 180 multiplex[i].saved = -(u64)counter_config[i].count;
72 controls[i].saved.low, 181 } else {
73 controls[i].saved.high); 182 multiplex[i].addr = 0;
183 multiplex[i].saved = 0;
74 } 184 }
75 } 185 }
186
187 per_cpu(switch_index, cpu) = 0;
188}
189
190static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
191{
192 struct op_msr *multiplex = msrs->multiplex;
193 int i;
194
195 for (i = 0; i < model->num_counters; ++i) {
196 int virt = op_x86_phys_to_virt(i);
197 if (multiplex[virt].addr)
198 rdmsrl(multiplex[virt].addr, multiplex[virt].saved);
199 }
200}
201
202static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
203{
204 struct op_msr *multiplex = msrs->multiplex;
205 int i;
206
207 for (i = 0; i < model->num_counters; ++i) {
208 int virt = op_x86_phys_to_virt(i);
209 if (multiplex[virt].addr)
210 wrmsrl(multiplex[virt].addr, multiplex[virt].saved);
211 }
76} 212}
77 213
78static void nmi_save_registers(void *dummy) 214static void nmi_cpu_switch(void *dummy)
79{ 215{
80 int cpu = smp_processor_id(); 216 int cpu = smp_processor_id();
217 int si = per_cpu(switch_index, cpu);
81 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); 218 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
82 nmi_cpu_save_registers(msrs); 219
220 nmi_cpu_stop(NULL);
221 nmi_cpu_save_mpx_registers(msrs);
222
223 /* move to next set */
224 si += model->num_counters;
225 if ((si > model->num_virt_counters) || (counter_config[si].count == 0))
226 per_cpu(switch_index, cpu) = 0;
227 else
228 per_cpu(switch_index, cpu) = si;
229
230 model->switch_ctrl(model, msrs);
231 nmi_cpu_restore_mpx_registers(msrs);
232
233 nmi_cpu_start(NULL);
234}
235
236
237/*
238 * Quick check to see if multiplexing is necessary.
239 * The check should be sufficient since counters are used
 240 * in order.
241 */
242static int nmi_multiplex_on(void)
243{
244 return counter_config[model->num_counters].count ? 0 : -EINVAL;
245}
246
247static int nmi_switch_event(void)
248{
249 if (!has_mux())
250 return -ENOSYS; /* not implemented */
251 if (nmi_multiplex_on() < 0)
252 return -EINVAL; /* not necessary */
253
254 on_each_cpu(nmi_cpu_switch, NULL, 1);
255
256 return 0;
257}
258
259static inline void mux_init(struct oprofile_operations *ops)
260{
261 if (has_mux())
262 ops->switch_events = nmi_switch_event;
263}
264
265static void mux_clone(int cpu)
266{
267 if (!has_mux())
268 return;
269
270 memcpy(per_cpu(cpu_msrs, cpu).multiplex,
271 per_cpu(cpu_msrs, 0).multiplex,
272 sizeof(struct op_msr) * model->num_virt_counters);
83} 273}
84 274
275#else
276
277inline int op_x86_phys_to_virt(int phys) { return phys; }
278inline int op_x86_virt_to_phys(int virt) { return virt; }
279static inline void nmi_shutdown_mux(void) { }
280static inline int nmi_setup_mux(void) { return 1; }
281static inline void
282nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { }
283static inline void mux_init(struct oprofile_operations *ops) { }
284static void mux_clone(int cpu) { }
285
286#endif
287
85static void free_msrs(void) 288static void free_msrs(void)
86{ 289{
87 int i; 290 int i;
@@ -95,38 +298,32 @@ static void free_msrs(void)
95 298
96static int allocate_msrs(void) 299static int allocate_msrs(void)
97{ 300{
98 int success = 1;
99 size_t controls_size = sizeof(struct op_msr) * model->num_controls; 301 size_t controls_size = sizeof(struct op_msr) * model->num_controls;
100 size_t counters_size = sizeof(struct op_msr) * model->num_counters; 302 size_t counters_size = sizeof(struct op_msr) * model->num_counters;
101 303
102 int i; 304 int i;
103 for_each_possible_cpu(i) { 305 for_each_possible_cpu(i) {
104 per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, 306 per_cpu(cpu_msrs, i).counters = kmalloc(counters_size,
105 GFP_KERNEL); 307 GFP_KERNEL);
106 if (!per_cpu(cpu_msrs, i).counters) { 308 if (!per_cpu(cpu_msrs, i).counters)
107 success = 0; 309 return 0;
108 break;
109 }
110 per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, 310 per_cpu(cpu_msrs, i).controls = kmalloc(controls_size,
111 GFP_KERNEL); 311 GFP_KERNEL);
112 if (!per_cpu(cpu_msrs, i).controls) { 312 if (!per_cpu(cpu_msrs, i).controls)
113 success = 0; 313 return 0;
114 break;
115 }
116 } 314 }
117 315
118 if (!success) 316 return 1;
119 free_msrs();
120
121 return success;
122} 317}
123 318
124static void nmi_cpu_setup(void *dummy) 319static void nmi_cpu_setup(void *dummy)
125{ 320{
126 int cpu = smp_processor_id(); 321 int cpu = smp_processor_id();
127 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); 322 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
323 nmi_cpu_save_registers(msrs);
128 spin_lock(&oprofilefs_lock); 324 spin_lock(&oprofilefs_lock);
129 model->setup_ctrs(msrs); 325 model->setup_ctrs(model, msrs);
326 nmi_cpu_setup_mux(cpu, msrs);
130 spin_unlock(&oprofilefs_lock); 327 spin_unlock(&oprofilefs_lock);
131 per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); 328 per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC);
132 apic_write(APIC_LVTPC, APIC_DM_NMI); 329 apic_write(APIC_LVTPC, APIC_DM_NMI);
@@ -144,11 +341,15 @@ static int nmi_setup(void)
144 int cpu; 341 int cpu;
145 342
146 if (!allocate_msrs()) 343 if (!allocate_msrs())
147 return -ENOMEM; 344 err = -ENOMEM;
345 else if (!nmi_setup_mux())
346 err = -ENOMEM;
347 else
348 err = register_die_notifier(&profile_exceptions_nb);
148 349
149 err = register_die_notifier(&profile_exceptions_nb);
150 if (err) { 350 if (err) {
151 free_msrs(); 351 free_msrs();
352 nmi_shutdown_mux();
152 return err; 353 return err;
153 } 354 }
154 355
@@ -159,45 +360,38 @@ static int nmi_setup(void)
159 /* Assume saved/restored counters are the same on all CPUs */ 360 /* Assume saved/restored counters are the same on all CPUs */
160 model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); 361 model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
161 for_each_possible_cpu(cpu) { 362 for_each_possible_cpu(cpu) {
162 if (cpu != 0) { 363 if (!cpu)
163 memcpy(per_cpu(cpu_msrs, cpu).counters, 364 continue;
164 per_cpu(cpu_msrs, 0).counters, 365
165 sizeof(struct op_msr) * model->num_counters); 366 memcpy(per_cpu(cpu_msrs, cpu).counters,
166 367 per_cpu(cpu_msrs, 0).counters,
167 memcpy(per_cpu(cpu_msrs, cpu).controls, 368 sizeof(struct op_msr) * model->num_counters);
168 per_cpu(cpu_msrs, 0).controls, 369
169 sizeof(struct op_msr) * model->num_controls); 370 memcpy(per_cpu(cpu_msrs, cpu).controls,
170 } 371 per_cpu(cpu_msrs, 0).controls,
372 sizeof(struct op_msr) * model->num_controls);
171 373
374 mux_clone(cpu);
172 } 375 }
173 on_each_cpu(nmi_save_registers, NULL, 1);
174 on_each_cpu(nmi_cpu_setup, NULL, 1); 376 on_each_cpu(nmi_cpu_setup, NULL, 1);
175 nmi_enabled = 1; 377 nmi_enabled = 1;
176 return 0; 378 return 0;
177} 379}
178 380
179static void nmi_restore_registers(struct op_msrs *msrs) 381static void nmi_cpu_restore_registers(struct op_msrs *msrs)
180{ 382{
181 unsigned int const nr_ctrs = model->num_counters;
182 unsigned int const nr_ctrls = model->num_controls;
183 struct op_msr *counters = msrs->counters; 383 struct op_msr *counters = msrs->counters;
184 struct op_msr *controls = msrs->controls; 384 struct op_msr *controls = msrs->controls;
185 unsigned int i; 385 unsigned int i;
186 386
187 for (i = 0; i < nr_ctrls; ++i) { 387 for (i = 0; i < model->num_controls; ++i) {
188 if (controls[i].addr) { 388 if (controls[i].addr)
189 wrmsr(controls[i].addr, 389 wrmsrl(controls[i].addr, controls[i].saved);
190 controls[i].saved.low,
191 controls[i].saved.high);
192 }
193 } 390 }
194 391
195 for (i = 0; i < nr_ctrs; ++i) { 392 for (i = 0; i < model->num_counters; ++i) {
196 if (counters[i].addr) { 393 if (counters[i].addr)
197 wrmsr(counters[i].addr, 394 wrmsrl(counters[i].addr, counters[i].saved);
198 counters[i].saved.low,
199 counters[i].saved.high);
200 }
201 } 395 }
202} 396}
203 397
@@ -205,7 +399,7 @@ static void nmi_cpu_shutdown(void *dummy)
205{ 399{
206 unsigned int v; 400 unsigned int v;
207 int cpu = smp_processor_id(); 401 int cpu = smp_processor_id();
208 struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); 402 struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
209 403
210 /* restoring APIC_LVTPC can trigger an apic error because the delivery 404 /* restoring APIC_LVTPC can trigger an apic error because the delivery
211 * mode and vector nr combination can be illegal. That's by design: on 405 * mode and vector nr combination can be illegal. That's by design: on
@@ -216,7 +410,7 @@ static void nmi_cpu_shutdown(void *dummy)
216 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); 410 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
217 apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); 411 apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
218 apic_write(APIC_LVTERR, v); 412 apic_write(APIC_LVTERR, v);
219 nmi_restore_registers(msrs); 413 nmi_cpu_restore_registers(msrs);
220} 414}
221 415
222static void nmi_shutdown(void) 416static void nmi_shutdown(void)
@@ -226,42 +420,18 @@ static void nmi_shutdown(void)
226 nmi_enabled = 0; 420 nmi_enabled = 0;
227 on_each_cpu(nmi_cpu_shutdown, NULL, 1); 421 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
228 unregister_die_notifier(&profile_exceptions_nb); 422 unregister_die_notifier(&profile_exceptions_nb);
423 nmi_shutdown_mux();
229 msrs = &get_cpu_var(cpu_msrs); 424 msrs = &get_cpu_var(cpu_msrs);
230 model->shutdown(msrs); 425 model->shutdown(msrs);
231 free_msrs(); 426 free_msrs();
232 put_cpu_var(cpu_msrs); 427 put_cpu_var(cpu_msrs);
233} 428}
234 429
235static void nmi_cpu_start(void *dummy)
236{
237 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
238 model->start(msrs);
239}
240
241static int nmi_start(void)
242{
243 on_each_cpu(nmi_cpu_start, NULL, 1);
244 return 0;
245}
246
247static void nmi_cpu_stop(void *dummy)
248{
249 struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
250 model->stop(msrs);
251}
252
253static void nmi_stop(void)
254{
255 on_each_cpu(nmi_cpu_stop, NULL, 1);
256}
257
258struct op_counter_config counter_config[OP_MAX_COUNTER];
259
260static int nmi_create_files(struct super_block *sb, struct dentry *root) 430static int nmi_create_files(struct super_block *sb, struct dentry *root)
261{ 431{
262 unsigned int i; 432 unsigned int i;
263 433
264 for (i = 0; i < model->num_counters; ++i) { 434 for (i = 0; i < model->num_virt_counters; ++i) {
265 struct dentry *dir; 435 struct dentry *dir;
266 char buf[4]; 436 char buf[4];
267 437
@@ -270,7 +440,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
270 * NOTE: assumes 1:1 mapping here (that counters are organized 440 * NOTE: assumes 1:1 mapping here (that counters are organized
271 * sequentially in their struct assignment). 441 * sequentially in their struct assignment).
272 */ 442 */
273 if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i))) 443 if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i)))
274 continue; 444 continue;
275 445
276 snprintf(buf, sizeof(buf), "%d", i); 446 snprintf(buf, sizeof(buf), "%d", i);
@@ -402,6 +572,7 @@ module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
402static int __init ppro_init(char **cpu_type) 572static int __init ppro_init(char **cpu_type)
403{ 573{
404 __u8 cpu_model = boot_cpu_data.x86_model; 574 __u8 cpu_model = boot_cpu_data.x86_model;
575 struct op_x86_model_spec *spec = &op_ppro_spec; /* default */
405 576
406 if (force_arch_perfmon && cpu_has_arch_perfmon) 577 if (force_arch_perfmon && cpu_has_arch_perfmon)
407 return 0; 578 return 0;
@@ -428,7 +599,7 @@ static int __init ppro_init(char **cpu_type)
428 *cpu_type = "i386/core_2"; 599 *cpu_type = "i386/core_2";
429 break; 600 break;
430 case 26: 601 case 26:
431 arch_perfmon_setup_counters(); 602 spec = &op_arch_perfmon_spec;
432 *cpu_type = "i386/core_i7"; 603 *cpu_type = "i386/core_i7";
433 break; 604 break;
434 case 28: 605 case 28:
@@ -439,17 +610,7 @@ static int __init ppro_init(char **cpu_type)
439 return 0; 610 return 0;
440 } 611 }
441 612
442 model = &op_ppro_spec; 613 model = spec;
443 return 1;
444}
445
446static int __init arch_perfmon_init(char **cpu_type)
447{
448 if (!cpu_has_arch_perfmon)
449 return 0;
450 *cpu_type = "i386/arch_perfmon";
451 model = &op_arch_perfmon_spec;
452 arch_perfmon_setup_counters();
453 return 1; 614 return 1;
454} 615}
455 616
@@ -471,27 +632,26 @@ int __init op_nmi_init(struct oprofile_operations *ops)
471 /* Needs to be at least an Athlon (or hammer in 32bit mode) */ 632 /* Needs to be at least an Athlon (or hammer in 32bit mode) */
472 633
473 switch (family) { 634 switch (family) {
474 default:
475 return -ENODEV;
476 case 6: 635 case 6:
477 model = &op_amd_spec;
478 cpu_type = "i386/athlon"; 636 cpu_type = "i386/athlon";
479 break; 637 break;
480 case 0xf: 638 case 0xf:
481 model = &op_amd_spec; 639 /*
482 /* Actually it could be i386/hammer too, but give 640 * Actually it could be i386/hammer too, but
 483 user space an consistent name. */ 641 * give user space a consistent name.
642 */
484 cpu_type = "x86-64/hammer"; 643 cpu_type = "x86-64/hammer";
485 break; 644 break;
486 case 0x10: 645 case 0x10:
487 model = &op_amd_spec;
488 cpu_type = "x86-64/family10"; 646 cpu_type = "x86-64/family10";
489 break; 647 break;
490 case 0x11: 648 case 0x11:
491 model = &op_amd_spec;
492 cpu_type = "x86-64/family11h"; 649 cpu_type = "x86-64/family11h";
493 break; 650 break;
651 default:
652 return -ENODEV;
494 } 653 }
654 model = &op_amd_spec;
495 break; 655 break;
496 656
497 case X86_VENDOR_INTEL: 657 case X86_VENDOR_INTEL:
@@ -510,8 +670,15 @@ int __init op_nmi_init(struct oprofile_operations *ops)
510 break; 670 break;
511 } 671 }
512 672
513 if (!cpu_type && !arch_perfmon_init(&cpu_type)) 673 if (cpu_type)
674 break;
675
676 if (!cpu_has_arch_perfmon)
514 return -ENODEV; 677 return -ENODEV;
678
679 /* use arch perfmon as fallback */
680 cpu_type = "i386/arch_perfmon";
681 model = &op_arch_perfmon_spec;
515 break; 682 break;
516 683
517 default: 684 default:
@@ -522,18 +689,23 @@ int __init op_nmi_init(struct oprofile_operations *ops)
522 register_cpu_notifier(&oprofile_cpu_nb); 689 register_cpu_notifier(&oprofile_cpu_nb);
523#endif 690#endif
524 /* default values, can be overwritten by model */ 691 /* default values, can be overwritten by model */
525 ops->create_files = nmi_create_files; 692 ops->create_files = nmi_create_files;
526 ops->setup = nmi_setup; 693 ops->setup = nmi_setup;
527 ops->shutdown = nmi_shutdown; 694 ops->shutdown = nmi_shutdown;
528 ops->start = nmi_start; 695 ops->start = nmi_start;
529 ops->stop = nmi_stop; 696 ops->stop = nmi_stop;
530 ops->cpu_type = cpu_type; 697 ops->cpu_type = cpu_type;
531 698
532 if (model->init) 699 if (model->init)
533 ret = model->init(ops); 700 ret = model->init(ops);
534 if (ret) 701 if (ret)
535 return ret; 702 return ret;
536 703
704 if (!model->num_virt_counters)
705 model->num_virt_counters = model->num_counters;
706
707 mux_init(ops);
708
537 init_sysfs(); 709 init_sysfs();
538 using_nmi = 1; 710 using_nmi = 1;
539 printk(KERN_INFO "oprofile: using NMI interrupt.\n"); 711 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
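
The multiplexing path added above rotates a per-CPU window over the virtual counters: nmi_cpu_switch() advances switch_index by num_counters and wraps to 0 once the index runs past num_virt_counters or reaches an unconfigured event, and op_x86_phys_to_virt() simply adds that offset to a hardware counter number. A minimal user-space sketch of the rotation (the names and the 4/32 counter split are taken from the patch; the 'configured' cutoff stands in for the counter_config[si].count == 0 test and assumes events are enabled contiguously):

	#include <stdio.h>

	#define NUM_COUNTERS		4	/* physical counters, as in op_model_amd.c */
	#define NUM_VIRT_COUNTERS	32	/* virtual counters exposed via oprofilefs */

	static int switch_index;		/* models the per-CPU switch_index */

	/* op_x86_phys_to_virt(): offset a hardware counter into the current window */
	static int phys_to_virt(int phys)
	{
		return switch_index + phys;
	}

	/*
	 * The wrap logic of nmi_cpu_switch(), with 'configured' standing in
	 * for the first virtual counter whose count is 0.
	 */
	static void next_window(int configured)
	{
		int si = switch_index + NUM_COUNTERS;

		if (si > NUM_VIRT_COUNTERS || si >= configured)
			switch_index = 0;
		else
			switch_index = si;
	}

	int main(void)
	{
		int configured = 10;		/* pretend 10 virtual events are enabled */
		int round, i;

		for (round = 0; round < 4; round++) {
			for (i = 0; i < NUM_COUNTERS; i++)
				printf("round %d: phys %d carries virt %d\n",
				       round, i, phys_to_virt(i));
			next_window(configured);
		}
		return 0;
	}
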
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index 91b6a116165e..e28398df0df2 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -10,7 +10,7 @@
10#ifndef OP_COUNTER_H 10#ifndef OP_COUNTER_H
11#define OP_COUNTER_H 11#define OP_COUNTER_H
12 12
13#define OP_MAX_COUNTER 8 13#define OP_MAX_COUNTER 32
14 14
15/* Per-perfctr configuration as set via 15/* Per-perfctr configuration as set via
16 * oprofilefs. 16 * oprofilefs.
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 8fdf06e4edf9..39686c29f03a 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -9,12 +9,15 @@
9 * @author Philippe Elie 9 * @author Philippe Elie
10 * @author Graydon Hoare 10 * @author Graydon Hoare
11 * @author Robert Richter <robert.richter@amd.com> 11 * @author Robert Richter <robert.richter@amd.com>
12 * @author Barry Kasindorf 12 * @author Barry Kasindorf <barry.kasindorf@amd.com>
13 * @author Jason Yeh <jason.yeh@amd.com>
14 * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
13 */ 15 */
14 16
15#include <linux/oprofile.h> 17#include <linux/oprofile.h>
16#include <linux/device.h> 18#include <linux/device.h>
17#include <linux/pci.h> 19#include <linux/pci.h>
20#include <linux/percpu.h>
18 21
19#include <asm/ptrace.h> 22#include <asm/ptrace.h>
20#include <asm/msr.h> 23#include <asm/msr.h>
@@ -25,43 +28,36 @@
25 28
26#define NUM_COUNTERS 4 29#define NUM_COUNTERS 4
27#define NUM_CONTROLS 4 30#define NUM_CONTROLS 4
31#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
32#define NUM_VIRT_COUNTERS 32
33#define NUM_VIRT_CONTROLS 32
34#else
35#define NUM_VIRT_COUNTERS NUM_COUNTERS
36#define NUM_VIRT_CONTROLS NUM_CONTROLS
37#endif
38
39#define OP_EVENT_MASK 0x0FFF
40#define OP_CTR_OVERFLOW (1ULL<<31)
28 41
29#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) 42#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
30#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) 43
31#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) 44static unsigned long reset_value[NUM_VIRT_COUNTERS];
32#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
33
34#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
35#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
36#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
37#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
38#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
39#define CTRL_CLEAR_LO(x) (x &= (1<<21))
40#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
41#define CTRL_SET_ENABLE(val) (val |= 1<<20)
42#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
43#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
44#define CTRL_SET_UM(val, m) (val |= (m << 8))
45#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
46#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
47#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
48#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
49
50static unsigned long reset_value[NUM_COUNTERS];
51 45
52#ifdef CONFIG_OPROFILE_IBS 46#ifdef CONFIG_OPROFILE_IBS
53 47
54/* IbsFetchCtl bits/masks */ 48/* IbsFetchCtl bits/masks */
55#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ 49#define IBS_FETCH_RAND_EN (1ULL<<57)
56#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ 50#define IBS_FETCH_VAL (1ULL<<49)
57#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ 51#define IBS_FETCH_ENABLE (1ULL<<48)
52#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL
58 53
59/*IbsOpCtl bits */ 54/*IbsOpCtl bits */
60#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ 55#define IBS_OP_CNT_CTL (1ULL<<19)
61#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ 56#define IBS_OP_VAL (1ULL<<18)
57#define IBS_OP_ENABLE (1ULL<<17)
62 58
63#define IBS_FETCH_SIZE 6 59#define IBS_FETCH_SIZE 6
64#define IBS_OP_SIZE 12 60#define IBS_OP_SIZE 12
65 61
66static int has_ibs; /* AMD Family10h and later */ 62static int has_ibs; /* AMD Family10h and later */
67 63
@@ -78,6 +74,45 @@ static struct op_ibs_config ibs_config;
78 74
79#endif 75#endif
80 76
77#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
78
79static void op_mux_fill_in_addresses(struct op_msrs * const msrs)
80{
81 int i;
82
83 for (i = 0; i < NUM_VIRT_COUNTERS; i++) {
84 int hw_counter = op_x86_virt_to_phys(i);
85 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
86 msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter;
87 else
88 msrs->multiplex[i].addr = 0;
89 }
90}
91
92static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
93 struct op_msrs const * const msrs)
94{
95 u64 val;
96 int i;
97
98 /* enable active counters */
99 for (i = 0; i < NUM_COUNTERS; ++i) {
100 int virt = op_x86_phys_to_virt(i);
101 if (!counter_config[virt].enabled)
102 continue;
103 rdmsrl(msrs->controls[i].addr, val);
104 val &= model->reserved;
105 val |= op_x86_get_ctrl(model, &counter_config[virt]);
106 wrmsrl(msrs->controls[i].addr, val);
107 }
108}
109
110#else
111
112static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { }
113
114#endif
115
81/* functions for op_amd_spec */ 116/* functions for op_amd_spec */
82 117
83static void op_amd_fill_in_addresses(struct op_msrs * const msrs) 118static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
@@ -97,150 +132,174 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
97 else 132 else
98 msrs->controls[i].addr = 0; 133 msrs->controls[i].addr = 0;
99 } 134 }
100}
101 135
136 op_mux_fill_in_addresses(msrs);
137}
102 138
103static void op_amd_setup_ctrs(struct op_msrs const * const msrs) 139static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
140 struct op_msrs const * const msrs)
104{ 141{
105 unsigned int low, high; 142 u64 val;
106 int i; 143 int i;
107 144
145 /* setup reset_value */
146 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
147 if (counter_config[i].enabled)
148 reset_value[i] = counter_config[i].count;
149 else
150 reset_value[i] = 0;
151 }
152
108 /* clear all counters */ 153 /* clear all counters */
109 for (i = 0 ; i < NUM_CONTROLS; ++i) { 154 for (i = 0; i < NUM_CONTROLS; ++i) {
110 if (unlikely(!CTRL_IS_RESERVED(msrs, i))) 155 if (unlikely(!msrs->controls[i].addr))
111 continue; 156 continue;
112 CTRL_READ(low, high, msrs, i); 157 rdmsrl(msrs->controls[i].addr, val);
113 CTRL_CLEAR_LO(low); 158 val &= model->reserved;
114 CTRL_CLEAR_HI(high); 159 wrmsrl(msrs->controls[i].addr, val);
115 CTRL_WRITE(low, high, msrs, i);
116 } 160 }
117 161
118 /* avoid a false detection of ctr overflows in NMI handler */ 162 /* avoid a false detection of ctr overflows in NMI handler */
119 for (i = 0; i < NUM_COUNTERS; ++i) { 163 for (i = 0; i < NUM_COUNTERS; ++i) {
120 if (unlikely(!CTR_IS_RESERVED(msrs, i))) 164 if (unlikely(!msrs->counters[i].addr))
121 continue; 165 continue;
122 CTR_WRITE(1, msrs, i); 166 wrmsrl(msrs->counters[i].addr, -1LL);
123 } 167 }
124 168
125 /* enable active counters */ 169 /* enable active counters */
126 for (i = 0; i < NUM_COUNTERS; ++i) { 170 for (i = 0; i < NUM_COUNTERS; ++i) {
127 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { 171 int virt = op_x86_phys_to_virt(i);
128 reset_value[i] = counter_config[i].count; 172 if (!counter_config[virt].enabled)
173 continue;
174 if (!msrs->counters[i].addr)
175 continue;
129 176
130 CTR_WRITE(counter_config[i].count, msrs, i); 177 /* setup counter registers */
131 178 wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
132 CTRL_READ(low, high, msrs, i); 179
133 CTRL_CLEAR_LO(low); 180 /* setup control registers */
134 CTRL_CLEAR_HI(high); 181 rdmsrl(msrs->controls[i].addr, val);
135 CTRL_SET_ENABLE(low); 182 val &= model->reserved;
136 CTRL_SET_USR(low, counter_config[i].user); 183 val |= op_x86_get_ctrl(model, &counter_config[virt]);
137 CTRL_SET_KERN(low, counter_config[i].kernel); 184 wrmsrl(msrs->controls[i].addr, val);
138 CTRL_SET_UM(low, counter_config[i].unit_mask);
139 CTRL_SET_EVENT_LOW(low, counter_config[i].event);
140 CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
141 CTRL_SET_HOST_ONLY(high, 0);
142 CTRL_SET_GUEST_ONLY(high, 0);
143
144 CTRL_WRITE(low, high, msrs, i);
145 } else {
146 reset_value[i] = 0;
147 }
148 } 185 }
149} 186}
150 187
151#ifdef CONFIG_OPROFILE_IBS 188#ifdef CONFIG_OPROFILE_IBS
152 189
153static inline int 190static inline void
154op_amd_handle_ibs(struct pt_regs * const regs, 191op_amd_handle_ibs(struct pt_regs * const regs,
155 struct op_msrs const * const msrs) 192 struct op_msrs const * const msrs)
156{ 193{
157 u32 low, high; 194 u64 val, ctl;
158 u64 msr;
159 struct op_entry entry; 195 struct op_entry entry;
160 196
161 if (!has_ibs) 197 if (!has_ibs)
162 return 1; 198 return;
163 199
164 if (ibs_config.fetch_enabled) { 200 if (ibs_config.fetch_enabled) {
165 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); 201 rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
166 if (high & IBS_FETCH_HIGH_VALID_BIT) { 202 if (ctl & IBS_FETCH_VAL) {
167 rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); 203 rdmsrl(MSR_AMD64_IBSFETCHLINAD, val);
168 oprofile_write_reserve(&entry, regs, msr, 204 oprofile_write_reserve(&entry, regs, val,
169 IBS_FETCH_CODE, IBS_FETCH_SIZE); 205 IBS_FETCH_CODE, IBS_FETCH_SIZE);
170 oprofile_add_data(&entry, (u32)msr); 206 oprofile_add_data64(&entry, val);
171 oprofile_add_data(&entry, (u32)(msr >> 32)); 207 oprofile_add_data64(&entry, ctl);
172 oprofile_add_data(&entry, low); 208 rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val);
173 oprofile_add_data(&entry, high); 209 oprofile_add_data64(&entry, val);
174 rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr);
175 oprofile_add_data(&entry, (u32)msr);
176 oprofile_add_data(&entry, (u32)(msr >> 32));
177 oprofile_write_commit(&entry); 210 oprofile_write_commit(&entry);
178 211
179 /* reenable the IRQ */ 212 /* reenable the IRQ */
180 high &= ~IBS_FETCH_HIGH_VALID_BIT; 213 ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK);
181 high |= IBS_FETCH_HIGH_ENABLE; 214 ctl |= IBS_FETCH_ENABLE;
182 low &= IBS_FETCH_LOW_MAX_CNT_MASK; 215 wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
183 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
184 } 216 }
185 } 217 }
186 218
187 if (ibs_config.op_enabled) { 219 if (ibs_config.op_enabled) {
188 rdmsr(MSR_AMD64_IBSOPCTL, low, high); 220 rdmsrl(MSR_AMD64_IBSOPCTL, ctl);
189 if (low & IBS_OP_LOW_VALID_BIT) { 221 if (ctl & IBS_OP_VAL) {
190 rdmsrl(MSR_AMD64_IBSOPRIP, msr); 222 rdmsrl(MSR_AMD64_IBSOPRIP, val);
191 oprofile_write_reserve(&entry, regs, msr, 223 oprofile_write_reserve(&entry, regs, val,
192 IBS_OP_CODE, IBS_OP_SIZE); 224 IBS_OP_CODE, IBS_OP_SIZE);
193 oprofile_add_data(&entry, (u32)msr); 225 oprofile_add_data64(&entry, val);
194 oprofile_add_data(&entry, (u32)(msr >> 32)); 226 rdmsrl(MSR_AMD64_IBSOPDATA, val);
195 rdmsrl(MSR_AMD64_IBSOPDATA, msr); 227 oprofile_add_data64(&entry, val);
196 oprofile_add_data(&entry, (u32)msr); 228 rdmsrl(MSR_AMD64_IBSOPDATA2, val);
197 oprofile_add_data(&entry, (u32)(msr >> 32)); 229 oprofile_add_data64(&entry, val);
198 rdmsrl(MSR_AMD64_IBSOPDATA2, msr); 230 rdmsrl(MSR_AMD64_IBSOPDATA3, val);
199 oprofile_add_data(&entry, (u32)msr); 231 oprofile_add_data64(&entry, val);
200 oprofile_add_data(&entry, (u32)(msr >> 32)); 232 rdmsrl(MSR_AMD64_IBSDCLINAD, val);
201 rdmsrl(MSR_AMD64_IBSOPDATA3, msr); 233 oprofile_add_data64(&entry, val);
202 oprofile_add_data(&entry, (u32)msr); 234 rdmsrl(MSR_AMD64_IBSDCPHYSAD, val);
203 oprofile_add_data(&entry, (u32)(msr >> 32)); 235 oprofile_add_data64(&entry, val);
204 rdmsrl(MSR_AMD64_IBSDCLINAD, msr);
205 oprofile_add_data(&entry, (u32)msr);
206 oprofile_add_data(&entry, (u32)(msr >> 32));
207 rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr);
208 oprofile_add_data(&entry, (u32)msr);
209 oprofile_add_data(&entry, (u32)(msr >> 32));
210 oprofile_write_commit(&entry); 236 oprofile_write_commit(&entry);
211 237
212 /* reenable the IRQ */ 238 /* reenable the IRQ */
213 high = 0; 239 ctl &= ~IBS_OP_VAL & 0xFFFFFFFF;
214 low &= ~IBS_OP_LOW_VALID_BIT; 240 ctl |= IBS_OP_ENABLE;
215 low |= IBS_OP_LOW_ENABLE; 241 wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
216 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
217 } 242 }
218 } 243 }
244}
219 245
220 return 1; 246static inline void op_amd_start_ibs(void)
247{
248 u64 val;
249 if (has_ibs && ibs_config.fetch_enabled) {
250 val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
251 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
252 val |= IBS_FETCH_ENABLE;
253 wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
254 }
255
256 if (has_ibs && ibs_config.op_enabled) {
257 val = (ibs_config.max_cnt_op >> 4) & 0xFFFF;
258 val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0;
259 val |= IBS_OP_ENABLE;
260 wrmsrl(MSR_AMD64_IBSOPCTL, val);
261 }
262}
263
264static void op_amd_stop_ibs(void)
265{
266 if (has_ibs && ibs_config.fetch_enabled)
267 /* clear max count and enable */
268 wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
269
270 if (has_ibs && ibs_config.op_enabled)
271 /* clear max count and enable */
272 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
221} 273}
222 274
275#else
276
277static inline void op_amd_handle_ibs(struct pt_regs * const regs,
278 struct op_msrs const * const msrs) { }
279static inline void op_amd_start_ibs(void) { }
280static inline void op_amd_stop_ibs(void) { }
281
223#endif 282#endif
224 283
225static int op_amd_check_ctrs(struct pt_regs * const regs, 284static int op_amd_check_ctrs(struct pt_regs * const regs,
226 struct op_msrs const * const msrs) 285 struct op_msrs const * const msrs)
227{ 286{
228 unsigned int low, high; 287 u64 val;
229 int i; 288 int i;
230 289
231 for (i = 0 ; i < NUM_COUNTERS; ++i) { 290 for (i = 0; i < NUM_COUNTERS; ++i) {
232 if (!reset_value[i]) 291 int virt = op_x86_phys_to_virt(i);
292 if (!reset_value[virt])
233 continue; 293 continue;
234 CTR_READ(low, high, msrs, i); 294 rdmsrl(msrs->counters[i].addr, val);
235 if (CTR_OVERFLOWED(low)) { 295 /* bit is clear if overflowed: */
236 oprofile_add_sample(regs, i); 296 if (val & OP_CTR_OVERFLOW)
237 CTR_WRITE(reset_value[i], msrs, i); 297 continue;
238 } 298 oprofile_add_sample(regs, virt);
299 wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
239 } 300 }
240 301
241#ifdef CONFIG_OPROFILE_IBS
242 op_amd_handle_ibs(regs, msrs); 302 op_amd_handle_ibs(regs, msrs);
243#endif
244 303
245 /* See op_model_ppro.c */ 304 /* See op_model_ppro.c */
246 return 1; 305 return 1;
@@ -248,79 +307,50 @@ static int op_amd_check_ctrs(struct pt_regs * const regs,
248 307
249static void op_amd_start(struct op_msrs const * const msrs) 308static void op_amd_start(struct op_msrs const * const msrs)
250{ 309{
251 unsigned int low, high; 310 u64 val;
252 int i; 311 int i;
253 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
254 if (reset_value[i]) {
255 CTRL_READ(low, high, msrs, i);
256 CTRL_SET_ACTIVE(low);
257 CTRL_WRITE(low, high, msrs, i);
258 }
259 }
260 312
261#ifdef CONFIG_OPROFILE_IBS 313 for (i = 0; i < NUM_COUNTERS; ++i) {
262 if (has_ibs && ibs_config.fetch_enabled) { 314 if (!reset_value[op_x86_phys_to_virt(i)])
263 low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; 315 continue;
264 high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ 316 rdmsrl(msrs->controls[i].addr, val);
265 + IBS_FETCH_HIGH_ENABLE; 317 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
266 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); 318 wrmsrl(msrs->controls[i].addr, val);
267 } 319 }
268 320
269 if (has_ibs && ibs_config.op_enabled) { 321 op_amd_start_ibs();
270 low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF)
271 + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */
272 + IBS_OP_LOW_ENABLE;
273 high = 0;
274 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
275 }
276#endif
277} 322}
278 323
279
280static void op_amd_stop(struct op_msrs const * const msrs) 324static void op_amd_stop(struct op_msrs const * const msrs)
281{ 325{
282 unsigned int low, high; 326 u64 val;
283 int i; 327 int i;
284 328
285 /* 329 /*
286 * Subtle: stop on all counters to avoid race with setting our 330 * Subtle: stop on all counters to avoid race with setting our
287 * pm callback 331 * pm callback
288 */ 332 */
289 for (i = 0 ; i < NUM_COUNTERS ; ++i) { 333 for (i = 0; i < NUM_COUNTERS; ++i) {
290 if (!reset_value[i]) 334 if (!reset_value[op_x86_phys_to_virt(i)])
291 continue; 335 continue;
292 CTRL_READ(low, high, msrs, i); 336 rdmsrl(msrs->controls[i].addr, val);
293 CTRL_SET_INACTIVE(low); 337 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
294 CTRL_WRITE(low, high, msrs, i); 338 wrmsrl(msrs->controls[i].addr, val);
295 }
296
297#ifdef CONFIG_OPROFILE_IBS
298 if (has_ibs && ibs_config.fetch_enabled) {
299 /* clear max count and enable */
300 low = 0;
301 high = 0;
302 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
303 } 339 }
304 340
305 if (has_ibs && ibs_config.op_enabled) { 341 op_amd_stop_ibs();
306 /* clear max count and enable */
307 low = 0;
308 high = 0;
309 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
310 }
311#endif
312} 342}
313 343
314static void op_amd_shutdown(struct op_msrs const * const msrs) 344static void op_amd_shutdown(struct op_msrs const * const msrs)
315{ 345{
316 int i; 346 int i;
317 347
318 for (i = 0 ; i < NUM_COUNTERS ; ++i) { 348 for (i = 0; i < NUM_COUNTERS; ++i) {
319 if (CTR_IS_RESERVED(msrs, i)) 349 if (msrs->counters[i].addr)
320 release_perfctr_nmi(MSR_K7_PERFCTR0 + i); 350 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
321 } 351 }
322 for (i = 0 ; i < NUM_CONTROLS ; ++i) { 352 for (i = 0; i < NUM_CONTROLS; ++i) {
323 if (CTRL_IS_RESERVED(msrs, i)) 353 if (msrs->controls[i].addr)
324 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); 354 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
325 } 355 }
326} 356}
@@ -490,15 +520,21 @@ static void op_amd_exit(void) {}
490 520
491#endif /* CONFIG_OPROFILE_IBS */ 521#endif /* CONFIG_OPROFILE_IBS */
492 522
493struct op_x86_model_spec const op_amd_spec = { 523struct op_x86_model_spec op_amd_spec = {
494 .init = op_amd_init,
495 .exit = op_amd_exit,
496 .num_counters = NUM_COUNTERS, 524 .num_counters = NUM_COUNTERS,
497 .num_controls = NUM_CONTROLS, 525 .num_controls = NUM_CONTROLS,
526 .num_virt_counters = NUM_VIRT_COUNTERS,
527 .reserved = MSR_AMD_EVENTSEL_RESERVED,
528 .event_mask = OP_EVENT_MASK,
529 .init = op_amd_init,
530 .exit = op_amd_exit,
498 .fill_in_addresses = &op_amd_fill_in_addresses, 531 .fill_in_addresses = &op_amd_fill_in_addresses,
499 .setup_ctrs = &op_amd_setup_ctrs, 532 .setup_ctrs = &op_amd_setup_ctrs,
500 .check_ctrs = &op_amd_check_ctrs, 533 .check_ctrs = &op_amd_check_ctrs,
501 .start = &op_amd_start, 534 .start = &op_amd_start,
502 .stop = &op_amd_stop, 535 .stop = &op_amd_stop,
503 .shutdown = &op_amd_shutdown 536 .shutdown = &op_amd_shutdown,
537#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
538 .switch_ctrl = &op_mux_switch_ctrl,
539#endif
504}; 540};
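
Most of the churn in op_model_amd.c is the switch from rdmsr()/wrmsr() on two 32-bit halves plus the CTRL_* helper macros to rdmsrl()/wrmsrl() on a single u64 masked against model->reserved. A small stand-alone illustration of why the two forms are equivalent (plain C with made-up register contents, not kernel code):

	#include <stdint.h>
	#include <stdio.h>
	#include <assert.h>

	int main(void)
	{
		/* what rdmsr() would have returned as two halves (made-up values) */
		uint32_t low  = 0x00530076;
		uint32_t high = 0x00000001;

		/* what rdmsrl() returns as one value */
		uint64_t val = ((uint64_t)high << 32) | low;

		/* the old CTRL_CLEAR_LO/CTRL_CLEAR_HI pair ... */
		uint32_t lo = low, hi = high;
		lo &= 1 << 21;
		hi &= 0xfffffcf0;

		/* ... is the same operation as one mask against MSR_AMD_EVENTSEL_RESERVED */
		uint64_t reserved = ((uint64_t)0xfffffcf0 << 32) | (1 << 21);
		val &= reserved;

		assert(val == (((uint64_t)hi << 32) | lo));
		printf("masked value: %#llx\n", (unsigned long long)val);
		return 0;
	}
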
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 819b131fd752..ac6b354becdf 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -32,6 +32,8 @@
32#define NUM_CCCRS_HT2 9 32#define NUM_CCCRS_HT2 9
33#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) 33#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2)
34 34
35#define OP_CTR_OVERFLOW (1ULL<<31)
36
35static unsigned int num_counters = NUM_COUNTERS_NON_HT; 37static unsigned int num_counters = NUM_COUNTERS_NON_HT;
36static unsigned int num_controls = NUM_CONTROLS_NON_HT; 38static unsigned int num_controls = NUM_CONTROLS_NON_HT;
37 39
@@ -350,8 +352,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
350#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) 352#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
351#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) 353#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
352#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) 354#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
353#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
354#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
355 355
356#define CCCR_RESERVED_BITS 0x38030FFF 356#define CCCR_RESERVED_BITS 0x38030FFF
357#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) 357#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -361,17 +361,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
361#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) 361#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
362#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) 362#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
363#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) 363#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
364#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
365#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
366#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) 364#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
367#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) 365#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
368 366
369#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
370#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
371#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
372#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
373#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
374
375 367
376/* this assigns a "stagger" to the current CPU, which is used throughout 368/* this assigns a "stagger" to the current CPU, which is used throughout
377 the code in this module as an extra array offset, to select the "even" 369 the code in this module as an extra array offset, to select the "even"
@@ -515,7 +507,7 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
515 if (ev->bindings[i].virt_counter & counter_bit) { 507 if (ev->bindings[i].virt_counter & counter_bit) {
516 508
517 /* modify ESCR */ 509 /* modify ESCR */
518 ESCR_READ(escr, high, ev, i); 510 rdmsr(ev->bindings[i].escr_address, escr, high);
519 ESCR_CLEAR(escr); 511 ESCR_CLEAR(escr);
520 if (stag == 0) { 512 if (stag == 0) {
521 ESCR_SET_USR_0(escr, counter_config[ctr].user); 513 ESCR_SET_USR_0(escr, counter_config[ctr].user);
@@ -526,10 +518,11 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
526 } 518 }
527 ESCR_SET_EVENT_SELECT(escr, ev->event_select); 519 ESCR_SET_EVENT_SELECT(escr, ev->event_select);
528 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); 520 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
529 ESCR_WRITE(escr, high, ev, i); 521 wrmsr(ev->bindings[i].escr_address, escr, high);
530 522
531 /* modify CCCR */ 523 /* modify CCCR */
532 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); 524 rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
525 cccr, high);
533 CCCR_CLEAR(cccr); 526 CCCR_CLEAR(cccr);
534 CCCR_SET_REQUIRED_BITS(cccr); 527 CCCR_SET_REQUIRED_BITS(cccr);
535 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); 528 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
@@ -537,7 +530,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
537 CCCR_SET_PMI_OVF_0(cccr); 530 CCCR_SET_PMI_OVF_0(cccr);
538 else 531 else
539 CCCR_SET_PMI_OVF_1(cccr); 532 CCCR_SET_PMI_OVF_1(cccr);
540 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); 533 wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
534 cccr, high);
541 return; 535 return;
542 } 536 }
543 } 537 }
@@ -548,7 +542,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
548} 542}
549 543
550 544
551static void p4_setup_ctrs(struct op_msrs const * const msrs) 545static void p4_setup_ctrs(struct op_x86_model_spec const *model,
546 struct op_msrs const * const msrs)
552{ 547{
553 unsigned int i; 548 unsigned int i;
554 unsigned int low, high; 549 unsigned int low, high;
@@ -563,8 +558,8 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
563 } 558 }
564 559
565 /* clear the cccrs we will use */ 560 /* clear the cccrs we will use */
566 for (i = 0 ; i < num_counters ; i++) { 561 for (i = 0; i < num_counters; i++) {
567 if (unlikely(!CTRL_IS_RESERVED(msrs, i))) 562 if (unlikely(!msrs->controls[i].addr))
568 continue; 563 continue;
569 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); 564 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
570 CCCR_CLEAR(low); 565 CCCR_CLEAR(low);
@@ -574,17 +569,18 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
574 569
575 /* clear all escrs (including those outside our concern) */ 570 /* clear all escrs (including those outside our concern) */
576 for (i = num_counters; i < num_controls; i++) { 571 for (i = num_counters; i < num_controls; i++) {
577 if (unlikely(!CTRL_IS_RESERVED(msrs, i))) 572 if (unlikely(!msrs->controls[i].addr))
578 continue; 573 continue;
579 wrmsr(msrs->controls[i].addr, 0, 0); 574 wrmsr(msrs->controls[i].addr, 0, 0);
580 } 575 }
581 576
582 /* setup all counters */ 577 /* setup all counters */
583 for (i = 0 ; i < num_counters ; ++i) { 578 for (i = 0; i < num_counters; ++i) {
584 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { 579 if (counter_config[i].enabled && msrs->controls[i].addr) {
585 reset_value[i] = counter_config[i].count; 580 reset_value[i] = counter_config[i].count;
586 pmc_setup_one_p4_counter(i); 581 pmc_setup_one_p4_counter(i);
587 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); 582 wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address,
583 -(u64)counter_config[i].count);
588 } else { 584 } else {
589 reset_value[i] = 0; 585 reset_value[i] = 0;
590 } 586 }
@@ -624,14 +620,16 @@ static int p4_check_ctrs(struct pt_regs * const regs,
624 620
625 real = VIRT_CTR(stag, i); 621 real = VIRT_CTR(stag, i);
626 622
627 CCCR_READ(low, high, real); 623 rdmsr(p4_counters[real].cccr_address, low, high);
628 CTR_READ(ctr, high, real); 624 rdmsr(p4_counters[real].counter_address, ctr, high);
629 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { 625 if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) {
630 oprofile_add_sample(regs, i); 626 oprofile_add_sample(regs, i);
631 CTR_WRITE(reset_value[i], real); 627 wrmsrl(p4_counters[real].counter_address,
628 -(u64)reset_value[i]);
632 CCCR_CLEAR_OVF(low); 629 CCCR_CLEAR_OVF(low);
633 CCCR_WRITE(low, high, real); 630 wrmsr(p4_counters[real].cccr_address, low, high);
634 CTR_WRITE(reset_value[i], real); 631 wrmsrl(p4_counters[real].counter_address,
632 -(u64)reset_value[i]);
635 } 633 }
636 } 634 }
637 635
@@ -653,9 +651,9 @@ static void p4_start(struct op_msrs const * const msrs)
653 for (i = 0; i < num_counters; ++i) { 651 for (i = 0; i < num_counters; ++i) {
654 if (!reset_value[i]) 652 if (!reset_value[i])
655 continue; 653 continue;
656 CCCR_READ(low, high, VIRT_CTR(stag, i)); 654 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
657 CCCR_SET_ENABLE(low); 655 CCCR_SET_ENABLE(low);
658 CCCR_WRITE(low, high, VIRT_CTR(stag, i)); 656 wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
659 } 657 }
660} 658}
661 659
@@ -670,9 +668,9 @@ static void p4_stop(struct op_msrs const * const msrs)
670 for (i = 0; i < num_counters; ++i) { 668 for (i = 0; i < num_counters; ++i) {
671 if (!reset_value[i]) 669 if (!reset_value[i])
672 continue; 670 continue;
673 CCCR_READ(low, high, VIRT_CTR(stag, i)); 671 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
674 CCCR_SET_DISABLE(low); 672 CCCR_SET_DISABLE(low);
675 CCCR_WRITE(low, high, VIRT_CTR(stag, i)); 673 wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
676 } 674 }
677} 675}
678 676
@@ -680,8 +678,8 @@ static void p4_shutdown(struct op_msrs const * const msrs)
680{ 678{
681 int i; 679 int i;
682 680
683 for (i = 0 ; i < num_counters ; ++i) { 681 for (i = 0; i < num_counters; ++i) {
684 if (CTR_IS_RESERVED(msrs, i)) 682 if (msrs->counters[i].addr)
685 release_perfctr_nmi(msrs->counters[i].addr); 683 release_perfctr_nmi(msrs->counters[i].addr);
686 } 684 }
687 /* 685 /*
@@ -689,15 +687,15 @@ static void p4_shutdown(struct op_msrs const * const msrs)
689 * conjunction with the counter registers (hence the starting offset). 687 * conjunction with the counter registers (hence the starting offset).
690 * This saves a few bits. 688 * This saves a few bits.
691 */ 689 */
692 for (i = num_counters ; i < num_controls ; ++i) { 690 for (i = num_counters; i < num_controls; ++i) {
693 if (CTRL_IS_RESERVED(msrs, i)) 691 if (msrs->controls[i].addr)
694 release_evntsel_nmi(msrs->controls[i].addr); 692 release_evntsel_nmi(msrs->controls[i].addr);
695 } 693 }
696} 694}
697 695
698 696
699#ifdef CONFIG_SMP 697#ifdef CONFIG_SMP
700struct op_x86_model_spec const op_p4_ht2_spec = { 698struct op_x86_model_spec op_p4_ht2_spec = {
701 .num_counters = NUM_COUNTERS_HT2, 699 .num_counters = NUM_COUNTERS_HT2,
702 .num_controls = NUM_CONTROLS_HT2, 700 .num_controls = NUM_CONTROLS_HT2,
703 .fill_in_addresses = &p4_fill_in_addresses, 701 .fill_in_addresses = &p4_fill_in_addresses,
@@ -709,7 +707,7 @@ struct op_x86_model_spec const op_p4_ht2_spec = {
709}; 707};
710#endif 708#endif
711 709
712struct op_x86_model_spec const op_p4_spec = { 710struct op_x86_model_spec op_p4_spec = {
713 .num_counters = NUM_COUNTERS_NON_HT, 711 .num_counters = NUM_COUNTERS_NON_HT,
714 .num_controls = NUM_CONTROLS_NON_HT, 712 .num_controls = NUM_CONTROLS_NON_HT,
715 .fill_in_addresses = &p4_fill_in_addresses, 713 .fill_in_addresses = &p4_fill_in_addresses,
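
Both the AMD and P4 paths now arm a counter by writing -(u64)count and treat a cleared bit 31 (OP_CTR_OVERFLOW) as "this counter overflowed". A stand-alone sketch of that arithmetic, assuming a 48-bit counter width purely for illustration:

	#include <stdint.h>
	#include <stdio.h>

	#define CTR_WIDTH	48			/* assumed width, for illustration */
	#define CTR_MASK	((1ULL << CTR_WIDTH) - 1)
	#define OP_CTR_OVERFLOW	(1ULL << 31)		/* same test bit as the patch */

	int main(void)
	{
		uint64_t count = 100000;		/* sampling period */

		/* what wrmsrl(addr, -(u64)count) leaves in a CTR_WIDTH-bit counter */
		uint64_t ctr = (0 - count) & CTR_MASK;

		printf("armed:      %#llx  overflow bit %s\n",
		       (unsigned long long)ctr,
		       (ctr & OP_CTR_OVERFLOW) ? "set" : "clear");

		/* the counter counts up; after 'count' events it wraps to zero */
		ctr = (ctr + count) & CTR_MASK;

		/* bit 31 is now clear, which is what op_amd_check_ctrs() tests for */
		printf("after wrap: %#llx  overflow bit %s\n",
		       (unsigned long long)ctr,
		       (ctr & OP_CTR_OVERFLOW) ? "set" : "clear");
		return 0;
	}
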
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 4da7230b3d17..4899215999de 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -10,6 +10,7 @@
10 * @author Philippe Elie 10 * @author Philippe Elie
11 * @author Graydon Hoare 11 * @author Graydon Hoare
12 * @author Andi Kleen 12 * @author Andi Kleen
13 * @author Robert Richter <robert.richter@amd.com>
13 */ 14 */
14 15
15#include <linux/oprofile.h> 16#include <linux/oprofile.h>
@@ -18,7 +19,6 @@
18#include <asm/msr.h> 19#include <asm/msr.h>
19#include <asm/apic.h> 20#include <asm/apic.h>
20#include <asm/nmi.h> 21#include <asm/nmi.h>
21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
@@ -26,20 +26,7 @@
26static int num_counters = 2; 26static int num_counters = 2;
27static int counter_width = 32; 27static int counter_width = 32;
28 28
29#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) 29#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21))
30#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1))))
31
32#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
33#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
34#define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
35#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
36#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
37#define CTRL_CLEAR(x) (x &= (1<<21))
38#define CTRL_SET_ENABLE(val) (val |= 1<<20)
39#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
40#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
41#define CTRL_SET_UM(val, m) (val |= (m << 8))
42#define CTRL_SET_EVENT(val, e) (val |= e)
43 30
44static u64 *reset_value; 31static u64 *reset_value;
45 32
@@ -63,9 +50,10 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs)
63} 50}
64 51
65 52
66static void ppro_setup_ctrs(struct op_msrs const * const msrs) 53static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
54 struct op_msrs const * const msrs)
67{ 55{
68 unsigned int low, high; 56 u64 val;
69 int i; 57 int i;
70 58
71 if (!reset_value) { 59 if (!reset_value) {
@@ -93,36 +81,30 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
93 } 81 }
94 82
95 /* clear all counters */ 83 /* clear all counters */
96 for (i = 0 ; i < num_counters; ++i) { 84 for (i = 0; i < num_counters; ++i) {
97 if (unlikely(!CTRL_IS_RESERVED(msrs, i))) 85 if (unlikely(!msrs->controls[i].addr))
98 continue; 86 continue;
99 CTRL_READ(low, high, msrs, i); 87 rdmsrl(msrs->controls[i].addr, val);
100 CTRL_CLEAR(low); 88 val &= model->reserved;
101 CTRL_WRITE(low, high, msrs, i); 89 wrmsrl(msrs->controls[i].addr, val);
102 } 90 }
103 91
104 /* avoid a false detection of ctr overflows in NMI handler */ 92 /* avoid a false detection of ctr overflows in NMI handler */
105 for (i = 0; i < num_counters; ++i) { 93 for (i = 0; i < num_counters; ++i) {
106 if (unlikely(!CTR_IS_RESERVED(msrs, i))) 94 if (unlikely(!msrs->counters[i].addr))
107 continue; 95 continue;
108 wrmsrl(msrs->counters[i].addr, -1LL); 96 wrmsrl(msrs->counters[i].addr, -1LL);
109 } 97 }
110 98
111 /* enable active counters */ 99 /* enable active counters */
112 for (i = 0; i < num_counters; ++i) { 100 for (i = 0; i < num_counters; ++i) {
113 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { 101 if (counter_config[i].enabled && msrs->counters[i].addr) {
114 reset_value[i] = counter_config[i].count; 102 reset_value[i] = counter_config[i].count;
115
116 wrmsrl(msrs->counters[i].addr, -reset_value[i]); 103 wrmsrl(msrs->counters[i].addr, -reset_value[i]);
117 104 rdmsrl(msrs->controls[i].addr, val);
118 CTRL_READ(low, high, msrs, i); 105 val &= model->reserved;
119 CTRL_CLEAR(low); 106 val |= op_x86_get_ctrl(model, &counter_config[i]);
120 CTRL_SET_ENABLE(low); 107 wrmsrl(msrs->controls[i].addr, val);
121 CTRL_SET_USR(low, counter_config[i].user);
122 CTRL_SET_KERN(low, counter_config[i].kernel);
123 CTRL_SET_UM(low, counter_config[i].unit_mask);
124 CTRL_SET_EVENT(low, counter_config[i].event);
125 CTRL_WRITE(low, high, msrs, i);
126 } else { 108 } else {
127 reset_value[i] = 0; 109 reset_value[i] = 0;
128 } 110 }
@@ -143,14 +125,14 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
143 if (unlikely(!reset_value)) 125 if (unlikely(!reset_value))
144 goto out; 126 goto out;
145 127
146 for (i = 0 ; i < num_counters; ++i) { 128 for (i = 0; i < num_counters; ++i) {
147 if (!reset_value[i]) 129 if (!reset_value[i])
148 continue; 130 continue;
149 rdmsrl(msrs->counters[i].addr, val); 131 rdmsrl(msrs->counters[i].addr, val);
150 if (CTR_OVERFLOWED(val)) { 132 if (val & (1ULL << (counter_width - 1)))
151 oprofile_add_sample(regs, i); 133 continue;
152 wrmsrl(msrs->counters[i].addr, -reset_value[i]); 134 oprofile_add_sample(regs, i);
153 } 135 wrmsrl(msrs->counters[i].addr, -reset_value[i]);
154 } 136 }
155 137
156out: 138out:
@@ -171,16 +153,16 @@ out:
171 153
172static void ppro_start(struct op_msrs const * const msrs) 154static void ppro_start(struct op_msrs const * const msrs)
173{ 155{
174 unsigned int low, high; 156 u64 val;
175 int i; 157 int i;
176 158
177 if (!reset_value) 159 if (!reset_value)
178 return; 160 return;
179 for (i = 0; i < num_counters; ++i) { 161 for (i = 0; i < num_counters; ++i) {
180 if (reset_value[i]) { 162 if (reset_value[i]) {
181 CTRL_READ(low, high, msrs, i); 163 rdmsrl(msrs->controls[i].addr, val);
182 CTRL_SET_ACTIVE(low); 164 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
183 CTRL_WRITE(low, high, msrs, i); 165 wrmsrl(msrs->controls[i].addr, val);
184 } 166 }
185 } 167 }
186} 168}
@@ -188,7 +170,7 @@ static void ppro_start(struct op_msrs const * const msrs)
188 170
189static void ppro_stop(struct op_msrs const * const msrs) 171static void ppro_stop(struct op_msrs const * const msrs)
190{ 172{
191 unsigned int low, high; 173 u64 val;
192 int i; 174 int i;
193 175
194 if (!reset_value) 176 if (!reset_value)
@@ -196,9 +178,9 @@ static void ppro_stop(struct op_msrs const * const msrs)
196 for (i = 0; i < num_counters; ++i) { 178 for (i = 0; i < num_counters; ++i) {
197 if (!reset_value[i]) 179 if (!reset_value[i])
198 continue; 180 continue;
199 CTRL_READ(low, high, msrs, i); 181 rdmsrl(msrs->controls[i].addr, val);
200 CTRL_SET_INACTIVE(low); 182 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
201 CTRL_WRITE(low, high, msrs, i); 183 wrmsrl(msrs->controls[i].addr, val);
202 } 184 }
203} 185}
204 186
@@ -206,12 +188,12 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
206{ 188{
207 int i; 189 int i;
208 190
209 for (i = 0 ; i < num_counters ; ++i) { 191 for (i = 0; i < num_counters; ++i) {
210 if (CTR_IS_RESERVED(msrs, i)) 192 if (msrs->counters[i].addr)
211 release_perfctr_nmi(MSR_P6_PERFCTR0 + i); 193 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
212 } 194 }
213 for (i = 0 ; i < num_counters ; ++i) { 195 for (i = 0; i < num_counters; ++i) {
214 if (CTRL_IS_RESERVED(msrs, i)) 196 if (msrs->controls[i].addr)
215 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); 197 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
216 } 198 }
217 if (reset_value) { 199 if (reset_value) {
@@ -222,8 +204,9 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
222 204
223 205
224struct op_x86_model_spec op_ppro_spec = { 206struct op_x86_model_spec op_ppro_spec = {
225 .num_counters = 2, /* can be overriden */ 207 .num_counters = 2,
226 .num_controls = 2, /* dito */ 208 .num_controls = 2,
209 .reserved = MSR_PPRO_EVENTSEL_RESERVED,
227 .fill_in_addresses = &ppro_fill_in_addresses, 210 .fill_in_addresses = &ppro_fill_in_addresses,
228 .setup_ctrs = &ppro_setup_ctrs, 211 .setup_ctrs = &ppro_setup_ctrs,
229 .check_ctrs = &ppro_check_ctrs, 212 .check_ctrs = &ppro_check_ctrs,
@@ -241,7 +224,7 @@ struct op_x86_model_spec op_ppro_spec = {
241 * the specific CPU. 224 * the specific CPU.
242 */ 225 */
243 226
244void arch_perfmon_setup_counters(void) 227static void arch_perfmon_setup_counters(void)
245{ 228{
246 union cpuid10_eax eax; 229 union cpuid10_eax eax;
247 230
@@ -259,11 +242,17 @@ void arch_perfmon_setup_counters(void)
259 242
260 op_arch_perfmon_spec.num_counters = num_counters; 243 op_arch_perfmon_spec.num_counters = num_counters;
261 op_arch_perfmon_spec.num_controls = num_counters; 244 op_arch_perfmon_spec.num_controls = num_counters;
262 op_ppro_spec.num_counters = num_counters; 245}
263 op_ppro_spec.num_controls = num_counters; 246
247static int arch_perfmon_init(struct oprofile_operations *ignore)
248{
249 arch_perfmon_setup_counters();
250 return 0;
264} 251}
265 252
266struct op_x86_model_spec op_arch_perfmon_spec = { 253struct op_x86_model_spec op_arch_perfmon_spec = {
254 .reserved = MSR_PPRO_EVENTSEL_RESERVED,
255 .init = &arch_perfmon_init,
267 /* num_counters/num_controls filled in at runtime */ 256 /* num_counters/num_controls filled in at runtime */
268 .fill_in_addresses = &ppro_fill_in_addresses, 257 .fill_in_addresses = &ppro_fill_in_addresses,
269 /* user space does the cpuid check for available events */ 258 /* user space does the cpuid check for available events */
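
arch_perfmon_setup_counters() above sizes num_counters from the architectural performance-monitoring CPUID leaf (union cpuid10_eax). A user-space sketch of reading the same fields, assuming the usual leaf-0xA EAX layout (version in bits 7:0, counter count in 15:8, counter width in 23:16); __get_cpuid() comes from the compiler's <cpuid.h>:

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* CPUID leaf 0xA: architectural performance monitoring */
		if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx)) {
			puts("CPUID leaf 0xA not available");
			return 1;
		}

		printf("version:       %u\n", eax & 0xff);
		printf("num counters:  %u\n", (eax >> 8) & 0xff);
		printf("counter width: %u bits\n", (eax >> 16) & 0xff);
		return 0;
	}
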
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 825e79064d64..b83776180c7f 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -6,51 +6,66 @@
6 * @remark Read the file COPYING 6 * @remark Read the file COPYING
7 * 7 *
8 * @author Graydon Hoare 8 * @author Graydon Hoare
9 * @author Robert Richter <robert.richter@amd.com>
9 */ 10 */
10 11
11#ifndef OP_X86_MODEL_H 12#ifndef OP_X86_MODEL_H
12#define OP_X86_MODEL_H 13#define OP_X86_MODEL_H
13 14
14struct op_saved_msr { 15#include <asm/types.h>
15 unsigned int high; 16#include <asm/perf_counter.h>
16 unsigned int low;
17};
18 17
19struct op_msr { 18struct op_msr {
20 unsigned long addr; 19 unsigned long addr;
21 struct op_saved_msr saved; 20 u64 saved;
22}; 21};
23 22
24struct op_msrs { 23struct op_msrs {
25 struct op_msr *counters; 24 struct op_msr *counters;
26 struct op_msr *controls; 25 struct op_msr *controls;
26 struct op_msr *multiplex;
27}; 27};
28 28
29struct pt_regs; 29struct pt_regs;
30 30
31struct oprofile_operations;
32
31/* The model vtable abstracts the differences between 33/* The model vtable abstracts the differences between
32 * various x86 CPU models' perfctr support. 34 * various x86 CPU models' perfctr support.
33 */ 35 */
34struct op_x86_model_spec { 36struct op_x86_model_spec {
35 int (*init)(struct oprofile_operations *ops); 37 unsigned int num_counters;
36 void (*exit)(void); 38 unsigned int num_controls;
37 unsigned int num_counters; 39 unsigned int num_virt_counters;
38 unsigned int num_controls; 40 u64 reserved;
39 void (*fill_in_addresses)(struct op_msrs * const msrs); 41 u16 event_mask;
40 void (*setup_ctrs)(struct op_msrs const * const msrs); 42 int (*init)(struct oprofile_operations *ops);
41 int (*check_ctrs)(struct pt_regs * const regs, 43 void (*exit)(void);
42 struct op_msrs const * const msrs); 44 void (*fill_in_addresses)(struct op_msrs * const msrs);
43 void (*start)(struct op_msrs const * const msrs); 45 void (*setup_ctrs)(struct op_x86_model_spec const *model,
44 void (*stop)(struct op_msrs const * const msrs); 46 struct op_msrs const * const msrs);
45 void (*shutdown)(struct op_msrs const * const msrs); 47 int (*check_ctrs)(struct pt_regs * const regs,
48 struct op_msrs const * const msrs);
49 void (*start)(struct op_msrs const * const msrs);
50 void (*stop)(struct op_msrs const * const msrs);
51 void (*shutdown)(struct op_msrs const * const msrs);
52#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
53 void (*switch_ctrl)(struct op_x86_model_spec const *model,
54 struct op_msrs const * const msrs);
55#endif
46}; 56};
47 57
58struct op_counter_config;
59
60extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
61 struct op_counter_config *counter_config);
62extern int op_x86_phys_to_virt(int phys);
63extern int op_x86_virt_to_phys(int virt);
64
48extern struct op_x86_model_spec op_ppro_spec; 65extern struct op_x86_model_spec op_ppro_spec;
49extern struct op_x86_model_spec const op_p4_spec; 66extern struct op_x86_model_spec op_p4_spec;
50extern struct op_x86_model_spec const op_p4_ht2_spec; 67extern struct op_x86_model_spec op_p4_ht2_spec;
51extern struct op_x86_model_spec const op_amd_spec; 68extern struct op_x86_model_spec op_amd_spec;
52extern struct op_x86_model_spec op_arch_perfmon_spec; 69extern struct op_x86_model_spec op_arch_perfmon_spec;
53 70
54extern void arch_perfmon_setup_counters(void);
55
56#endif /* OP_X86_MODEL_H */ 71#endif /* OP_X86_MODEL_H */
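
The header now declares op_x86_get_ctrl(), whose implementation lives in nmi_int.c outside the hunks shown here. Purely as a hypothetical sketch of what such a helper assembles, not the kernel's actual code, the EVNTSEL value can be built from the per-counter configuration using the same bit positions the removed CTRL_SET_* macros encoded; the struct name, helper name, and example event number below are made up:

	#include <stdint.h>
	#include <stdio.h>

	/* pared-down stand-in for struct op_counter_config (hypothetical) */
	struct counter_cfg {
		unsigned long event;
		unsigned long unit_mask;
		unsigned long kernel;
		unsigned long user;
	};

	/*
	 * Hypothetical get_ctrl-style helper: assemble an EVNTSEL value the
	 * way the removed CTRL_SET_* macros did (event 7:0 plus 35:32 on AMD,
	 * unit mask 15:8, USR bit 16, OS bit 17, APIC-interrupt bit 20).
	 */
	static uint64_t get_ctrl(const struct counter_cfg *cfg)
	{
		uint64_t val = 0;

		val |= (uint64_t)(cfg->event & 0xff);		/* event[7:0]            */
		val |= (uint64_t)(cfg->event & 0xf00) << 24;	/* event[11:8] -> 35:32  */
		val |= (uint64_t)(cfg->unit_mask & 0xff) << 8;	/* unit mask             */
		val |= (uint64_t)(cfg->user & 1) << 16;		/* count user mode       */
		val |= (uint64_t)(cfg->kernel & 1) << 17;	/* count kernel mode     */
		val |= 1ULL << 20;				/* interrupt on overflow */

		return val;
	}

	int main(void)
	{
		/* example event number only; pick whatever your CPU documents */
		struct counter_cfg cfg = { .event = 0x76, .unit_mask = 0,
					   .kernel = 1, .user = 1 };

		printf("evntsel = %#llx\n", (unsigned long long)get_ctrl(&cfg));
		return 0;
	}
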
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index bd13c3e4c6db..347d882b3bb3 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -192,13 +192,14 @@ struct pci_raw_ops pci_direct_conf2 = {
192static int __init pci_sanity_check(struct pci_raw_ops *o) 192static int __init pci_sanity_check(struct pci_raw_ops *o)
193{ 193{
194 u32 x = 0; 194 u32 x = 0;
195 int devfn; 195 int year, devfn;
196 196
197 if (pci_probe & PCI_NO_CHECKS) 197 if (pci_probe & PCI_NO_CHECKS)
198 return 1; 198 return 1;
199 /* Assume Type 1 works for newer systems. 199 /* Assume Type 1 works for newer systems.
200 This handles machines that don't have anything on PCI Bus 0. */ 200 This handles machines that don't have anything on PCI Bus 0. */
201 if (dmi_get_year(DMI_BIOS_DATE) >= 2001) 201 dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL);
202 if (year >= 2001)
202 return 1; 203 return 1;
203 204
204 for (devfn = 0; devfn < 0x100; devfn++) { 205 for (devfn = 0; devfn < 0x100; devfn++) {
diff --git a/drivers/acpi/blacklist.c b/drivers/acpi/blacklist.c
index f6baa77deefb..0c4ca4d318b3 100644
--- a/drivers/acpi/blacklist.c
+++ b/drivers/acpi/blacklist.c
@@ -78,9 +78,10 @@ static struct acpi_blacklist_item acpi_blacklist[] __initdata = {
78 78
79static int __init blacklist_by_year(void) 79static int __init blacklist_by_year(void)
80{ 80{
81 int year = dmi_get_year(DMI_BIOS_DATE); 81 int year;
82
82 /* Doesn't exist? Likely an old system */ 83 /* Doesn't exist? Likely an old system */
83 if (year == -1) { 84 if (!dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL)) {
84 printk(KERN_ERR PREFIX "no DMI BIOS year, " 85 printk(KERN_ERR PREFIX "no DMI BIOS year, "
85 "acpi=force is required to enable ACPI\n" ); 86 "acpi=force is required to enable ACPI\n" );
86 return 1; 87 return 1;
diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig
index b17c57f85032..ab2fa4eeb364 100644
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@ -26,6 +26,17 @@ config ATA_NONSTANDARD
26 bool 26 bool
27 default n 27 default n
28 28
29config ATA_VERBOSE_ERROR
30 bool "Verbose ATA error reporting"
31 default y
32 help
33 This option adds parsing of ATA command descriptions and error bits
34 in libata kernel output, making it easier to interpret.
35 This option will enlarge the kernel by approx. 6KB. Disable it only
36 if kernel size is more important than ease of debugging.
37
38 If unsure, say Y.
39
29config ATA_ACPI 40config ATA_ACPI
30 bool "ATA ACPI Support" 41 bool "ATA ACPI Support"
31 depends on ACPI && PCI 42 depends on ACPI && PCI
@@ -586,6 +597,16 @@ config PATA_RB532
586 597
587 If unsure, say N. 598 If unsure, say N.
588 599
600config PATA_RDC
601 tristate "RDC PATA support"
602 depends on PCI
603 help
 604	  This option enables basic support for the later RDC PATA
605 controllers via the new ATA layer. For the RDC 1010, you need to
606 enable the IT821X driver instead.
607
608 If unsure, say N.
609
589config PATA_RZ1000 610config PATA_RZ1000
590 tristate "PC Tech RZ1000 PATA support" 611 tristate "PC Tech RZ1000 PATA support"
591 depends on PCI 612 depends on PCI
diff --git a/drivers/ata/Makefile b/drivers/ata/Makefile
index 38906f9bbb4e..463eb52236aa 100644
--- a/drivers/ata/Makefile
+++ b/drivers/ata/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_PATA_PDC_OLD) += pata_pdc202xx_old.o
57obj-$(CONFIG_PATA_QDI) += pata_qdi.o 57obj-$(CONFIG_PATA_QDI) += pata_qdi.o
58obj-$(CONFIG_PATA_RADISYS) += pata_radisys.o 58obj-$(CONFIG_PATA_RADISYS) += pata_radisys.o
59obj-$(CONFIG_PATA_RB532) += pata_rb532_cf.o 59obj-$(CONFIG_PATA_RB532) += pata_rb532_cf.o
60obj-$(CONFIG_PATA_RDC) += pata_rdc.o
60obj-$(CONFIG_PATA_RZ1000) += pata_rz1000.o 61obj-$(CONFIG_PATA_RZ1000) += pata_rz1000.o
61obj-$(CONFIG_PATA_SC1200) += pata_sc1200.o 62obj-$(CONFIG_PATA_SC1200) += pata_sc1200.o
62obj-$(CONFIG_PATA_SERVERWORKS) += pata_serverworks.o 63obj-$(CONFIG_PATA_SERVERWORKS) += pata_serverworks.o
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index fe3eba5d6b3e..d4cd9c203314 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -329,10 +329,24 @@ static ssize_t ahci_activity_store(struct ata_device *dev,
329 enum sw_activity val); 329 enum sw_activity val);
330static void ahci_init_sw_activity(struct ata_link *link); 330static void ahci_init_sw_activity(struct ata_link *link);
331 331
332static ssize_t ahci_show_host_caps(struct device *dev,
333 struct device_attribute *attr, char *buf);
334static ssize_t ahci_show_host_version(struct device *dev,
335 struct device_attribute *attr, char *buf);
336static ssize_t ahci_show_port_cmd(struct device *dev,
337 struct device_attribute *attr, char *buf);
338
339DEVICE_ATTR(ahci_host_caps, S_IRUGO, ahci_show_host_caps, NULL);
340DEVICE_ATTR(ahci_host_version, S_IRUGO, ahci_show_host_version, NULL);
341DEVICE_ATTR(ahci_port_cmd, S_IRUGO, ahci_show_port_cmd, NULL);
342
332static struct device_attribute *ahci_shost_attrs[] = { 343static struct device_attribute *ahci_shost_attrs[] = {
333 &dev_attr_link_power_management_policy, 344 &dev_attr_link_power_management_policy,
334 &dev_attr_em_message_type, 345 &dev_attr_em_message_type,
335 &dev_attr_em_message, 346 &dev_attr_em_message,
347 &dev_attr_ahci_host_caps,
348 &dev_attr_ahci_host_version,
349 &dev_attr_ahci_port_cmd,
336 NULL 350 NULL
337}; 351};
338 352
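
The three attributes declared above are plain read-only sysfs files. A small userspace sketch for reading them; the /sys/class/scsi_host/hostN/ location is an assumption about where Scsi_Host attributes are exposed, and host0 is only an example:

#include <stdio.h>

/* Dump one sysfs attribute file. */
static void dump_attr(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	dump_attr("/sys/class/scsi_host/host0/ahci_host_caps");
	dump_attr("/sys/class/scsi_host/host0/ahci_host_version");
	dump_attr("/sys/class/scsi_host/host0/ahci_port_cmd");
	return 0;
}
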
@@ -539,6 +553,12 @@ static const struct pci_device_id ahci_pci_tbl[] = {
539 { PCI_VDEVICE(ATI, 0x4394), board_ahci_sb700 }, /* ATI SB700/800 */ 553 { PCI_VDEVICE(ATI, 0x4394), board_ahci_sb700 }, /* ATI SB700/800 */
540 { PCI_VDEVICE(ATI, 0x4395), board_ahci_sb700 }, /* ATI SB700/800 */ 554 { PCI_VDEVICE(ATI, 0x4395), board_ahci_sb700 }, /* ATI SB700/800 */
541 555
556 /* AMD */
557 { PCI_VDEVICE(AMD, 0x7800), board_ahci }, /* AMD SB900 */
558 /* AMD is using RAID class only for ahci controllers */
559 { PCI_VENDOR_ID_AMD, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
560 PCI_CLASS_STORAGE_RAID << 8, 0xffffff, board_ahci },
561
542 /* VIA */ 562 /* VIA */
543 { PCI_VDEVICE(VIA, 0x3349), board_ahci_vt8251 }, /* VIA VT8251 */ 563 { PCI_VDEVICE(VIA, 0x3349), board_ahci_vt8251 }, /* VIA VT8251 */
544 { PCI_VDEVICE(VIA, 0x6287), board_ahci_vt8251 }, /* VIA VT8251 */ 564 { PCI_VDEVICE(VIA, 0x6287), board_ahci_vt8251 }, /* VIA VT8251 */
@@ -702,6 +722,36 @@ static void ahci_enable_ahci(void __iomem *mmio)
702 WARN_ON(1); 722 WARN_ON(1);
703} 723}
704 724
725static ssize_t ahci_show_host_caps(struct device *dev,
726 struct device_attribute *attr, char *buf)
727{
728 struct Scsi_Host *shost = class_to_shost(dev);
729 struct ata_port *ap = ata_shost_to_port(shost);
730 struct ahci_host_priv *hpriv = ap->host->private_data;
731
732 return sprintf(buf, "%x\n", hpriv->cap);
733}
734
735static ssize_t ahci_show_host_version(struct device *dev,
736 struct device_attribute *attr, char *buf)
737{
738 struct Scsi_Host *shost = class_to_shost(dev);
739 struct ata_port *ap = ata_shost_to_port(shost);
740 void __iomem *mmio = ap->host->iomap[AHCI_PCI_BAR];
741
742 return sprintf(buf, "%x\n", readl(mmio + HOST_VERSION));
743}
744
745static ssize_t ahci_show_port_cmd(struct device *dev,
746 struct device_attribute *attr, char *buf)
747{
748 struct Scsi_Host *shost = class_to_shost(dev);
749 struct ata_port *ap = ata_shost_to_port(shost);
750 void __iomem *port_mmio = ahci_port_base(ap);
751
752 return sprintf(buf, "%x\n", readl(port_mmio + PORT_CMD));
753}
754
705/** 755/**
706 * ahci_save_initial_config - Save and fixup initial config values 756 * ahci_save_initial_config - Save and fixup initial config values
707 * @pdev: target PCI device 757 * @pdev: target PCI device
@@ -1584,7 +1634,7 @@ static void ahci_fill_cmd_slot(struct ahci_port_priv *pp, unsigned int tag,
1584 pp->cmd_slot[tag].tbl_addr_hi = cpu_to_le32((cmd_tbl_dma >> 16) >> 16); 1634 pp->cmd_slot[tag].tbl_addr_hi = cpu_to_le32((cmd_tbl_dma >> 16) >> 16);
1585} 1635}
1586 1636
1587static int ahci_kick_engine(struct ata_port *ap, int force_restart) 1637static int ahci_kick_engine(struct ata_port *ap)
1588{ 1638{
1589 void __iomem *port_mmio = ahci_port_base(ap); 1639 void __iomem *port_mmio = ahci_port_base(ap);
1590 struct ahci_host_priv *hpriv = ap->host->private_data; 1640 struct ahci_host_priv *hpriv = ap->host->private_data;
@@ -1592,18 +1642,16 @@ static int ahci_kick_engine(struct ata_port *ap, int force_restart)
1592 u32 tmp; 1642 u32 tmp;
1593 int busy, rc; 1643 int busy, rc;
1594 1644
1595 /* do we need to kick the port? */
1596 busy = status & (ATA_BUSY | ATA_DRQ);
1597 if (!busy && !force_restart)
1598 return 0;
1599
1600 /* stop engine */ 1645 /* stop engine */
1601 rc = ahci_stop_engine(ap); 1646 rc = ahci_stop_engine(ap);
1602 if (rc) 1647 if (rc)
1603 goto out_restart; 1648 goto out_restart;
1604 1649
1605 /* need to do CLO? */ 1650 /* need to do CLO?
1606 if (!busy) { 1651 * always do CLO if PMP is attached (AHCI-1.3 9.2)
1652 */
1653 busy = status & (ATA_BUSY | ATA_DRQ);
1654 if (!busy && !sata_pmp_attached(ap)) {
1607 rc = 0; 1655 rc = 0;
1608 goto out_restart; 1656 goto out_restart;
1609 } 1657 }
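
With the force_restart parameter gone, ahci_kick_engine() always stops the engine and then decides about CLO: it is needed when the port still reports BSY/DRQ, and done unconditionally when a port multiplier is attached (AHCI-1.3 9.2). A standalone sketch of just that decision, with the status bits defined locally to mirror the ATA status register layout:

#include <stdio.h>
#include <stdbool.h>

/* ATA status register bits, same values as in <linux/ata.h> */
#define ATA_BUSY	0x80	/* BSY */
#define ATA_DRQ		0x08	/* DRQ */

/* CLO is needed if the port is still busy or if a PMP is attached. */
static bool need_clo(unsigned char status, bool pmp_attached)
{
	return (status & (ATA_BUSY | ATA_DRQ)) || pmp_attached;
}

int main(void)
{
	printf("idle, no PMP: %d\n", need_clo(0x50, false));	/* 0 */
	printf("busy, no PMP: %d\n", need_clo(0xd0, false));	/* 1 */
	printf("idle, PMP   : %d\n", need_clo(0x50, true));	/* 1 */
	return 0;
}
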
@@ -1651,7 +1699,7 @@ static int ahci_exec_polled_cmd(struct ata_port *ap, int pmp,
1651 tmp = ata_wait_register(port_mmio + PORT_CMD_ISSUE, 0x1, 0x1, 1699 tmp = ata_wait_register(port_mmio + PORT_CMD_ISSUE, 0x1, 0x1,
1652 1, timeout_msec); 1700 1, timeout_msec);
1653 if (tmp & 0x1) { 1701 if (tmp & 0x1) {
1654 ahci_kick_engine(ap, 1); 1702 ahci_kick_engine(ap);
1655 return -EBUSY; 1703 return -EBUSY;
1656 } 1704 }
1657 } else 1705 } else
@@ -1674,7 +1722,7 @@ static int ahci_do_softreset(struct ata_link *link, unsigned int *class,
1674 DPRINTK("ENTER\n"); 1722 DPRINTK("ENTER\n");
1675 1723
1676 /* prepare for SRST (AHCI-1.1 10.4.1) */ 1724 /* prepare for SRST (AHCI-1.1 10.4.1) */
1677 rc = ahci_kick_engine(ap, 1); 1725 rc = ahci_kick_engine(ap);
1678 if (rc && rc != -EOPNOTSUPP) 1726 if (rc && rc != -EOPNOTSUPP)
1679 ata_link_printk(link, KERN_WARNING, 1727 ata_link_printk(link, KERN_WARNING,
1680 "failed to reset engine (errno=%d)\n", rc); 1728 "failed to reset engine (errno=%d)\n", rc);
@@ -1890,7 +1938,7 @@ static int ahci_p5wdh_hardreset(struct ata_link *link, unsigned int *class,
1890 rc = ata_wait_after_reset(link, jiffies + 2 * HZ, 1938 rc = ata_wait_after_reset(link, jiffies + 2 * HZ,
1891 ahci_check_ready); 1939 ahci_check_ready);
1892 if (rc) 1940 if (rc)
1893 ahci_kick_engine(ap, 0); 1941 ahci_kick_engine(ap);
1894 } 1942 }
1895 return rc; 1943 return rc;
1896} 1944}
@@ -2271,7 +2319,7 @@ static void ahci_post_internal_cmd(struct ata_queued_cmd *qc)
2271 2319
2272 /* make DMA engine forget about the failed command */ 2320 /* make DMA engine forget about the failed command */
2273 if (qc->flags & ATA_QCFLAG_FAILED) 2321 if (qc->flags & ATA_QCFLAG_FAILED)
2274 ahci_kick_engine(ap, 1); 2322 ahci_kick_engine(ap);
2275} 2323}
2276 2324
2277static void ahci_pmp_attach(struct ata_port *ap) 2325static void ahci_pmp_attach(struct ata_port *ap)
@@ -2603,14 +2651,18 @@ static void ahci_p5wdh_workaround(struct ata_host *host)
2603} 2651}
2604 2652
2605/* 2653/*
2606 * SB600 ahci controller on ASUS M2A-VM can't do 64bit DMA with older 2654 * SB600 ahci controller on certain boards can't do 64bit DMA with
2607 * BIOS. The oldest version known to be broken is 0901 and working is 2655 * older BIOS.
2608 * 1501 which was released on 2007-10-26. Force 32bit DMA on anything
2609 * older than 1501. Please read bko#9412 for more info.
2610 */ 2656 */
2611static bool ahci_asus_m2a_vm_32bit_only(struct pci_dev *pdev) 2657static bool ahci_sb600_32bit_only(struct pci_dev *pdev)
2612{ 2658{
2613 static const struct dmi_system_id sysids[] = { 2659 static const struct dmi_system_id sysids[] = {
2660 /*
2661 * The oldest version known to be broken is 0901 and
2662 * working is 1501 which was released on 2007-10-26.
2663 * Force 32bit DMA on anything older than 1501.
2664 * Please read bko#9412 for more info.
2665 */
2614 { 2666 {
2615 .ident = "ASUS M2A-VM", 2667 .ident = "ASUS M2A-VM",
2616 .matches = { 2668 .matches = {
@@ -2618,31 +2670,48 @@ static bool ahci_asus_m2a_vm_32bit_only(struct pci_dev *pdev)
2618 "ASUSTeK Computer INC."), 2670 "ASUSTeK Computer INC."),
2619 DMI_MATCH(DMI_BOARD_NAME, "M2A-VM"), 2671 DMI_MATCH(DMI_BOARD_NAME, "M2A-VM"),
2620 }, 2672 },
2673 .driver_data = "20071026", /* yyyymmdd */
2674 },
2675 /*
2676 * It's yet unknown whether more recent BIOS fixes the
2677 * problem. Blacklist the whole board for the time
2678 * being. Please read the following thread for more
2679 * info.
2680 *
2681 * http://thread.gmane.org/gmane.linux.ide/42326
2682 */
2683 {
2684 .ident = "Gigabyte GA-MA69VM-S2",
2685 .matches = {
2686 DMI_MATCH(DMI_BOARD_VENDOR,
2687 "Gigabyte Technology Co., Ltd."),
2688 DMI_MATCH(DMI_BOARD_NAME, "GA-MA69VM-S2"),
2689 },
2621 }, 2690 },
2622 { } 2691 { }
2623 }; 2692 };
2624 const char *cutoff_mmdd = "10/26"; 2693 const struct dmi_system_id *match;
2625 const char *date;
2626 int year;
2627 2694
2695 match = dmi_first_match(sysids);
2628 if (pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x12, 0) || 2696 if (pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x12, 0) ||
2629 !dmi_check_system(sysids)) 2697 !match)
2630 return false; 2698 return false;
2631 2699
2632 /* 2700 if (match->driver_data) {
2633 * Argh.... both version and date are free form strings. 2701 int year, month, date;
2634 * Let's hope they're using the same date format across 2702 char buf[9];
2635 * different versions. 2703
2636 */ 2704 dmi_get_date(DMI_BIOS_DATE, &year, &month, &date);
2637 date = dmi_get_system_info(DMI_BIOS_DATE); 2705 snprintf(buf, sizeof(buf), "%04d%02d%02d", year, month, date);
2638 year = dmi_get_year(DMI_BIOS_DATE);
2639 if (date && strlen(date) >= 10 && date[2] == '/' && date[5] == '/' &&
2640 (year > 2007 ||
2641 (year == 2007 && strncmp(date, cutoff_mmdd, 5) >= 0)))
2642 return false;
2643 2706
2644 dev_printk(KERN_WARNING, &pdev->dev, "ASUS M2A-VM: BIOS too old, " 2707 if (strcmp(buf, match->driver_data) >= 0)
2645 "forcing 32bit DMA, update BIOS\n"); 2708 return false;
2709
2710 dev_printk(KERN_WARNING, &pdev->dev, "%s: BIOS too old, "
2711 "forcing 32bit DMA, update BIOS\n", match->ident);
2712 } else
2713 dev_printk(KERN_WARNING, &pdev->dev, "%s: this board can't "
2714 "do 64bit DMA, forcing 32bit\n", match->ident);
2646 2715
2647 return true; 2716 return true;
2648} 2717}
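
The rewritten quirk stores the BIOS cutoff as a zero-padded yyyymmdd string, so a plain strcmp() gives chronological order against the date reported by dmi_get_date(). A userspace sketch of that comparison, using the cutoff from the ASUS M2A-VM entry:

#include <stdio.h>
#include <string.h>

/* Returns 1 if a BIOS dated (year, month, day) is at least as new as
 * the yyyymmdd cutoff string, 0 otherwise. */
static int bios_new_enough(int year, int month, int day, const char *cutoff)
{
	char buf[9];

	/* zero-padded yyyymmdd sorts chronologically as a string */
	snprintf(buf, sizeof(buf), "%04d%02d%02d", year, month, day);
	return strcmp(buf, cutoff) >= 0;
}

int main(void)
{
	printf("%d\n", bios_new_enough(2007, 10, 26, "20071026")); /* 1: 64bit DMA ok */
	printf("%d\n", bios_new_enough(2007,  9,  1, "20071026")); /* 0: force 32bit */
	return 0;
}
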
@@ -2857,8 +2926,8 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
2857 if (board_id == board_ahci_sb700 && pdev->revision >= 0x40) 2926 if (board_id == board_ahci_sb700 && pdev->revision >= 0x40)
2858 hpriv->flags &= ~AHCI_HFLAG_IGN_SERR_INTERNAL; 2927 hpriv->flags &= ~AHCI_HFLAG_IGN_SERR_INTERNAL;
2859 2928
2860 /* apply ASUS M2A_VM quirk */ 2929 /* apply sb600 32bit only quirk */
2861 if (ahci_asus_m2a_vm_32bit_only(pdev)) 2930 if (ahci_sb600_32bit_only(pdev))
2862 hpriv->flags |= AHCI_HFLAG_32BIT_ONLY; 2931 hpriv->flags |= AHCI_HFLAG_32BIT_ONLY;
2863 2932
2864 if (!(hpriv->flags & AHCI_HFLAG_NO_MSI)) 2933 if (!(hpriv->flags & AHCI_HFLAG_NO_MSI))
@@ -2869,7 +2938,7 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
2869 2938
2870 /* prepare host */ 2939 /* prepare host */
2871 if (hpriv->cap & HOST_CAP_NCQ) 2940 if (hpriv->cap & HOST_CAP_NCQ)
2872 pi.flags |= ATA_FLAG_NCQ; 2941 pi.flags |= ATA_FLAG_NCQ | ATA_FLAG_FPDMA_AA;
2873 2942
2874 if (hpriv->cap & HOST_CAP_PMP) 2943 if (hpriv->cap & HOST_CAP_PMP)
2875 pi.flags |= ATA_FLAG_PMP; 2944 pi.flags |= ATA_FLAG_PMP;
diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index ac176da1f94e..01964b6e6f6b 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c
@@ -689,6 +689,7 @@ static int ata_acpi_run_tf(struct ata_device *dev,
689 struct ata_taskfile tf, ptf, rtf; 689 struct ata_taskfile tf, ptf, rtf;
690 unsigned int err_mask; 690 unsigned int err_mask;
691 const char *level; 691 const char *level;
692 const char *descr;
692 char msg[60]; 693 char msg[60];
693 int rc; 694 int rc;
694 695
@@ -736,11 +737,13 @@ static int ata_acpi_run_tf(struct ata_device *dev,
736 snprintf(msg, sizeof(msg), "filtered out"); 737 snprintf(msg, sizeof(msg), "filtered out");
737 rc = 0; 738 rc = 0;
738 } 739 }
740 descr = ata_get_cmd_descript(tf.command);
739 741
740 ata_dev_printk(dev, level, 742 ata_dev_printk(dev, level,
741 "ACPI cmd %02x/%02x:%02x:%02x:%02x:%02x:%02x %s\n", 743 "ACPI cmd %02x/%02x:%02x:%02x:%02x:%02x:%02x (%s) %s\n",
742 tf.command, tf.feature, tf.nsect, tf.lbal, 744 tf.command, tf.feature, tf.nsect, tf.lbal,
743 tf.lbam, tf.lbah, tf.device, msg); 745 tf.lbam, tf.lbah, tf.device,
746 (descr ? descr : "unknown"), msg);
744 747
745 return rc; 748 return rc;
746} 749}
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 072ba5ea138f..df31deac5c82 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -709,7 +709,13 @@ u64 ata_tf_read_block(struct ata_taskfile *tf, struct ata_device *dev)
709 head = tf->device & 0xf; 709 head = tf->device & 0xf;
710 sect = tf->lbal; 710 sect = tf->lbal;
711 711
712 block = (cyl * dev->heads + head) * dev->sectors + sect; 712 if (!sect) {
713 ata_dev_printk(dev, KERN_WARNING, "device reported "
714 "invalid CHS sector 0\n");
715 sect = 1; /* oh well */
716 }
717
718 block = (cyl * dev->heads + head) * dev->sectors + sect - 1;
713 } 719 }
714 720
715 return block; 721 return block;
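
CHS sector numbers start at 1, which is why the fixed formula subtracts one: block = (cyl * heads + head) * sectors + sect - 1. A worked example with a hypothetical 16-head, 63-sector geometry:

#include <stdio.h>

/* CHS to linear block number; sector numbers are 1-based. */
static unsigned long long chs_to_block(unsigned long long cyl,
				       unsigned long long head,
				       unsigned long long sect,
				       unsigned long long heads,
				       unsigned long long sectors)
{
	if (!sect)
		sect = 1;	/* devices must not report sector 0 */
	return (cyl * heads + head) * sectors + sect - 1;
}

int main(void)
{
	/* hypothetical geometry: 16 heads, 63 sectors per track */
	printf("%llu\n", chs_to_block(0, 0,  1, 16, 63));	/* 0    */
	printf("%llu\n", chs_to_block(0, 0, 63, 16, 63));	/* 62   */
	printf("%llu\n", chs_to_block(0, 1,  1, 16, 63));	/* 63   */
	printf("%llu\n", chs_to_block(1, 0,  1, 16, 63));	/* 1008 */
	return 0;
}
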
@@ -2299,29 +2305,49 @@ static inline u8 ata_dev_knobble(struct ata_device *dev)
2299 return ((ap->cbl == ATA_CBL_SATA) && (!ata_id_is_sata(dev->id))); 2305 return ((ap->cbl == ATA_CBL_SATA) && (!ata_id_is_sata(dev->id)));
2300} 2306}
2301 2307
2302static void ata_dev_config_ncq(struct ata_device *dev, 2308static int ata_dev_config_ncq(struct ata_device *dev,
2303 char *desc, size_t desc_sz) 2309 char *desc, size_t desc_sz)
2304{ 2310{
2305 struct ata_port *ap = dev->link->ap; 2311 struct ata_port *ap = dev->link->ap;
2306 int hdepth = 0, ddepth = ata_id_queue_depth(dev->id); 2312 int hdepth = 0, ddepth = ata_id_queue_depth(dev->id);
2313 unsigned int err_mask;
2314 char *aa_desc = "";
2307 2315
2308 if (!ata_id_has_ncq(dev->id)) { 2316 if (!ata_id_has_ncq(dev->id)) {
2309 desc[0] = '\0'; 2317 desc[0] = '\0';
2310 return; 2318 return 0;
2311 } 2319 }
2312 if (dev->horkage & ATA_HORKAGE_NONCQ) { 2320 if (dev->horkage & ATA_HORKAGE_NONCQ) {
2313 snprintf(desc, desc_sz, "NCQ (not used)"); 2321 snprintf(desc, desc_sz, "NCQ (not used)");
2314 return; 2322 return 0;
2315 } 2323 }
2316 if (ap->flags & ATA_FLAG_NCQ) { 2324 if (ap->flags & ATA_FLAG_NCQ) {
2317 hdepth = min(ap->scsi_host->can_queue, ATA_MAX_QUEUE - 1); 2325 hdepth = min(ap->scsi_host->can_queue, ATA_MAX_QUEUE - 1);
2318 dev->flags |= ATA_DFLAG_NCQ; 2326 dev->flags |= ATA_DFLAG_NCQ;
2319 } 2327 }
2320 2328
2329 if (!(dev->horkage & ATA_HORKAGE_BROKEN_FPDMA_AA) &&
2330 (ap->flags & ATA_FLAG_FPDMA_AA) &&
2331 ata_id_has_fpdma_aa(dev->id)) {
2332 err_mask = ata_dev_set_feature(dev, SETFEATURES_SATA_ENABLE,
2333 SATA_FPDMA_AA);
2334 if (err_mask) {
 2335			ata_dev_printk(dev, KERN_ERR, "failed to enable AA "
2336 "(error_mask=0x%x)\n", err_mask);
2337 if (err_mask != AC_ERR_DEV) {
2338 dev->horkage |= ATA_HORKAGE_BROKEN_FPDMA_AA;
2339 return -EIO;
2340 }
2341 } else
2342 aa_desc = ", AA";
2343 }
2344
2321 if (hdepth >= ddepth) 2345 if (hdepth >= ddepth)
2322 snprintf(desc, desc_sz, "NCQ (depth %d)", ddepth); 2346 snprintf(desc, desc_sz, "NCQ (depth %d)%s", ddepth, aa_desc);
2323 else 2347 else
2324 snprintf(desc, desc_sz, "NCQ (depth %d/%d)", hdepth, ddepth); 2348 snprintf(desc, desc_sz, "NCQ (depth %d/%d)%s", hdepth,
2349 ddepth, aa_desc);
2350 return 0;
2325} 2351}
2326 2352
2327/** 2353/**
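
The description built here ends up in the device's boot-time info line; with the usual host depth of 31 against a device depth of 32 and AA enabled it reads "NCQ (depth 31/32), AA", which is also why ncq_desc[] grows from 20 to 24 bytes below. A small sketch of the formatting:

#include <stdio.h>

/* Build the NCQ description the way ata_dev_config_ncq() does:
 * host depth vs. device depth plus an optional ", AA" suffix. */
static void ncq_desc(char *desc, size_t sz, int hdepth, int ddepth, int aa)
{
	const char *aa_desc = aa ? ", AA" : "";

	if (hdepth >= ddepth)
		snprintf(desc, sz, "NCQ (depth %d)%s", ddepth, aa_desc);
	else
		snprintf(desc, sz, "NCQ (depth %d/%d)%s", hdepth, ddepth, aa_desc);
}

int main(void)
{
	char desc[24];

	ncq_desc(desc, sizeof(desc), 31, 32, 1);
	printf("%s\n", desc);	/* NCQ (depth 31/32), AA */
	return 0;
}
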
@@ -2461,7 +2487,7 @@ int ata_dev_configure(struct ata_device *dev)
2461 2487
2462 if (ata_id_has_lba(id)) { 2488 if (ata_id_has_lba(id)) {
2463 const char *lba_desc; 2489 const char *lba_desc;
2464 char ncq_desc[20]; 2490 char ncq_desc[24];
2465 2491
2466 lba_desc = "LBA"; 2492 lba_desc = "LBA";
2467 dev->flags |= ATA_DFLAG_LBA; 2493 dev->flags |= ATA_DFLAG_LBA;
@@ -2475,7 +2501,9 @@ int ata_dev_configure(struct ata_device *dev)
2475 } 2501 }
2476 2502
2477 /* config NCQ */ 2503 /* config NCQ */
2478 ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc)); 2504 rc = ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc));
2505 if (rc)
2506 return rc;
2479 2507
2480 /* print device info to dmesg */ 2508 /* print device info to dmesg */
2481 if (ata_msg_drv(ap) && print_info) { 2509 if (ata_msg_drv(ap) && print_info) {
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 79711b64054b..a04488f0de88 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -40,6 +40,7 @@
40#include <scsi/scsi_eh.h> 40#include <scsi/scsi_eh.h>
41#include <scsi/scsi_device.h> 41#include <scsi/scsi_device.h>
42#include <scsi/scsi_cmnd.h> 42#include <scsi/scsi_cmnd.h>
43#include <scsi/scsi_dbg.h>
43#include "../scsi/scsi_transport_api.h" 44#include "../scsi/scsi_transport_api.h"
44 45
45#include <linux/libata.h> 46#include <linux/libata.h>
@@ -999,7 +1000,9 @@ static void __ata_port_freeze(struct ata_port *ap)
999 * ata_port_freeze - abort & freeze port 1000 * ata_port_freeze - abort & freeze port
1000 * @ap: ATA port to freeze 1001 * @ap: ATA port to freeze
1001 * 1002 *
1002 * Abort and freeze @ap. 1003 * Abort and freeze @ap. The freeze operation must be called
1004 * first, because some hardware requires special operations
1005 * before the taskfile registers are accessible.
1003 * 1006 *
1004 * LOCKING: 1007 * LOCKING:
1005 * spin_lock_irqsave(host lock) 1008 * spin_lock_irqsave(host lock)
@@ -1013,8 +1016,8 @@ int ata_port_freeze(struct ata_port *ap)
1013 1016
1014 WARN_ON(!ap->ops->error_handler); 1017 WARN_ON(!ap->ops->error_handler);
1015 1018
1016 nr_aborted = ata_port_abort(ap);
1017 __ata_port_freeze(ap); 1019 __ata_port_freeze(ap);
1020 nr_aborted = ata_port_abort(ap);
1018 1021
1019 return nr_aborted; 1022 return nr_aborted;
1020} 1023}
@@ -2110,6 +2113,116 @@ void ata_eh_autopsy(struct ata_port *ap)
2110} 2113}
2111 2114
2112/** 2115/**
2116 * ata_get_cmd_descript - get description for ATA command
2117 * @command: ATA command code to get description for
2118 *
2119 * Return a textual description of the given command, or NULL if the
2120 * command is not known.
2121 *
2122 * LOCKING:
2123 * None
2124 */
2125const char *ata_get_cmd_descript(u8 command)
2126{
2127#ifdef CONFIG_ATA_VERBOSE_ERROR
2128 static const struct
2129 {
2130 u8 command;
2131 const char *text;
2132 } cmd_descr[] = {
2133 { ATA_CMD_DEV_RESET, "DEVICE RESET" },
2134 { ATA_CMD_CHK_POWER, "CHECK POWER MODE" },
2135 { ATA_CMD_STANDBY, "STANDBY" },
2136 { ATA_CMD_IDLE, "IDLE" },
2137 { ATA_CMD_EDD, "EXECUTE DEVICE DIAGNOSTIC" },
2138 { ATA_CMD_DOWNLOAD_MICRO, "DOWNLOAD MICROCODE" },
2139 { ATA_CMD_NOP, "NOP" },
2140 { ATA_CMD_FLUSH, "FLUSH CACHE" },
2141 { ATA_CMD_FLUSH_EXT, "FLUSH CACHE EXT" },
2142 { ATA_CMD_ID_ATA, "IDENTIFY DEVICE" },
2143 { ATA_CMD_ID_ATAPI, "IDENTIFY PACKET DEVICE" },
2144 { ATA_CMD_SERVICE, "SERVICE" },
2145 { ATA_CMD_READ, "READ DMA" },
2146 { ATA_CMD_READ_EXT, "READ DMA EXT" },
2147 { ATA_CMD_READ_QUEUED, "READ DMA QUEUED" },
2148 { ATA_CMD_READ_STREAM_EXT, "READ STREAM EXT" },
2149 { ATA_CMD_READ_STREAM_DMA_EXT, "READ STREAM DMA EXT" },
2150 { ATA_CMD_WRITE, "WRITE DMA" },
2151 { ATA_CMD_WRITE_EXT, "WRITE DMA EXT" },
2152 { ATA_CMD_WRITE_QUEUED, "WRITE DMA QUEUED EXT" },
2153 { ATA_CMD_WRITE_STREAM_EXT, "WRITE STREAM EXT" },
2154 { ATA_CMD_WRITE_STREAM_DMA_EXT, "WRITE STREAM DMA EXT" },
2155 { ATA_CMD_WRITE_FUA_EXT, "WRITE DMA FUA EXT" },
2156 { ATA_CMD_WRITE_QUEUED_FUA_EXT, "WRITE DMA QUEUED FUA EXT" },
2157 { ATA_CMD_FPDMA_READ, "READ FPDMA QUEUED" },
2158 { ATA_CMD_FPDMA_WRITE, "WRITE FPDMA QUEUED" },
2159 { ATA_CMD_PIO_READ, "READ SECTOR(S)" },
2160 { ATA_CMD_PIO_READ_EXT, "READ SECTOR(S) EXT" },
2161 { ATA_CMD_PIO_WRITE, "WRITE SECTOR(S)" },
2162 { ATA_CMD_PIO_WRITE_EXT, "WRITE SECTOR(S) EXT" },
2163 { ATA_CMD_READ_MULTI, "READ MULTIPLE" },
2164 { ATA_CMD_READ_MULTI_EXT, "READ MULTIPLE EXT" },
2165 { ATA_CMD_WRITE_MULTI, "WRITE MULTIPLE" },
2166 { ATA_CMD_WRITE_MULTI_EXT, "WRITE MULTIPLE EXT" },
2167 { ATA_CMD_WRITE_MULTI_FUA_EXT, "WRITE MULTIPLE FUA EXT" },
2168 { ATA_CMD_SET_FEATURES, "SET FEATURES" },
2169 { ATA_CMD_SET_MULTI, "SET MULTIPLE MODE" },
2170 { ATA_CMD_VERIFY, "READ VERIFY SECTOR(S)" },
2171 { ATA_CMD_VERIFY_EXT, "READ VERIFY SECTOR(S) EXT" },
2172 { ATA_CMD_WRITE_UNCORR_EXT, "WRITE UNCORRECTABLE EXT" },
2173 { ATA_CMD_STANDBYNOW1, "STANDBY IMMEDIATE" },
2174 { ATA_CMD_IDLEIMMEDIATE, "IDLE IMMEDIATE" },
2175 { ATA_CMD_SLEEP, "SLEEP" },
2176 { ATA_CMD_INIT_DEV_PARAMS, "INITIALIZE DEVICE PARAMETERS" },
2177 { ATA_CMD_READ_NATIVE_MAX, "READ NATIVE MAX ADDRESS" },
2178 { ATA_CMD_READ_NATIVE_MAX_EXT, "READ NATIVE MAX ADDRESS EXT" },
2179 { ATA_CMD_SET_MAX, "SET MAX ADDRESS" },
2180 { ATA_CMD_SET_MAX_EXT, "SET MAX ADDRESS EXT" },
2181 { ATA_CMD_READ_LOG_EXT, "READ LOG EXT" },
2182 { ATA_CMD_WRITE_LOG_EXT, "WRITE LOG EXT" },
2183 { ATA_CMD_READ_LOG_DMA_EXT, "READ LOG DMA EXT" },
2184 { ATA_CMD_WRITE_LOG_DMA_EXT, "WRITE LOG DMA EXT" },
2185 { ATA_CMD_TRUSTED_RCV, "TRUSTED RECEIVE" },
2186 { ATA_CMD_TRUSTED_RCV_DMA, "TRUSTED RECEIVE DMA" },
2187 { ATA_CMD_TRUSTED_SND, "TRUSTED SEND" },
2188 { ATA_CMD_TRUSTED_SND_DMA, "TRUSTED SEND DMA" },
2189 { ATA_CMD_PMP_READ, "READ BUFFER" },
2190 { ATA_CMD_PMP_WRITE, "WRITE BUFFER" },
2191 { ATA_CMD_CONF_OVERLAY, "DEVICE CONFIGURATION OVERLAY" },
2192 { ATA_CMD_SEC_SET_PASS, "SECURITY SET PASSWORD" },
2193 { ATA_CMD_SEC_UNLOCK, "SECURITY UNLOCK" },
2194 { ATA_CMD_SEC_ERASE_PREP, "SECURITY ERASE PREPARE" },
2195 { ATA_CMD_SEC_ERASE_UNIT, "SECURITY ERASE UNIT" },
2196 { ATA_CMD_SEC_FREEZE_LOCK, "SECURITY FREEZE LOCK" },
2197 { ATA_CMD_SEC_DISABLE_PASS, "SECURITY DISABLE PASSWORD" },
2198 { ATA_CMD_CONFIG_STREAM, "CONFIGURE STREAM" },
2199 { ATA_CMD_SMART, "SMART" },
2200 { ATA_CMD_MEDIA_LOCK, "DOOR LOCK" },
2201 { ATA_CMD_MEDIA_UNLOCK, "DOOR UNLOCK" },
2202 { ATA_CMD_CHK_MED_CRD_TYP, "CHECK MEDIA CARD TYPE" },
2203 { ATA_CMD_CFA_REQ_EXT_ERR, "CFA REQUEST EXTENDED ERROR" },
2204 { ATA_CMD_CFA_WRITE_NE, "CFA WRITE SECTORS WITHOUT ERASE" },
2205 { ATA_CMD_CFA_TRANS_SECT, "CFA TRANSLATE SECTOR" },
2206 { ATA_CMD_CFA_ERASE, "CFA ERASE SECTORS" },
2207 { ATA_CMD_CFA_WRITE_MULT_NE, "CFA WRITE MULTIPLE WITHOUT ERASE" },
2208 { ATA_CMD_READ_LONG, "READ LONG (with retries)" },
2209 { ATA_CMD_READ_LONG_ONCE, "READ LONG (without retries)" },
2210 { ATA_CMD_WRITE_LONG, "WRITE LONG (with retries)" },
2211 { ATA_CMD_WRITE_LONG_ONCE, "WRITE LONG (without retries)" },
2212 { ATA_CMD_RESTORE, "RECALIBRATE" },
2213 { 0, NULL } /* terminate list */
2214 };
2215
2216 unsigned int i;
2217 for (i = 0; cmd_descr[i].text; i++)
2218 if (cmd_descr[i].command == command)
2219 return cmd_descr[i].text;
2220#endif
2221
2222 return NULL;
2223}
2224
2225/**
2113 * ata_eh_link_report - report error handling to user 2226 * ata_eh_link_report - report error handling to user
2114 * @link: ATA link EH is going on 2227 * @link: ATA link EH is going on
2115 * 2228 *
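
ata_get_cmd_descript() is a NULL-terminated {opcode, text} table with a linear scan, compiled in only when CONFIG_ATA_VERBOSE_ERROR is set. The same idiom in standalone form, with a handful of entries for illustration rather than the full ATA_CMD_* list:

#include <stdio.h>

struct cmd_descr {
	unsigned char command;
	const char *text;
};

/* A few sample opcodes for illustration; the driver uses the
 * ATA_CMD_* constants from <linux/ata.h>. */
static const struct cmd_descr cmd_descr[] = {
	{ 0xc8, "READ DMA" },
	{ 0xca, "WRITE DMA" },
	{ 0xe7, "FLUSH CACHE" },
	{ 0, NULL }	/* terminate list */
};

static const char *get_cmd_descript(unsigned char command)
{
	unsigned int i;

	for (i = 0; cmd_descr[i].text; i++)
		if (cmd_descr[i].command == command)
			return cmd_descr[i].text;
	return NULL;
}

int main(void)
{
	const char *descr = get_cmd_descript(0xc8);

	printf("%s\n", descr ? descr : "unknown");	/* READ DMA */
	return 0;
}
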
@@ -2175,6 +2288,7 @@ static void ata_eh_link_report(struct ata_link *link)
2175 ata_link_printk(link, KERN_ERR, "%s\n", desc); 2288 ata_link_printk(link, KERN_ERR, "%s\n", desc);
2176 } 2289 }
2177 2290
2291#ifdef CONFIG_ATA_VERBOSE_ERROR
2178 if (ehc->i.serror) 2292 if (ehc->i.serror)
2179 ata_link_printk(link, KERN_ERR, 2293 ata_link_printk(link, KERN_ERR,
2180 "SError: { %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s}\n", 2294 "SError: { %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s}\n",
@@ -2195,6 +2309,7 @@ static void ata_eh_link_report(struct ata_link *link)
2195 ehc->i.serror & SERR_TRANS_ST_ERROR ? "TrStaTrns " : "", 2309 ehc->i.serror & SERR_TRANS_ST_ERROR ? "TrStaTrns " : "",
2196 ehc->i.serror & SERR_UNRECOG_FIS ? "UnrecFIS " : "", 2310 ehc->i.serror & SERR_UNRECOG_FIS ? "UnrecFIS " : "",
2197 ehc->i.serror & SERR_DEV_XCHG ? "DevExch " : ""); 2311 ehc->i.serror & SERR_DEV_XCHG ? "DevExch " : "");
2312#endif
2198 2313
2199 for (tag = 0; tag < ATA_MAX_QUEUE; tag++) { 2314 for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
2200 struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag); 2315 struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag);
@@ -2226,14 +2341,23 @@ static void ata_eh_link_report(struct ata_link *link)
2226 dma_str[qc->dma_dir]); 2341 dma_str[qc->dma_dir]);
2227 } 2342 }
2228 2343
2229 if (ata_is_atapi(qc->tf.protocol)) 2344 if (ata_is_atapi(qc->tf.protocol)) {
2230 snprintf(cdb_buf, sizeof(cdb_buf), 2345 if (qc->scsicmd)
2346 scsi_print_command(qc->scsicmd);
2347 else
2348 snprintf(cdb_buf, sizeof(cdb_buf),
2231 "cdb %02x %02x %02x %02x %02x %02x %02x %02x " 2349 "cdb %02x %02x %02x %02x %02x %02x %02x %02x "
2232 "%02x %02x %02x %02x %02x %02x %02x %02x\n ", 2350 "%02x %02x %02x %02x %02x %02x %02x %02x\n ",
2233 cdb[0], cdb[1], cdb[2], cdb[3], 2351 cdb[0], cdb[1], cdb[2], cdb[3],
2234 cdb[4], cdb[5], cdb[6], cdb[7], 2352 cdb[4], cdb[5], cdb[6], cdb[7],
2235 cdb[8], cdb[9], cdb[10], cdb[11], 2353 cdb[8], cdb[9], cdb[10], cdb[11],
2236 cdb[12], cdb[13], cdb[14], cdb[15]); 2354 cdb[12], cdb[13], cdb[14], cdb[15]);
2355 } else {
2356 const char *descr = ata_get_cmd_descript(cmd->command);
2357 if (descr)
2358 ata_dev_printk(qc->dev, KERN_ERR,
2359 "failed command: %s\n", descr);
2360 }
2237 2361
2238 ata_dev_printk(qc->dev, KERN_ERR, 2362 ata_dev_printk(qc->dev, KERN_ERR,
2239 "cmd %02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x " 2363 "cmd %02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x "
@@ -2252,6 +2376,7 @@ static void ata_eh_link_report(struct ata_link *link)
2252 res->device, qc->err_mask, ata_err_string(qc->err_mask), 2376 res->device, qc->err_mask, ata_err_string(qc->err_mask),
2253 qc->err_mask & AC_ERR_NCQ ? " <F>" : ""); 2377 qc->err_mask & AC_ERR_NCQ ? " <F>" : "");
2254 2378
2379#ifdef CONFIG_ATA_VERBOSE_ERROR
2255 if (res->command & (ATA_BUSY | ATA_DRDY | ATA_DF | ATA_DRQ | 2380 if (res->command & (ATA_BUSY | ATA_DRDY | ATA_DF | ATA_DRQ |
2256 ATA_ERR)) { 2381 ATA_ERR)) {
2257 if (res->command & ATA_BUSY) 2382 if (res->command & ATA_BUSY)
@@ -2275,6 +2400,7 @@ static void ata_eh_link_report(struct ata_link *link)
2275 res->feature & ATA_UNC ? "UNC " : "", 2400 res->feature & ATA_UNC ? "UNC " : "",
2276 res->feature & ATA_IDNF ? "IDNF " : "", 2401 res->feature & ATA_IDNF ? "IDNF " : "",
2277 res->feature & ATA_ABORTED ? "ABRT " : ""); 2402 res->feature & ATA_ABORTED ? "ABRT " : "");
2403#endif
2278 } 2404 }
2279} 2405}
2280 2406
@@ -2574,11 +2700,17 @@ int ata_eh_reset(struct ata_link *link, int classify,
2574 postreset(slave, classes); 2700 postreset(slave, classes);
2575 } 2701 }
2576 2702
2577 /* clear cached SError */ 2703 /*
2704 * Some controllers can't be frozen very well and may set
 2705	 * spurious error conditions during reset.  Clear accumulated
2706 * error information. As reset is the final recovery action,
2707 * nothing is lost by doing this.
2708 */
2578 spin_lock_irqsave(link->ap->lock, flags); 2709 spin_lock_irqsave(link->ap->lock, flags);
2579 link->eh_info.serror = 0; 2710 memset(&link->eh_info, 0, sizeof(link->eh_info));
2580 if (slave) 2711 if (slave)
 2581	 	slave->eh_info.serror = 0; 2712		memset(&slave->eh_info, 0, sizeof(slave->eh_info));
2713 ap->pflags &= ~ATA_PFLAG_EH_PENDING;
2582 spin_unlock_irqrestore(link->ap->lock, flags); 2714 spin_unlock_irqrestore(link->ap->lock, flags);
2583 2715
2584 /* Make sure onlineness and classification result correspond. 2716 /* Make sure onlineness and classification result correspond.
diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c
index 619f2c33950e..51f0ffb78cbd 100644
--- a/drivers/ata/libata-pmp.c
+++ b/drivers/ata/libata-pmp.c
@@ -221,6 +221,8 @@ static const char *sata_pmp_spec_rev_str(const u32 *gscr)
221{ 221{
222 u32 rev = gscr[SATA_PMP_GSCR_REV]; 222 u32 rev = gscr[SATA_PMP_GSCR_REV];
223 223
224 if (rev & (1 << 3))
225 return "1.2";
224 if (rev & (1 << 2)) 226 if (rev & (1 << 2))
225 return "1.1"; 227 return "1.1";
226 if (rev & (1 << 1)) 228 if (rev & (1 << 1))
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index d0dfeef55db5..b4ee28dec521 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1119,10 +1119,6 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
1119 1119
1120 blk_queue_dma_drain(q, atapi_drain_needed, buf, ATAPI_MAX_DRAIN); 1120 blk_queue_dma_drain(q, atapi_drain_needed, buf, ATAPI_MAX_DRAIN);
1121 } else { 1121 } else {
1122 if (ata_id_is_ssd(dev->id))
1123 queue_flag_set_unlocked(QUEUE_FLAG_NONROT,
1124 sdev->request_queue);
1125
1126 /* ATA devices must be sector aligned */ 1122 /* ATA devices must be sector aligned */
1127 blk_queue_update_dma_alignment(sdev->request_queue, 1123 blk_queue_update_dma_alignment(sdev->request_queue,
1128 ATA_SECT_SIZE - 1); 1124 ATA_SECT_SIZE - 1);
@@ -1257,23 +1253,6 @@ int ata_scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth)
1257 return queue_depth; 1253 return queue_depth;
1258} 1254}
1259 1255
1260/* XXX: for spindown warning */
1261static void ata_delayed_done_timerfn(unsigned long arg)
1262{
1263 struct scsi_cmnd *scmd = (void *)arg;
1264
1265 scmd->scsi_done(scmd);
1266}
1267
1268/* XXX: for spindown warning */
1269static void ata_delayed_done(struct scsi_cmnd *scmd)
1270{
1271 static struct timer_list timer;
1272
1273 setup_timer(&timer, ata_delayed_done_timerfn, (unsigned long)scmd);
1274 mod_timer(&timer, jiffies + 5 * HZ);
1275}
1276
1277/** 1256/**
1278 * ata_scsi_start_stop_xlat - Translate SCSI START STOP UNIT command 1257 * ata_scsi_start_stop_xlat - Translate SCSI START STOP UNIT command
1279 * @qc: Storage for translated ATA taskfile 1258 * @qc: Storage for translated ATA taskfile
@@ -1338,32 +1317,6 @@ static unsigned int ata_scsi_start_stop_xlat(struct ata_queued_cmd *qc)
1338 system_entering_hibernation()) 1317 system_entering_hibernation())
1339 goto skip; 1318 goto skip;
1340 1319
1341 /* XXX: This is for backward compatibility, will be
1342 * removed. Read Documentation/feature-removal-schedule.txt
1343 * for more info.
1344 */
1345 if ((qc->dev->flags & ATA_DFLAG_SPUNDOWN) &&
1346 (system_state == SYSTEM_HALT ||
1347 system_state == SYSTEM_POWER_OFF)) {
1348 static unsigned long warned;
1349
1350 if (!test_and_set_bit(0, &warned)) {
1351 ata_dev_printk(qc->dev, KERN_WARNING,
1352 "DISK MIGHT NOT BE SPUN DOWN PROPERLY. "
1353 "UPDATE SHUTDOWN UTILITY\n");
1354 ata_dev_printk(qc->dev, KERN_WARNING,
1355 "For more info, visit "
1356 "http://linux-ata.org/shutdown.html\n");
1357
1358 /* ->scsi_done is not used, use it for
1359 * delayed completion.
1360 */
1361 scmd->scsi_done = qc->scsidone;
1362 qc->scsidone = ata_delayed_done;
1363 }
1364 goto skip;
1365 }
1366
1367 /* Issue ATA STANDBY IMMEDIATE command */ 1320 /* Issue ATA STANDBY IMMEDIATE command */
1368 tf->command = ATA_CMD_STANDBYNOW1; 1321 tf->command = ATA_CMD_STANDBYNOW1;
1369 } 1322 }
@@ -1764,14 +1717,6 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
1764 } 1717 }
1765 } 1718 }
1766 1719
1767 /* XXX: track spindown state for spindown skipping and warning */
1768 if (unlikely(qc->tf.command == ATA_CMD_STANDBY ||
1769 qc->tf.command == ATA_CMD_STANDBYNOW1))
1770 qc->dev->flags |= ATA_DFLAG_SPUNDOWN;
1771 else if (likely(system_state != SYSTEM_HALT &&
1772 system_state != SYSTEM_POWER_OFF))
1773 qc->dev->flags &= ~ATA_DFLAG_SPUNDOWN;
1774
1775 if (need_sense && !ap->ops->error_handler) 1720 if (need_sense && !ap->ops->error_handler)
1776 ata_dump_status(ap->print_id, &qc->result_tf); 1721 ata_dump_status(ap->print_id, &qc->result_tf);
1777 1722
@@ -2815,28 +2760,6 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
2815 goto invalid_fld; 2760 goto invalid_fld;
2816 2761
2817 /* 2762 /*
2818 * Filter TPM commands by default. These provide an
2819 * essentially uncontrolled encrypted "back door" between
2820 * applications and the disk. Set libata.allow_tpm=1 if you
2821 * have a real reason for wanting to use them. This ensures
2822 * that installed software cannot easily mess stuff up without
2823 * user intent. DVR type users will probably ship with this enabled
2824 * for movie content management.
2825 *
2826 * Note that for ATA8 we can issue a DCS change and DCS freeze lock
2827 * for this and should do in future but that it is not sufficient as
2828 * DCS is an optional feature set. Thus we also do the software filter
2829 * so that we comply with the TC consortium stated goal that the user
2830 * can turn off TC features of their system.
2831 */
2832 if (tf->command >= 0x5C && tf->command <= 0x5F && !libata_allow_tpm)
2833 goto invalid_fld;
2834
2835 /* We may not issue DMA commands if no DMA mode is set */
2836 if (tf->protocol == ATA_PROT_DMA && dev->dma_mode == 0)
2837 goto invalid_fld;
2838
2839 /*
2840 * 12 and 16 byte CDBs use different offsets to 2763 * 12 and 16 byte CDBs use different offsets to
2841 * provide the various register values. 2764 * provide the various register values.
2842 */ 2765 */
@@ -2885,6 +2808,41 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
2885 tf->device = dev->devno ? 2808 tf->device = dev->devno ?
2886 tf->device | ATA_DEV1 : tf->device & ~ATA_DEV1; 2809 tf->device | ATA_DEV1 : tf->device & ~ATA_DEV1;
2887 2810
2811 /* READ/WRITE LONG use a non-standard sect_size */
2812 qc->sect_size = ATA_SECT_SIZE;
2813 switch (tf->command) {
2814 case ATA_CMD_READ_LONG:
2815 case ATA_CMD_READ_LONG_ONCE:
2816 case ATA_CMD_WRITE_LONG:
2817 case ATA_CMD_WRITE_LONG_ONCE:
2818 if (tf->protocol != ATA_PROT_PIO || tf->nsect != 1)
2819 goto invalid_fld;
2820 qc->sect_size = scsi_bufflen(scmd);
2821 }
2822
2823 /*
2824 * Set flags so that all registers will be written, pass on
2825 * write indication (used for PIO/DMA setup), result TF is
2826 * copied back and we don't whine too much about its failure.
2827 */
2828 tf->flags = ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
2829 if (scmd->sc_data_direction == DMA_TO_DEVICE)
2830 tf->flags |= ATA_TFLAG_WRITE;
2831
2832 qc->flags |= ATA_QCFLAG_RESULT_TF | ATA_QCFLAG_QUIET;
2833
2834 /*
2835 * Set transfer length.
2836 *
2837 * TODO: find out if we need to do more here to
2838 * cover scatter/gather case.
2839 */
2840 ata_qc_set_pc_nbytes(qc);
2841
2842 /* We may not issue DMA commands if no DMA mode is set */
2843 if (tf->protocol == ATA_PROT_DMA && dev->dma_mode == 0)
2844 goto invalid_fld;
2845
2888 /* sanity check for pio multi commands */ 2846 /* sanity check for pio multi commands */
2889 if ((cdb[1] & 0xe0) && !is_multi_taskfile(tf)) 2847 if ((cdb[1] & 0xe0) && !is_multi_taskfile(tf))
2890 goto invalid_fld; 2848 goto invalid_fld;
@@ -2901,18 +2859,6 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
2901 multi_count); 2859 multi_count);
2902 } 2860 }
2903 2861
2904 /* READ/WRITE LONG use a non-standard sect_size */
2905 qc->sect_size = ATA_SECT_SIZE;
2906 switch (tf->command) {
2907 case ATA_CMD_READ_LONG:
2908 case ATA_CMD_READ_LONG_ONCE:
2909 case ATA_CMD_WRITE_LONG:
2910 case ATA_CMD_WRITE_LONG_ONCE:
2911 if (tf->protocol != ATA_PROT_PIO || tf->nsect != 1)
2912 goto invalid_fld;
2913 qc->sect_size = scsi_bufflen(scmd);
2914 }
2915
2916 /* 2862 /*
2917 * Filter SET_FEATURES - XFER MODE command -- otherwise, 2863 * Filter SET_FEATURES - XFER MODE command -- otherwise,
2918 * SET_FEATURES - XFER MODE must be preceded/succeeded 2864 * SET_FEATURES - XFER MODE must be preceded/succeeded
@@ -2920,30 +2866,27 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
2920 * controller (i.e. the reason for ->set_piomode(), 2866 * controller (i.e. the reason for ->set_piomode(),
2921 * ->set_dmamode(), and ->post_set_mode() hooks). 2867 * ->set_dmamode(), and ->post_set_mode() hooks).
2922 */ 2868 */
2923 if ((tf->command == ATA_CMD_SET_FEATURES) 2869 if (tf->command == ATA_CMD_SET_FEATURES &&
2924 && (tf->feature == SETFEATURES_XFER)) 2870 tf->feature == SETFEATURES_XFER)
2925 goto invalid_fld; 2871 goto invalid_fld;
2926 2872
2927 /* 2873 /*
2928 * Set flags so that all registers will be written, 2874 * Filter TPM commands by default. These provide an
2929 * and pass on write indication (used for PIO/DMA 2875 * essentially uncontrolled encrypted "back door" between
2930 * setup.) 2876 * applications and the disk. Set libata.allow_tpm=1 if you
2931 */ 2877 * have a real reason for wanting to use them. This ensures
2932 tf->flags |= (ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE); 2878 * that installed software cannot easily mess stuff up without
2933 2879 * user intent. DVR type users will probably ship with this enabled
2934 if (scmd->sc_data_direction == DMA_TO_DEVICE) 2880 * for movie content management.
2935 tf->flags |= ATA_TFLAG_WRITE;
2936
2937 /*
2938 * Set transfer length.
2939 * 2881 *
2940 * TODO: find out if we need to do more here to 2882 * Note that for ATA8 we can issue a DCS change and DCS freeze lock
2941 * cover scatter/gather case. 2883 * for this and should do in future but that it is not sufficient as
2884 * DCS is an optional feature set. Thus we also do the software filter
2885 * so that we comply with the TC consortium stated goal that the user
2886 * can turn off TC features of their system.
2942 */ 2887 */
2943 ata_qc_set_pc_nbytes(qc); 2888 if (tf->command >= 0x5C && tf->command <= 0x5F && !libata_allow_tpm)
2944 2889 goto invalid_fld;
2945 /* request result TF and be quiet about device error */
2946 qc->flags |= ATA_QCFLAG_RESULT_TF | ATA_QCFLAG_QUIET;
2947 2890
2948 return 0; 2891 return 0;
2949 2892
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 89a1e0018e71..be8e2628f82c 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -164,6 +164,7 @@ extern void ata_eh_about_to_do(struct ata_link *link, struct ata_device *dev,
164extern void ata_eh_done(struct ata_link *link, struct ata_device *dev, 164extern void ata_eh_done(struct ata_link *link, struct ata_device *dev,
165 unsigned int action); 165 unsigned int action);
166extern void ata_eh_autopsy(struct ata_port *ap); 166extern void ata_eh_autopsy(struct ata_port *ap);
167const char *ata_get_cmd_descript(u8 command);
167extern void ata_eh_report(struct ata_port *ap); 168extern void ata_eh_report(struct ata_port *ap);
168extern int ata_eh_reset(struct ata_link *link, int classify, 169extern int ata_eh_reset(struct ata_link *link, int classify,
169 ata_prereset_fn_t prereset, ata_reset_fn_t softreset, 170 ata_prereset_fn_t prereset, ata_reset_fn_t softreset,
diff --git a/drivers/ata/pata_atiixp.c b/drivers/ata/pata_atiixp.c
index 45915566e4e9..aa4b3f6ae771 100644
--- a/drivers/ata/pata_atiixp.c
+++ b/drivers/ata/pata_atiixp.c
@@ -246,6 +246,7 @@ static const struct pci_device_id atiixp[] = {
246 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP400_IDE), }, 246 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP400_IDE), },
247 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP600_IDE), }, 247 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP600_IDE), },
248 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP700_IDE), }, 248 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP700_IDE), },
249 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_SB900_IDE), },
249 250
250 { }, 251 { },
251}; 252};
diff --git a/drivers/ata/pata_cs5535.c b/drivers/ata/pata_cs5535.c
index d33aa28239a9..403f56165cec 100644
--- a/drivers/ata/pata_cs5535.c
+++ b/drivers/ata/pata_cs5535.c
@@ -202,7 +202,8 @@ static int cs5535_init_one(struct pci_dev *dev, const struct pci_device_id *id)
202} 202}
203 203
204static const struct pci_device_id cs5535[] = { 204static const struct pci_device_id cs5535[] = {
205 { PCI_VDEVICE(NS, 0x002D), }, 205 { PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_CS5535_IDE), },
206 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5535_IDE), },
206 207
207 { }, 208 { },
208}; 209};
diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c
index abdd19fe990a..d6f69561dc86 100644
--- a/drivers/ata/pata_octeon_cf.c
+++ b/drivers/ata/pata_octeon_cf.c
@@ -213,7 +213,7 @@ static void octeon_cf_set_dmamode(struct ata_port *ap, struct ata_device *dev)
213 * This is tI, C.F. spec. says 0, but Sony CF card requires 213 * This is tI, C.F. spec. says 0, but Sony CF card requires
214 * more, we use 20 nS. 214 * more, we use 20 nS.
215 */ 215 */
216 dma_tim.s.dmack_s = ns_to_tim_reg(tim_mult, 20);; 216 dma_tim.s.dmack_s = ns_to_tim_reg(tim_mult, 20);
217 dma_tim.s.dmack_h = ns_to_tim_reg(tim_mult, dma_ackh); 217 dma_tim.s.dmack_h = ns_to_tim_reg(tim_mult, dma_ackh);
218 218
219 dma_tim.s.dmarq = dma_arq; 219 dma_tim.s.dmarq = dma_arq;
@@ -841,7 +841,7 @@ static int __devinit octeon_cf_probe(struct platform_device *pdev)
841 ocd = pdev->dev.platform_data; 841 ocd = pdev->dev.platform_data;
842 842
843 cs0 = devm_ioremap_nocache(&pdev->dev, res_cs0->start, 843 cs0 = devm_ioremap_nocache(&pdev->dev, res_cs0->start,
844 res_cs0->end - res_cs0->start + 1); 844 resource_size(res_cs0));
845 845
846 if (!cs0) 846 if (!cs0)
847 return -ENOMEM; 847 return -ENOMEM;
diff --git a/drivers/ata/pata_platform.c b/drivers/ata/pata_platform.c
index d8d743af3225..3f6ebc6c665a 100644
--- a/drivers/ata/pata_platform.c
+++ b/drivers/ata/pata_platform.c
@@ -151,14 +151,14 @@ int __devinit __pata_platform_probe(struct device *dev,
151 */ 151 */
152 if (mmio) { 152 if (mmio) {
153 ap->ioaddr.cmd_addr = devm_ioremap(dev, io_res->start, 153 ap->ioaddr.cmd_addr = devm_ioremap(dev, io_res->start,
154 io_res->end - io_res->start + 1); 154 resource_size(io_res));
155 ap->ioaddr.ctl_addr = devm_ioremap(dev, ctl_res->start, 155 ap->ioaddr.ctl_addr = devm_ioremap(dev, ctl_res->start,
156 ctl_res->end - ctl_res->start + 1); 156 resource_size(ctl_res));
157 } else { 157 } else {
158 ap->ioaddr.cmd_addr = devm_ioport_map(dev, io_res->start, 158 ap->ioaddr.cmd_addr = devm_ioport_map(dev, io_res->start,
159 io_res->end - io_res->start + 1); 159 resource_size(io_res));
160 ap->ioaddr.ctl_addr = devm_ioport_map(dev, ctl_res->start, 160 ap->ioaddr.ctl_addr = devm_ioport_map(dev, ctl_res->start,
161 ctl_res->end - ctl_res->start + 1); 161 resource_size(ctl_res));
162 } 162 }
163 if (!ap->ioaddr.cmd_addr || !ap->ioaddr.ctl_addr) { 163 if (!ap->ioaddr.cmd_addr || !ap->ioaddr.ctl_addr) {
164 dev_err(dev, "failed to map IO/CTL base\n"); 164 dev_err(dev, "failed to map IO/CTL base\n");
diff --git a/drivers/ata/pata_rb532_cf.c b/drivers/ata/pata_rb532_cf.c
index 8e3cdef8a25f..45f1e10f917b 100644
--- a/drivers/ata/pata_rb532_cf.c
+++ b/drivers/ata/pata_rb532_cf.c
@@ -151,7 +151,7 @@ static __devinit int rb532_pata_driver_probe(struct platform_device *pdev)
151 info->irq = irq; 151 info->irq = irq;
152 152
153 info->iobase = devm_ioremap_nocache(&pdev->dev, res->start, 153 info->iobase = devm_ioremap_nocache(&pdev->dev, res->start,
154 res->end - res->start + 1); 154 resource_size(res));
155 if (!info->iobase) 155 if (!info->iobase)
156 return -ENOMEM; 156 return -ENOMEM;
157 157
diff --git a/drivers/ata/pata_rdc.c b/drivers/ata/pata_rdc.c
new file mode 100644
index 000000000000..c843a1e07c4f
--- /dev/null
+++ b/drivers/ata/pata_rdc.c
@@ -0,0 +1,400 @@
1/*
2 * pata_rdc - Driver for later RDC PATA controllers
3 *
4 * This is actually a driver for hardware meeting
5 * INCITS 370-2004 (1510D): ATA Host Adapter Standards
6 *
7 * Based on ata_piix.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2, or (at your option)
12 * any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; see the file COPYING. If not, write to
21 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22 */
23
24#include <linux/kernel.h>
25#include <linux/module.h>
26#include <linux/pci.h>
27#include <linux/init.h>
28#include <linux/blkdev.h>
29#include <linux/delay.h>
30#include <linux/device.h>
31#include <scsi/scsi_host.h>
32#include <linux/libata.h>
33#include <linux/dmi.h>
34
35#define DRV_NAME "pata_rdc"
36#define DRV_VERSION "0.01"
37
38struct rdc_host_priv {
39 u32 saved_iocfg;
40};
41
42/**
43 * rdc_pata_cable_detect - Probe host controller cable detect info
44 * @ap: Port for which cable detect info is desired
45 *
46 * Read 80c cable indicator from ATA PCI device's PCI config
47 * register. This register is normally set by firmware (BIOS).
48 *
49 * LOCKING:
50 * None (inherited from caller).
51 */
52
53static int rdc_pata_cable_detect(struct ata_port *ap)
54{
55 struct rdc_host_priv *hpriv = ap->host->private_data;
56 u8 mask;
57
58 /* check BIOS cable detect results */
59 mask = 0x30 << (2 * ap->port_no);
60 if ((hpriv->saved_iocfg & mask) == 0)
61 return ATA_CBL_PATA40;
62 return ATA_CBL_PATA80;
63}
64
65/**
66 * rdc_pata_prereset - prereset for PATA host controller
67 * @link: Target link
68 * @deadline: deadline jiffies for the operation
69 *
70 * LOCKING:
71 * None (inherited from caller).
72 */
73static int rdc_pata_prereset(struct ata_link *link, unsigned long deadline)
74{
75 struct ata_port *ap = link->ap;
76 struct pci_dev *pdev = to_pci_dev(ap->host->dev);
77
78 static const struct pci_bits rdc_enable_bits[] = {
79 { 0x41U, 1U, 0x80UL, 0x80UL }, /* port 0 */
80 { 0x43U, 1U, 0x80UL, 0x80UL }, /* port 1 */
81 };
82
83 if (!pci_test_config_bits(pdev, &rdc_enable_bits[ap->port_no]))
84 return -ENOENT;
85 return ata_sff_prereset(link, deadline);
86}
87
88/**
89 * rdc_set_piomode - Initialize host controller PATA PIO timings
90 * @ap: Port whose timings we are configuring
 91 * @adev: Drive in question
92 *
93 * Set PIO mode for device, in host controller PCI config space.
94 *
95 * LOCKING:
96 * None (inherited from caller).
97 */
98
99static void rdc_set_piomode(struct ata_port *ap, struct ata_device *adev)
100{
101 unsigned int pio = adev->pio_mode - XFER_PIO_0;
102 struct pci_dev *dev = to_pci_dev(ap->host->dev);
103 unsigned int is_slave = (adev->devno != 0);
 104	unsigned int master_port = ap->port_no ? 0x42 : 0x40;
105 unsigned int slave_port = 0x44;
106 u16 master_data;
107 u8 slave_data;
108 u8 udma_enable;
109 int control = 0;
110
111 static const /* ISP RTC */
112 u8 timings[][2] = { { 0, 0 },
113 { 0, 0 },
114 { 1, 0 },
115 { 2, 1 },
116 { 2, 3 }, };
117
118 if (pio >= 2)
119 control |= 1; /* TIME1 enable */
120 if (ata_pio_need_iordy(adev))
121 control |= 2; /* IE enable */
122
123 if (adev->class == ATA_DEV_ATA)
124 control |= 4; /* PPE enable */
125
126 /* PIO configuration clears DTE unconditionally. It will be
127 * programmed in set_dmamode which is guaranteed to be called
128 * after set_piomode if any DMA mode is available.
129 */
130 pci_read_config_word(dev, master_port, &master_data);
131 if (is_slave) {
132 /* clear TIME1|IE1|PPE1|DTE1 */
133 master_data &= 0xff0f;
134 /* Enable SITRE (separate slave timing register) */
135 master_data |= 0x4000;
136 /* enable PPE1, IE1 and TIME1 as needed */
137 master_data |= (control << 4);
138 pci_read_config_byte(dev, slave_port, &slave_data);
139 slave_data &= (ap->port_no ? 0x0f : 0xf0);
140 /* Load the timing nibble for this slave */
141 slave_data |= ((timings[pio][0] << 2) | timings[pio][1])
142 << (ap->port_no ? 4 : 0);
143 } else {
144 /* clear ISP|RCT|TIME0|IE0|PPE0|DTE0 */
145 master_data &= 0xccf0;
146 /* Enable PPE, IE and TIME as appropriate */
147 master_data |= control;
148 /* load ISP and RCT */
149 master_data |=
150 (timings[pio][0] << 12) |
151 (timings[pio][1] << 8);
152 }
153 pci_write_config_word(dev, master_port, master_data);
154 if (is_slave)
155 pci_write_config_byte(dev, slave_port, slave_data);
156
157 /* Ensure the UDMA bit is off - it will be turned back on if
158 UDMA is selected */
159
160 pci_read_config_byte(dev, 0x48, &udma_enable);
161 udma_enable &= ~(1 << (2 * ap->port_no + adev->devno));
162 pci_write_config_byte(dev, 0x48, udma_enable);
163}
164
165/**
 166 * rdc_set_dmamode - Initialize host controller PATA DMA timings
167 * @ap: Port whose timings we are configuring
168 * @adev: Drive in question
169 *
170 * Set UDMA mode for device, in host controller PCI config space.
171 *
172 * LOCKING:
173 * None (inherited from caller).
174 */
175
176static void rdc_set_dmamode(struct ata_port *ap, struct ata_device *adev)
177{
178 struct pci_dev *dev = to_pci_dev(ap->host->dev);
179 u8 master_port = ap->port_no ? 0x42 : 0x40;
180 u16 master_data;
181 u8 speed = adev->dma_mode;
182 int devid = adev->devno + 2 * ap->port_no;
183 u8 udma_enable = 0;
184
185 static const /* ISP RTC */
186 u8 timings[][2] = { { 0, 0 },
187 { 0, 0 },
188 { 1, 0 },
189 { 2, 1 },
190 { 2, 3 }, };
191
192 pci_read_config_word(dev, master_port, &master_data);
193 pci_read_config_byte(dev, 0x48, &udma_enable);
194
195 if (speed >= XFER_UDMA_0) {
196 unsigned int udma = adev->dma_mode - XFER_UDMA_0;
197 u16 udma_timing;
198 u16 ideconf;
199 int u_clock, u_speed;
200
201 /*
202 * UDMA is handled by a combination of clock switching and
203 * selection of dividers
204 *
205 * Handy rule: Odd modes are UDMATIMx 01, even are 02
206 * except UDMA0 which is 00
207 */
208 u_speed = min(2 - (udma & 1), udma);
209 if (udma == 5)
210 u_clock = 0x1000; /* 100Mhz */
211 else if (udma > 2)
212 u_clock = 1; /* 66Mhz */
213 else
214 u_clock = 0; /* 33Mhz */
215
216 udma_enable |= (1 << devid);
217
218 /* Load the CT/RP selection */
219 pci_read_config_word(dev, 0x4A, &udma_timing);
220 udma_timing &= ~(3 << (4 * devid));
221 udma_timing |= u_speed << (4 * devid);
222 pci_write_config_word(dev, 0x4A, udma_timing);
223
224 /* Select a 33/66/100Mhz clock */
225 pci_read_config_word(dev, 0x54, &ideconf);
226 ideconf &= ~(0x1001 << devid);
227 ideconf |= u_clock << devid;
228 pci_write_config_word(dev, 0x54, ideconf);
229 } else {
230 /*
231 * MWDMA is driven by the PIO timings. We must also enable
232 * IORDY unconditionally along with TIME1. PPE has already
233 * been set when the PIO timing was set.
234 */
235 unsigned int mwdma = adev->dma_mode - XFER_MW_DMA_0;
236 unsigned int control;
237 u8 slave_data;
238 const unsigned int needed_pio[3] = {
239 XFER_PIO_0, XFER_PIO_3, XFER_PIO_4
240 };
241 int pio = needed_pio[mwdma] - XFER_PIO_0;
242
243 control = 3; /* IORDY|TIME1 */
244
245 /* If the drive MWDMA is faster than it can do PIO then
246 we must force PIO into PIO0 */
247
248 if (adev->pio_mode < needed_pio[mwdma])
249 /* Enable DMA timing only */
250 control |= 8; /* PIO cycles in PIO0 */
251
252 if (adev->devno) { /* Slave */
253 master_data &= 0xFF4F; /* Mask out IORDY|TIME1|DMAONLY */
254 master_data |= control << 4;
255 pci_read_config_byte(dev, 0x44, &slave_data);
256 slave_data &= (ap->port_no ? 0x0f : 0xf0);
257 /* Load the matching timing */
258 slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) << (ap->port_no ? 4 : 0);
259 pci_write_config_byte(dev, 0x44, slave_data);
260 } else { /* Master */
261 master_data &= 0xCCF4; /* Mask out IORDY|TIME1|DMAONLY
262 and master timing bits */
263 master_data |= control;
264 master_data |=
265 (timings[pio][0] << 12) |
266 (timings[pio][1] << 8);
267 }
268
269 udma_enable &= ~(1 << devid);
270 pci_write_config_word(dev, master_port, master_data);
271 }
272 pci_write_config_byte(dev, 0x48, udma_enable);
273}
274
275static struct ata_port_operations rdc_pata_ops = {
276 .inherits = &ata_bmdma32_port_ops,
277 .cable_detect = rdc_pata_cable_detect,
278 .set_piomode = rdc_set_piomode,
279 .set_dmamode = rdc_set_dmamode,
280 .prereset = rdc_pata_prereset,
281};
282
283static struct ata_port_info rdc_port_info = {
284
285 .flags = ATA_FLAG_SLAVE_POSS,
286 .pio_mask = ATA_PIO4,
287 .mwdma_mask = ATA_MWDMA2,
288 .udma_mask = ATA_UDMA5,
289 .port_ops = &rdc_pata_ops,
290};
291
292static struct scsi_host_template rdc_sht = {
293 ATA_BMDMA_SHT(DRV_NAME),
294};
295
296/**
 297 * rdc_init_one - Register RDC ATA PCI device with kernel services
298 * @pdev: PCI device to register
299 * @ent: Entry in rdc_pci_tbl matching with @pdev
300 *
 301 * Called from kernel PCI layer.  We set up the controller and hand
 302 * over control to libata, for it to do the rest.
303 *
304 * LOCKING:
305 * Inherited from PCI layer (may sleep).
306 *
307 * RETURNS:
308 * Zero on success, or -ERRNO value.
309 */
310
311static int __devinit rdc_init_one(struct pci_dev *pdev,
312 const struct pci_device_id *ent)
313{
314 static int printed_version;
315 struct device *dev = &pdev->dev;
316 struct ata_port_info port_info[2];
317 const struct ata_port_info *ppi[] = { &port_info[0], &port_info[1] };
318 unsigned long port_flags;
319 struct ata_host *host;
320 struct rdc_host_priv *hpriv;
321 int rc;
322
323 if (!printed_version++)
324 dev_printk(KERN_DEBUG, &pdev->dev,
325 "version " DRV_VERSION "\n");
326
327 port_info[0] = rdc_port_info;
328 port_info[1] = rdc_port_info;
329
330 port_flags = port_info[0].flags;
331
332 /* enable device and prepare host */
333 rc = pcim_enable_device(pdev);
334 if (rc)
335 return rc;
336
337 hpriv = devm_kzalloc(dev, sizeof(*hpriv), GFP_KERNEL);
338 if (!hpriv)
339 return -ENOMEM;
340
341 /* Save IOCFG, this will be used for cable detection, quirk
342 * detection and restoration on detach.
343 */
344 pci_read_config_dword(pdev, 0x54, &hpriv->saved_iocfg);
345
346 rc = ata_pci_sff_prepare_host(pdev, ppi, &host);
347 if (rc)
348 return rc;
349 host->private_data = hpriv;
350
351 pci_intx(pdev, 1);
352
353 host->flags |= ATA_HOST_PARALLEL_SCAN;
354
355 pci_set_master(pdev);
356 return ata_pci_sff_activate_host(host, ata_sff_interrupt, &rdc_sht);
357}
358
359static void rdc_remove_one(struct pci_dev *pdev)
360{
361 struct ata_host *host = dev_get_drvdata(&pdev->dev);
362 struct rdc_host_priv *hpriv = host->private_data;
363
364 pci_write_config_dword(pdev, 0x54, hpriv->saved_iocfg);
365
366 ata_pci_remove_one(pdev);
367}
368
369static const struct pci_device_id rdc_pci_tbl[] = {
370 { PCI_DEVICE(0x17F3, 0x1011), },
371 { PCI_DEVICE(0x17F3, 0x1012), },
372 { } /* terminate list */
373};
374
375static struct pci_driver rdc_pci_driver = {
376 .name = DRV_NAME,
377 .id_table = rdc_pci_tbl,
378 .probe = rdc_init_one,
379 .remove = rdc_remove_one,
380};
381
382
383static int __init rdc_init(void)
384{
385 return pci_register_driver(&rdc_pci_driver);
386}
387
388static void __exit rdc_exit(void)
389{
390 pci_unregister_driver(&rdc_pci_driver);
391}
392
393module_init(rdc_init);
394module_exit(rdc_exit);
395
396MODULE_AUTHOR("Alan Cox (based on ata_piix)");
397MODULE_DESCRIPTION("SCSI low-level driver for RDC PATA controllers");
398MODULE_LICENSE("GPL");
399MODULE_DEVICE_TABLE(pci, rdc_pci_tbl);
400MODULE_VERSION(DRV_VERSION);
diff --git a/drivers/ata/pata_rz1000.c b/drivers/ata/pata_rz1000.c
index 0c574c065c62..a5e4dfe60b41 100644
--- a/drivers/ata/pata_rz1000.c
+++ b/drivers/ata/pata_rz1000.c
@@ -85,7 +85,6 @@ static int rz1000_fifo_disable(struct pci_dev *pdev)
85 85
86static int rz1000_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) 86static int rz1000_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
87{ 87{
88 static int printed_version;
89 static const struct ata_port_info info = { 88 static const struct ata_port_info info = {
90 .flags = ATA_FLAG_SLAVE_POSS, 89 .flags = ATA_FLAG_SLAVE_POSS,
91 .pio_mask = ATA_PIO4, 90 .pio_mask = ATA_PIO4,
@@ -93,8 +92,7 @@ static int rz1000_init_one (struct pci_dev *pdev, const struct pci_device_id *en
93 }; 92 };
94 const struct ata_port_info *ppi[] = { &info, NULL }; 93 const struct ata_port_info *ppi[] = { &info, NULL };
95 94
96 if (!printed_version++) 95 printk_once(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
97 printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
98 96
99 if (rz1000_fifo_disable(pdev) == 0) 97 if (rz1000_fifo_disable(pdev) == 0)
100 return ata_pci_sff_init_one(pdev, ppi, &rz1000_sht, NULL); 98 return ata_pci_sff_init_one(pdev, ppi, &rz1000_sht, NULL);
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index 94eaa432c40a..d344db42a002 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -1257,6 +1257,7 @@ static struct scsi_host_template sata_fsl_sht = {
1257static struct ata_port_operations sata_fsl_ops = { 1257static struct ata_port_operations sata_fsl_ops = {
1258 .inherits = &sata_pmp_port_ops, 1258 .inherits = &sata_pmp_port_ops,
1259 1259
1260 .qc_defer = ata_std_qc_defer,
1260 .qc_prep = sata_fsl_qc_prep, 1261 .qc_prep = sata_fsl_qc_prep,
1261 .qc_issue = sata_fsl_qc_issue, 1262 .qc_issue = sata_fsl_qc_issue,
1262 .qc_fill_rtf = sata_fsl_qc_fill_rtf, 1263 .qc_fill_rtf = sata_fsl_qc_fill_rtf,
diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c
index 8d890cc5a7ee..4406902b4293 100644
--- a/drivers/ata/sata_inic162x.c
+++ b/drivers/ata/sata_inic162x.c
@@ -405,7 +405,7 @@ static irqreturn_t inic_interrupt(int irq, void *dev_instance)
405 struct ata_host *host = dev_instance; 405 struct ata_host *host = dev_instance;
406 struct inic_host_priv *hpriv = host->private_data; 406 struct inic_host_priv *hpriv = host->private_data;
407 u16 host_irq_stat; 407 u16 host_irq_stat;
408 int i, handled = 0;; 408 int i, handled = 0;
409 409
410 host_irq_stat = readw(hpriv->mmio_base + HOST_IRQ_STAT); 410 host_irq_stat = readw(hpriv->mmio_base + HOST_IRQ_STAT);
411 411
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index c19417e02208..17f9ff9067a2 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -4013,7 +4013,7 @@ static int mv_platform_probe(struct platform_device *pdev)
4013 4013
4014 host->iomap = NULL; 4014 host->iomap = NULL;
4015 hpriv->base = devm_ioremap(&pdev->dev, res->start, 4015 hpriv->base = devm_ioremap(&pdev->dev, res->start,
4016 res->end - res->start + 1); 4016 resource_size(res));
4017 hpriv->base -= SATAHC0_REG_BASE; 4017 hpriv->base -= SATAHC0_REG_BASE;
4018 4018
4019 /* 4019 /*
diff --git a/drivers/ata/sata_sil.c b/drivers/ata/sata_sil.c
index 35bd5cc7f285..3cb69d5fb817 100644
--- a/drivers/ata/sata_sil.c
+++ b/drivers/ata/sata_sil.c
@@ -565,6 +565,19 @@ static void sil_freeze(struct ata_port *ap)
565 tmp |= SIL_MASK_IDE0_INT << ap->port_no; 565 tmp |= SIL_MASK_IDE0_INT << ap->port_no;
566 writel(tmp, mmio_base + SIL_SYSCFG); 566 writel(tmp, mmio_base + SIL_SYSCFG);
567 readl(mmio_base + SIL_SYSCFG); /* flush */ 567 readl(mmio_base + SIL_SYSCFG); /* flush */
568
569 /* Ensure DMA_ENABLE is off.
570 *
571 * This is because the controller will not give us access to the
572 * taskfile registers while a DMA is in progress
573 */
574 iowrite8(ioread8(ap->ioaddr.bmdma_addr) & ~SIL_DMA_ENABLE,
575 ap->ioaddr.bmdma_addr);
576
577 /* According to ata_bmdma_stop, an HDMA transition requires
 578 * one PIO cycle. But we can't read a taskfile register.
579 */
580 ioread8(ap->ioaddr.bmdma_addr);
568} 581}
569 582
570static void sil_thaw(struct ata_port *ap) 583static void sil_thaw(struct ata_port *ap)
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index 77aa8d7ecec4..e6946fc527d0 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -846,6 +846,17 @@ static void sil24_qc_prep(struct ata_queued_cmd *qc)
846 if (!ata_is_atapi(qc->tf.protocol)) { 846 if (!ata_is_atapi(qc->tf.protocol)) {
847 prb = &cb->ata.prb; 847 prb = &cb->ata.prb;
848 sge = cb->ata.sge; 848 sge = cb->ata.sge;
849 if (ata_is_data(qc->tf.protocol)) {
850 u16 prot = 0;
851 ctrl = PRB_CTRL_PROTOCOL;
852 if (ata_is_ncq(qc->tf.protocol))
853 prot |= PRB_PROT_NCQ;
854 if (qc->tf.flags & ATA_TFLAG_WRITE)
855 prot |= PRB_PROT_WRITE;
856 else
857 prot |= PRB_PROT_READ;
858 prb->prot = cpu_to_le16(prot);
859 }
849 } else { 860 } else {
850 prb = &cb->atapi.prb; 861 prb = &cb->atapi.prb;
851 sge = cb->atapi.sge; 862 sge = cb->atapi.sge;
diff --git a/drivers/ata/sata_sis.c b/drivers/ata/sata_sis.c
index 8f9833228619..f8a91bfd66a8 100644
--- a/drivers/ata/sata_sis.c
+++ b/drivers/ata/sata_sis.c
@@ -109,8 +109,9 @@ MODULE_LICENSE("GPL");
109MODULE_DEVICE_TABLE(pci, sis_pci_tbl); 109MODULE_DEVICE_TABLE(pci, sis_pci_tbl);
110MODULE_VERSION(DRV_VERSION); 110MODULE_VERSION(DRV_VERSION);
111 111
112static unsigned int get_scr_cfg_addr(struct ata_port *ap, unsigned int sc_reg) 112static unsigned int get_scr_cfg_addr(struct ata_link *link, unsigned int sc_reg)
113{ 113{
114 struct ata_port *ap = link->ap;
114 struct pci_dev *pdev = to_pci_dev(ap->host->dev); 115 struct pci_dev *pdev = to_pci_dev(ap->host->dev);
115 unsigned int addr = SIS_SCR_BASE + (4 * sc_reg); 116 unsigned int addr = SIS_SCR_BASE + (4 * sc_reg);
116 u8 pmr; 117 u8 pmr;
@@ -131,6 +132,9 @@ static unsigned int get_scr_cfg_addr(struct ata_port *ap, unsigned int sc_reg)
131 break; 132 break;
132 } 133 }
133 } 134 }
135 if (link->pmp)
136 addr += 0x10;
137
134 return addr; 138 return addr;
135} 139}
136 140
@@ -138,24 +142,12 @@ static u32 sis_scr_cfg_read(struct ata_link *link,
138 unsigned int sc_reg, u32 *val) 142 unsigned int sc_reg, u32 *val)
139{ 143{
140 struct pci_dev *pdev = to_pci_dev(link->ap->host->dev); 144 struct pci_dev *pdev = to_pci_dev(link->ap->host->dev);
141 unsigned int cfg_addr = get_scr_cfg_addr(link->ap, sc_reg); 145 unsigned int cfg_addr = get_scr_cfg_addr(link, sc_reg);
142 u32 val2 = 0;
143 u8 pmr;
144 146
145 if (sc_reg == SCR_ERROR) /* doesn't exist in PCI cfg space */ 147 if (sc_reg == SCR_ERROR) /* doesn't exist in PCI cfg space */
146 return -EINVAL; 148 return -EINVAL;
147 149
148 pci_read_config_byte(pdev, SIS_PMR, &pmr);
149
150 pci_read_config_dword(pdev, cfg_addr, val); 150 pci_read_config_dword(pdev, cfg_addr, val);
151
152 if ((pdev->device == 0x0182) || (pdev->device == 0x0183) ||
153 (pdev->device == 0x1182) || (pmr & SIS_PMR_COMBINED))
154 pci_read_config_dword(pdev, cfg_addr+0x10, &val2);
155
156 *val |= val2;
157 *val &= 0xfffffffb; /* avoid problems with powerdowned ports */
158
159 return 0; 151 return 0;
160} 152}
161 153
@@ -163,28 +155,16 @@ static int sis_scr_cfg_write(struct ata_link *link,
163 unsigned int sc_reg, u32 val) 155 unsigned int sc_reg, u32 val)
164{ 156{
165 struct pci_dev *pdev = to_pci_dev(link->ap->host->dev); 157 struct pci_dev *pdev = to_pci_dev(link->ap->host->dev);
166 unsigned int cfg_addr = get_scr_cfg_addr(link->ap, sc_reg); 158 unsigned int cfg_addr = get_scr_cfg_addr(link, sc_reg);
167 u8 pmr;
168
169 if (sc_reg == SCR_ERROR) /* doesn't exist in PCI cfg space */
170 return -EINVAL;
171
172 pci_read_config_byte(pdev, SIS_PMR, &pmr);
173 159
174 pci_write_config_dword(pdev, cfg_addr, val); 160 pci_write_config_dword(pdev, cfg_addr, val);
175
176 if ((pdev->device == 0x0182) || (pdev->device == 0x0183) ||
177 (pdev->device == 0x1182) || (pmr & SIS_PMR_COMBINED))
178 pci_write_config_dword(pdev, cfg_addr+0x10, val);
179
180 return 0; 161 return 0;
181} 162}
182 163
183static int sis_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val) 164static int sis_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val)
184{ 165{
185 struct ata_port *ap = link->ap; 166 struct ata_port *ap = link->ap;
186 struct pci_dev *pdev = to_pci_dev(ap->host->dev); 167 void __iomem *base = ap->ioaddr.scr_addr + link->pmp * 0x10;
187 u8 pmr;
188 168
189 if (sc_reg > SCR_CONTROL) 169 if (sc_reg > SCR_CONTROL)
190 return -EINVAL; 170 return -EINVAL;
@@ -192,39 +172,23 @@ static int sis_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val)
192 if (ap->flags & SIS_FLAG_CFGSCR) 172 if (ap->flags & SIS_FLAG_CFGSCR)
193 return sis_scr_cfg_read(link, sc_reg, val); 173 return sis_scr_cfg_read(link, sc_reg, val);
194 174
195 pci_read_config_byte(pdev, SIS_PMR, &pmr); 175 *val = ioread32(base + sc_reg * 4);
196
197 *val = ioread32(ap->ioaddr.scr_addr + (sc_reg * 4));
198
199 if ((pdev->device == 0x0182) || (pdev->device == 0x0183) ||
200 (pdev->device == 0x1182) || (pmr & SIS_PMR_COMBINED))
201 *val |= ioread32(ap->ioaddr.scr_addr + (sc_reg * 4) + 0x10);
202
203 *val &= 0xfffffffb;
204
205 return 0; 176 return 0;
206} 177}
207 178
208static int sis_scr_write(struct ata_link *link, unsigned int sc_reg, u32 val) 179static int sis_scr_write(struct ata_link *link, unsigned int sc_reg, u32 val)
209{ 180{
210 struct ata_port *ap = link->ap; 181 struct ata_port *ap = link->ap;
211 struct pci_dev *pdev = to_pci_dev(ap->host->dev); 182 void __iomem *base = ap->ioaddr.scr_addr + link->pmp * 0x10;
212 u8 pmr;
213 183
214 if (sc_reg > SCR_CONTROL) 184 if (sc_reg > SCR_CONTROL)
215 return -EINVAL; 185 return -EINVAL;
216 186
217 pci_read_config_byte(pdev, SIS_PMR, &pmr);
218
219 if (ap->flags & SIS_FLAG_CFGSCR) 187 if (ap->flags & SIS_FLAG_CFGSCR)
220 return sis_scr_cfg_write(link, sc_reg, val); 188 return sis_scr_cfg_write(link, sc_reg, val);
221 else { 189
222 iowrite32(val, ap->ioaddr.scr_addr + (sc_reg * 4)); 190 iowrite32(val, base + (sc_reg * 4));
223 if ((pdev->device == 0x0182) || (pdev->device == 0x0183) || 191 return 0;
224 (pdev->device == 0x1182) || (pmr & SIS_PMR_COMBINED))
225 iowrite32(val, ap->ioaddr.scr_addr + (sc_reg * 4)+0x10);
226 return 0;
227 }
228} 192}
229 193
230static int sis_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) 194static int sis_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
@@ -236,7 +200,7 @@ static int sis_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
236 u32 genctl, val; 200 u32 genctl, val;
237 u8 pmr; 201 u8 pmr;
238 u8 port2_start = 0x20; 202 u8 port2_start = 0x20;
239 int rc; 203 int i, rc;
240 204
241 if (!printed_version++) 205 if (!printed_version++)
242 dev_printk(KERN_INFO, &pdev->dev, "version " DRV_VERSION "\n"); 206 dev_printk(KERN_INFO, &pdev->dev, "version " DRV_VERSION "\n");
@@ -319,6 +283,17 @@ static int sis_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
319 if (rc) 283 if (rc)
320 return rc; 284 return rc;
321 285
286 for (i = 0; i < 2; i++) {
287 struct ata_port *ap = host->ports[i];
288
289 if (ap->flags & ATA_FLAG_SATA &&
290 ap->flags & ATA_FLAG_SLAVE_POSS) {
291 rc = ata_slave_link_init(ap);
292 if (rc)
293 return rc;
294 }
295 }
296
322 if (!(pi.flags & SIS_FLAG_CFGSCR)) { 297 if (!(pi.flags & SIS_FLAG_CFGSCR)) {
323 void __iomem *mmio; 298 void __iomem *mmio;
324 299
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 5d7a02f63e1c..50eecfe1d724 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -24,6 +24,7 @@
24#include <linux/sysrq.h> 24#include <linux/sysrq.h>
25#include <linux/kbd_kern.h> 25#include <linux/kbd_kern.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/nmi.h>
27#include <linux/quotaops.h> 28#include <linux/quotaops.h>
28#include <linux/perf_counter.h> 29#include <linux/perf_counter.h>
29#include <linux/kernel.h> 30#include <linux/kernel.h>
@@ -222,12 +223,20 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus);
222 223
223static void sysrq_handle_showallcpus(int key, struct tty_struct *tty) 224static void sysrq_handle_showallcpus(int key, struct tty_struct *tty)
224{ 225{
225 struct pt_regs *regs = get_irq_regs(); 226 /*
226 if (regs) { 227 * Fall back to the workqueue based printing if the
227 printk(KERN_INFO "CPU%d:\n", smp_processor_id()); 228 * backtrace printing did not succeed or the
228 show_regs(regs); 229 * architecture has no support for it:
230 */
231 if (!trigger_all_cpu_backtrace()) {
232 struct pt_regs *regs = get_irq_regs();
233
234 if (regs) {
235 printk(KERN_INFO "CPU%d:\n", smp_processor_id());
236 show_regs(regs);
237 }
238 schedule_work(&sysrq_showallcpus);
229 } 239 }
230 schedule_work(&sysrq_showallcpus);
231} 240}
232 241
233static struct sysrq_key_op sysrq_showallcpus_op = { 242static struct sysrq_key_op sysrq_showallcpus_op = {
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 24c84ae81527..938100f14b16 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -568,35 +568,76 @@ const struct dmi_device * dmi_find_device(int type, const char *name,
568EXPORT_SYMBOL(dmi_find_device); 568EXPORT_SYMBOL(dmi_find_device);
569 569
570/** 570/**
571 * dmi_get_year - Return year of a DMI date 571 * dmi_get_date - parse a DMI date
572 * @field: data index (like dmi_get_system_info) 572 * @field: data index (see enum dmi_field)
573 * @yearp: optional out parameter for the year
574 * @monthp: optional out parameter for the month
575 * @dayp: optional out parameter for the day
573 * 576 *
574 * Returns -1 when the field doesn't exist. 0 when it is broken. 577 * The date field is assumed to be in the form resembling
578 * [mm[/dd]]/yy[yy] and the result is stored in the out
 579 * parameters, any or all of which can be omitted.
580 *
581 * If the field doesn't exist, all out parameters are set to zero
582 * and false is returned. Otherwise, true is returned with any
 583 * invalid part of the date set to zero.
584 *
585 * On return, year, month and day are guaranteed to be in the
586 * range of [0,9999], [0,12] and [0,31] respectively.
575 */ 587 */
576int dmi_get_year(int field) 588bool dmi_get_date(int field, int *yearp, int *monthp, int *dayp)
577{ 589{
578 int year; 590 int year = 0, month = 0, day = 0;
579 const char *s = dmi_get_system_info(field); 591 bool exists;
592 const char *s, *y;
593 char *e;
580 594
581 if (!s) 595 s = dmi_get_system_info(field);
582 return -1; 596 exists = s;
583 if (*s == '\0') 597 if (!exists)
584 return 0; 598 goto out;
585 s = strrchr(s, '/');
586 if (!s)
587 return 0;
588 599
589 s += 1; 600 /*
590 year = simple_strtoul(s, NULL, 0); 601 * Determine year first. We assume the date string resembles
591 if (year && year < 100) { /* 2-digit year */ 602 * mm/dd/yy[yy] but the original code extracted only the year
603 * from the end. Keep the behavior in the spirit of no
604 * surprises.
605 */
606 y = strrchr(s, '/');
607 if (!y)
608 goto out;
609
610 y++;
611 year = simple_strtoul(y, &e, 10);
612 if (y != e && year < 100) { /* 2-digit year */
592 year += 1900; 613 year += 1900;
593 if (year < 1996) /* no dates < spec 1.0 */ 614 if (year < 1996) /* no dates < spec 1.0 */
594 year += 100; 615 year += 100;
595 } 616 }
617 if (year > 9999) /* year should fit in %04d */
618 year = 0;
619
620 /* parse the mm and dd */
621 month = simple_strtoul(s, &e, 10);
622 if (s == e || *e != '/' || !month || month > 12) {
623 month = 0;
624 goto out;
625 }
596 626
597 return year; 627 s = e + 1;
628 day = simple_strtoul(s, &e, 10);
629 if (s == y || s == e || *e != '/' || day > 31)
630 day = 0;
631out:
632 if (yearp)
633 *yearp = year;
634 if (monthp)
635 *monthp = month;
636 if (dayp)
637 *dayp = day;
638 return exists;
598} 639}
599EXPORT_SYMBOL(dmi_get_year); 640EXPORT_SYMBOL(dmi_get_date);
600 641
601/** 642/**
602 * dmi_walk - Walk the DMI table and get called back for every record 643 * dmi_walk - Walk the DMI table and get called back for every record
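
The dmi_get_date() interface above supersedes dmi_get_year(). A minimal sketch of a caller follows; the foo_bios_is_recent() helper and the 2006 cutoff are invented for illustration and are not part of this series:

#include <linux/types.h>
#include <linux/dmi.h>

/* Hypothetical check: BIOS date present and newer than 2006. */
static bool foo_bios_is_recent(void)
{
	int year;

	/* dmi_get_date() returns false if the DMI field is missing;
	 * month/day out parameters that are not needed may be NULL. */
	if (!dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL))
		return false;

	/* year is 0 when the year part could not be parsed */
	return year > 2006;
}
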
diff --git a/drivers/ide/atiixp.c b/drivers/ide/atiixp.c
index 923cbfe259d3..6396c3ad3252 100644
--- a/drivers/ide/atiixp.c
+++ b/drivers/ide/atiixp.c
@@ -177,6 +177,7 @@ static const struct pci_device_id atiixp_pci_tbl[] = {
177 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP400_IDE), 0 }, 177 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP400_IDE), 0 },
178 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP600_IDE), 1 }, 178 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP600_IDE), 1 },
179 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP700_IDE), 0 }, 179 { PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP700_IDE), 0 },
180 { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_SB900_IDE), 0 },
180 { 0, }, 181 { 0, },
181}; 182};
182MODULE_DEVICE_TABLE(pci, atiixp_pci_tbl); 183MODULE_DEVICE_TABLE(pci, atiixp_pci_tbl);
diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 242257b19441..a7aae24f2889 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -21,7 +21,6 @@
21 21
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/oprofile.h> 23#include <linux/oprofile.h>
24#include <linux/vmalloc.h>
25#include <linux/errno.h> 24#include <linux/errno.h>
26 25
27#include "event_buffer.h" 26#include "event_buffer.h"
@@ -407,6 +406,21 @@ int oprofile_add_data(struct op_entry *entry, unsigned long val)
407 return op_cpu_buffer_add_data(entry, val); 406 return op_cpu_buffer_add_data(entry, val);
408} 407}
409 408
409int oprofile_add_data64(struct op_entry *entry, u64 val)
410{
411 if (!entry->event)
412 return 0;
413 if (op_cpu_buffer_get_size(entry) < 2)
414 /*
415 * the function returns 0 to indicate a too small
416 * buffer, even if there is some space left
417 */
418 return 0;
419 if (!op_cpu_buffer_add_data(entry, (u32)val))
420 return 0;
421 return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
422}
423
410int oprofile_write_commit(struct op_entry *entry) 424int oprofile_write_commit(struct op_entry *entry)
411{ 425{
412 if (!entry->event) 426 if (!entry->event)
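
A minimal sketch of how an architecture driver might emit one 64-bit value through the new oprofile_add_data64() helper, assuming the matching prototype in <linux/oprofile.h> that this series adds. The foo_log_msr() wrapper, the sample code of 0 and the two-slot reservation are illustrative only:

#include <linux/oprofile.h>
#include <linux/ptrace.h>

/* Hypothetical arch hook: log a 64-bit hardware value as two 32-bit words. */
static void foo_log_msr(struct pt_regs * const regs, u64 msr_val)
{
	struct op_entry entry;

	/* reserve room for two data words in the per-cpu buffer */
	oprofile_write_reserve(&entry, regs, instruction_pointer(regs),
			       0 /* sample code, illustrative */, 2);
	if (!oprofile_add_data64(&entry, msr_val))
		return;		/* buffer too small, drop the sample */
	oprofile_write_commit(&entry);
}
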
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index 3cffce90f82a..dc8a0428260d 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -12,6 +12,8 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/oprofile.h> 13#include <linux/oprofile.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/workqueue.h>
16#include <linux/time.h>
15#include <asm/mutex.h> 17#include <asm/mutex.h>
16 18
17#include "oprof.h" 19#include "oprof.h"
@@ -87,6 +89,69 @@ out:
87 return err; 89 return err;
88} 90}
89 91
92#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
93
94static void switch_worker(struct work_struct *work);
95static DECLARE_DELAYED_WORK(switch_work, switch_worker);
96
97static void start_switch_worker(void)
98{
99 if (oprofile_ops.switch_events)
100 schedule_delayed_work(&switch_work, oprofile_time_slice);
101}
102
103static void stop_switch_worker(void)
104{
105 cancel_delayed_work_sync(&switch_work);
106}
107
108static void switch_worker(struct work_struct *work)
109{
110 if (oprofile_ops.switch_events())
111 return;
112
113 atomic_inc(&oprofile_stats.multiplex_counter);
114 start_switch_worker();
115}
116
117/* User inputs in ms, converts to jiffies */
118int oprofile_set_timeout(unsigned long val_msec)
119{
120 int err = 0;
121 unsigned long time_slice;
122
123 mutex_lock(&start_mutex);
124
125 if (oprofile_started) {
126 err = -EBUSY;
127 goto out;
128 }
129
130 if (!oprofile_ops.switch_events) {
131 err = -EINVAL;
132 goto out;
133 }
134
135 time_slice = msecs_to_jiffies(val_msec);
136 if (time_slice == MAX_JIFFY_OFFSET) {
137 err = -EINVAL;
138 goto out;
139 }
140
141 oprofile_time_slice = time_slice;
142
143out:
144 mutex_unlock(&start_mutex);
145 return err;
146
147}
148
149#else
150
151static inline void start_switch_worker(void) { }
152static inline void stop_switch_worker(void) { }
153
154#endif
90 155
91/* Actually start profiling (echo 1>/dev/oprofile/enable) */ 156/* Actually start profiling (echo 1>/dev/oprofile/enable) */
92int oprofile_start(void) 157int oprofile_start(void)
@@ -108,6 +173,8 @@ int oprofile_start(void)
108 if ((err = oprofile_ops.start())) 173 if ((err = oprofile_ops.start()))
109 goto out; 174 goto out;
110 175
176 start_switch_worker();
177
111 oprofile_started = 1; 178 oprofile_started = 1;
112out: 179out:
113 mutex_unlock(&start_mutex); 180 mutex_unlock(&start_mutex);
@@ -123,6 +190,9 @@ void oprofile_stop(void)
123 goto out; 190 goto out;
124 oprofile_ops.stop(); 191 oprofile_ops.stop();
125 oprofile_started = 0; 192 oprofile_started = 0;
193
194 stop_switch_worker();
195
126 /* wake up the daemon to read what remains */ 196 /* wake up the daemon to read what remains */
127 wake_up_buffer_waiter(); 197 wake_up_buffer_waiter();
128out: 198out:
@@ -155,7 +225,6 @@ post_sync:
155 mutex_unlock(&start_mutex); 225 mutex_unlock(&start_mutex);
156} 226}
157 227
158
159int oprofile_set_backtrace(unsigned long val) 228int oprofile_set_backtrace(unsigned long val)
160{ 229{
161 int err = 0; 230 int err = 0;
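
The switch_work machinery above only runs when an architecture fills in the switch_events hook this series adds to struct oprofile_operations. A minimal sketch of that wiring, with foo_switch_events() and its rotation logic invented for illustration:

#include <linux/oprofile.h>
#include <linux/init.h>

/* Hypothetical per-arch rotation to the next set of hardware counters. */
static int foo_switch_events(void)
{
	/* rotate the active counter set here; returning 0 lets the
	 * worker in oprof.c count the switch and reschedule itself */
	return 0;
}

static int __init foo_arch_init(struct oprofile_operations *ops)
{
	ops->switch_events = foo_switch_events;
	return 0;
}
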
diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h
index c288d3c24b50..cb92f5c98c1a 100644
--- a/drivers/oprofile/oprof.h
+++ b/drivers/oprofile/oprof.h
@@ -24,6 +24,8 @@ struct oprofile_operations;
24extern unsigned long oprofile_buffer_size; 24extern unsigned long oprofile_buffer_size;
25extern unsigned long oprofile_cpu_buffer_size; 25extern unsigned long oprofile_cpu_buffer_size;
26extern unsigned long oprofile_buffer_watershed; 26extern unsigned long oprofile_buffer_watershed;
27extern unsigned long oprofile_time_slice;
28
27extern struct oprofile_operations oprofile_ops; 29extern struct oprofile_operations oprofile_ops;
28extern unsigned long oprofile_started; 30extern unsigned long oprofile_started;
29extern unsigned long oprofile_backtrace_depth; 31extern unsigned long oprofile_backtrace_depth;
@@ -35,5 +37,6 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root);
35void oprofile_timer_init(struct oprofile_operations *ops); 37void oprofile_timer_init(struct oprofile_operations *ops);
36 38
37int oprofile_set_backtrace(unsigned long depth); 39int oprofile_set_backtrace(unsigned long depth);
40int oprofile_set_timeout(unsigned long time);
38 41
39#endif /* OPROF_H */ 42#endif /* OPROF_H */
diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c
index 5d36ffc30dd5..bbd7516e0869 100644
--- a/drivers/oprofile/oprofile_files.c
+++ b/drivers/oprofile/oprofile_files.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/oprofile.h> 11#include <linux/oprofile.h>
12#include <linux/jiffies.h>
12 13
13#include "event_buffer.h" 14#include "event_buffer.h"
14#include "oprofile_stats.h" 15#include "oprofile_stats.h"
@@ -17,10 +18,51 @@
17#define BUFFER_SIZE_DEFAULT 131072 18#define BUFFER_SIZE_DEFAULT 131072
18#define CPU_BUFFER_SIZE_DEFAULT 8192 19#define CPU_BUFFER_SIZE_DEFAULT 8192
19#define BUFFER_WATERSHED_DEFAULT 32768 /* FIXME: tune */ 20#define BUFFER_WATERSHED_DEFAULT 32768 /* FIXME: tune */
21#define TIME_SLICE_DEFAULT 1
20 22
21unsigned long oprofile_buffer_size; 23unsigned long oprofile_buffer_size;
22unsigned long oprofile_cpu_buffer_size; 24unsigned long oprofile_cpu_buffer_size;
23unsigned long oprofile_buffer_watershed; 25unsigned long oprofile_buffer_watershed;
26unsigned long oprofile_time_slice;
27
28#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
29
30static ssize_t timeout_read(struct file *file, char __user *buf,
31 size_t count, loff_t *offset)
32{
33 return oprofilefs_ulong_to_user(jiffies_to_msecs(oprofile_time_slice),
34 buf, count, offset);
35}
36
37
38static ssize_t timeout_write(struct file *file, char const __user *buf,
39 size_t count, loff_t *offset)
40{
41 unsigned long val;
42 int retval;
43
44 if (*offset)
45 return -EINVAL;
46
47 retval = oprofilefs_ulong_from_user(&val, buf, count);
48 if (retval)
49 return retval;
50
51 retval = oprofile_set_timeout(val);
52
53 if (retval)
54 return retval;
55 return count;
56}
57
58
59static const struct file_operations timeout_fops = {
60 .read = timeout_read,
61 .write = timeout_write,
62};
63
64#endif
65
24 66
25static ssize_t depth_read(struct file *file, char __user *buf, size_t count, loff_t *offset) 67static ssize_t depth_read(struct file *file, char __user *buf, size_t count, loff_t *offset)
26{ 68{
@@ -129,6 +171,7 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root)
129 oprofile_buffer_size = BUFFER_SIZE_DEFAULT; 171 oprofile_buffer_size = BUFFER_SIZE_DEFAULT;
130 oprofile_cpu_buffer_size = CPU_BUFFER_SIZE_DEFAULT; 172 oprofile_cpu_buffer_size = CPU_BUFFER_SIZE_DEFAULT;
131 oprofile_buffer_watershed = BUFFER_WATERSHED_DEFAULT; 173 oprofile_buffer_watershed = BUFFER_WATERSHED_DEFAULT;
174 oprofile_time_slice = msecs_to_jiffies(TIME_SLICE_DEFAULT);
132 175
133 oprofilefs_create_file(sb, root, "enable", &enable_fops); 176 oprofilefs_create_file(sb, root, "enable", &enable_fops);
134 oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666); 177 oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666);
@@ -139,6 +182,9 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root)
139 oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops); 182 oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops);
140 oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops); 183 oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops);
141 oprofilefs_create_file(sb, root, "pointer_size", &pointer_size_fops); 184 oprofilefs_create_file(sb, root, "pointer_size", &pointer_size_fops);
185#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
186 oprofilefs_create_file(sb, root, "time_slice", &timeout_fops);
187#endif
142 oprofile_create_stats_files(sb, root); 188 oprofile_create_stats_files(sb, root);
143 if (oprofile_ops.create_files) 189 if (oprofile_ops.create_files)
144 oprofile_ops.create_files(sb, root); 190 oprofile_ops.create_files(sb, root);
diff --git a/drivers/oprofile/oprofile_stats.c b/drivers/oprofile/oprofile_stats.c
index 3c2270a8300c..61689e814d46 100644
--- a/drivers/oprofile/oprofile_stats.c
+++ b/drivers/oprofile/oprofile_stats.c
@@ -34,6 +34,7 @@ void oprofile_reset_stats(void)
34 atomic_set(&oprofile_stats.sample_lost_no_mapping, 0); 34 atomic_set(&oprofile_stats.sample_lost_no_mapping, 0);
35 atomic_set(&oprofile_stats.event_lost_overflow, 0); 35 atomic_set(&oprofile_stats.event_lost_overflow, 0);
36 atomic_set(&oprofile_stats.bt_lost_no_mapping, 0); 36 atomic_set(&oprofile_stats.bt_lost_no_mapping, 0);
37 atomic_set(&oprofile_stats.multiplex_counter, 0);
37} 38}
38 39
39 40
@@ -76,4 +77,8 @@ void oprofile_create_stats_files(struct super_block *sb, struct dentry *root)
76 &oprofile_stats.event_lost_overflow); 77 &oprofile_stats.event_lost_overflow);
77 oprofilefs_create_ro_atomic(sb, dir, "bt_lost_no_mapping", 78 oprofilefs_create_ro_atomic(sb, dir, "bt_lost_no_mapping",
78 &oprofile_stats.bt_lost_no_mapping); 79 &oprofile_stats.bt_lost_no_mapping);
80#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
81 oprofilefs_create_ro_atomic(sb, dir, "multiplex_counter",
82 &oprofile_stats.multiplex_counter);
83#endif
79} 84}
diff --git a/drivers/oprofile/oprofile_stats.h b/drivers/oprofile/oprofile_stats.h
index 3da0d08dc1f9..0b54e46c3c14 100644
--- a/drivers/oprofile/oprofile_stats.h
+++ b/drivers/oprofile/oprofile_stats.h
@@ -17,6 +17,7 @@ struct oprofile_stat_struct {
17 atomic_t sample_lost_no_mapping; 17 atomic_t sample_lost_no_mapping;
18 atomic_t bt_lost_no_mapping; 18 atomic_t bt_lost_no_mapping;
19 atomic_t event_lost_overflow; 19 atomic_t event_lost_overflow;
20 atomic_t multiplex_counter;
20}; 21};
21 22
22extern struct oprofile_stat_struct oprofile_stats; 23extern struct oprofile_stat_struct oprofile_stats;
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 4f5b8712931f..44803644ca05 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -55,15 +55,12 @@ static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
55 return desc->irq_2_iommu; 55 return desc->irq_2_iommu;
56} 56}
57 57
58static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node) 58static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
59{ 59{
60 struct irq_desc *desc; 60 struct irq_desc *desc;
61 struct irq_2_iommu *irq_iommu; 61 struct irq_2_iommu *irq_iommu;
62 62
63 /* 63 desc = irq_to_desc(irq);
64 * alloc irq desc if not allocated already.
65 */
66 desc = irq_to_desc_alloc_node(irq, node);
67 if (!desc) { 64 if (!desc) {
68 printk(KERN_INFO "can not get irq_desc for %d\n", irq); 65 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
69 return NULL; 66 return NULL;
@@ -72,16 +69,11 @@ static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node)
72 irq_iommu = desc->irq_2_iommu; 69 irq_iommu = desc->irq_2_iommu;
73 70
74 if (!irq_iommu) 71 if (!irq_iommu)
75 desc->irq_2_iommu = get_one_free_irq_2_iommu(node); 72 desc->irq_2_iommu = get_one_free_irq_2_iommu(irq_node(irq));
76 73
77 return desc->irq_2_iommu; 74 return desc->irq_2_iommu;
78} 75}
79 76
80static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
81{
82 return irq_2_iommu_alloc_node(irq, cpu_to_node(boot_cpu_id));
83}
84
85#else /* !CONFIG_SPARSE_IRQ */ 77#else /* !CONFIG_SPARSE_IRQ */
86 78
87static struct irq_2_iommu irq_2_iommuX[NR_IRQS]; 79static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 06b965623962..85ce23997be4 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -992,7 +992,7 @@ DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454NX,
992 992
993static void __devinit quirk_amd_ide_mode(struct pci_dev *pdev) 993static void __devinit quirk_amd_ide_mode(struct pci_dev *pdev)
994{ 994{
995 /* set sb600/sb700/sb800 sata to ahci mode */ 995 /* set SBX00 SATA in IDE mode to AHCI mode */
996 u8 tmp; 996 u8 tmp;
997 997
998 pci_read_config_byte(pdev, PCI_CLASS_DEVICE, &tmp); 998 pci_read_config_byte(pdev, PCI_CLASS_DEVICE, &tmp);
@@ -1011,6 +1011,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP600_SATA, quirk
1011DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP600_SATA, quirk_amd_ide_mode); 1011DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP600_SATA, quirk_amd_ide_mode);
1012DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP700_SATA, quirk_amd_ide_mode); 1012DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP700_SATA, quirk_amd_ide_mode);
1013DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP700_SATA, quirk_amd_ide_mode); 1013DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP700_SATA, quirk_amd_ide_mode);
1014DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_SB900_SATA_IDE, quirk_amd_ide_mode);
1015DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_SB900_SATA_IDE, quirk_amd_ide_mode);
1014 1016
1015/* 1017/*
1016 * Serverworks CSB5 IDE does not fully support native mode 1018 * Serverworks CSB5 IDE does not fully support native mode
diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6ba..a100fa35a48f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
32#include <linux/swap.h> 32#include <linux/swap.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h> 34#include <linux/fs_struct.h>
35#include <linux/hardirq.h>
35#include "internal.h" 36#include "internal.h"
36 37
37int sysctl_vfs_cache_pressure __read_mostly = 100; 38int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/locks.c b/fs/locks.c
index 52366e877d76..19ee18a6829b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
768 * give it the opportunity to lock the file. 768 * give it the opportunity to lock the file.
769 */ 769 */
770 if (found) 770 if (found)
771 cond_resched_bkl(); 771 cond_resched();
772 772
773find_conflict: 773find_conflict:
774 for_each_lock(inode, before) { 774 for_each_lock(inode, before) {
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
index 5406a601185c..e694263445f7 100644
--- a/include/asm-generic/dma-mapping-common.h
+++ b/include/asm-generic/dma-mapping-common.h
@@ -103,7 +103,6 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
103 if (ops->sync_single_for_cpu) 103 if (ops->sync_single_for_cpu)
104 ops->sync_single_for_cpu(dev, addr, size, dir); 104 ops->sync_single_for_cpu(dev, addr, size, dir);
105 debug_dma_sync_single_for_cpu(dev, addr, size, dir); 105 debug_dma_sync_single_for_cpu(dev, addr, size, dir);
106 flush_write_buffers();
107} 106}
108 107
109static inline void dma_sync_single_for_device(struct device *dev, 108static inline void dma_sync_single_for_device(struct device *dev,
@@ -116,7 +115,6 @@ static inline void dma_sync_single_for_device(struct device *dev,
116 if (ops->sync_single_for_device) 115 if (ops->sync_single_for_device)
117 ops->sync_single_for_device(dev, addr, size, dir); 116 ops->sync_single_for_device(dev, addr, size, dir);
118 debug_dma_sync_single_for_device(dev, addr, size, dir); 117 debug_dma_sync_single_for_device(dev, addr, size, dir);
119 flush_write_buffers();
120} 118}
121 119
122static inline void dma_sync_single_range_for_cpu(struct device *dev, 120static inline void dma_sync_single_range_for_cpu(struct device *dev,
@@ -132,7 +130,6 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev,
132 ops->sync_single_range_for_cpu(dev, addr, offset, size, dir); 130 ops->sync_single_range_for_cpu(dev, addr, offset, size, dir);
133 debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); 131 debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir);
134 132
135 flush_write_buffers();
136 } else 133 } else
137 dma_sync_single_for_cpu(dev, addr, size, dir); 134 dma_sync_single_for_cpu(dev, addr, size, dir);
138} 135}
@@ -150,7 +147,6 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
150 ops->sync_single_range_for_device(dev, addr, offset, size, dir); 147 ops->sync_single_range_for_device(dev, addr, offset, size, dir);
151 debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); 148 debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir);
152 149
153 flush_write_buffers();
154 } else 150 } else
155 dma_sync_single_for_device(dev, addr, size, dir); 151 dma_sync_single_for_device(dev, addr, size, dir);
156} 152}
@@ -165,7 +161,6 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
165 if (ops->sync_sg_for_cpu) 161 if (ops->sync_sg_for_cpu)
166 ops->sync_sg_for_cpu(dev, sg, nelems, dir); 162 ops->sync_sg_for_cpu(dev, sg, nelems, dir);
167 debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); 163 debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
168 flush_write_buffers();
169} 164}
170 165
171static inline void 166static inline void
@@ -179,7 +174,6 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
179 ops->sync_sg_for_device(dev, sg, nelems, dir); 174 ops->sync_sg_for_device(dev, sg, nelems, dir);
180 debug_dma_sync_sg_for_device(dev, sg, nelems, dir); 175 debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
181 176
182 flush_write_buffers();
183} 177}
184 178
185#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL) 179#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL)
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 9c75921f0c16..6299a259ed19 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -210,15 +210,25 @@ enum {
210 ATA_CMD_STANDBY = 0xE2, /* place in standby power mode */ 210 ATA_CMD_STANDBY = 0xE2, /* place in standby power mode */
211 ATA_CMD_IDLE = 0xE3, /* place in idle power mode */ 211 ATA_CMD_IDLE = 0xE3, /* place in idle power mode */
212 ATA_CMD_EDD = 0x90, /* execute device diagnostic */ 212 ATA_CMD_EDD = 0x90, /* execute device diagnostic */
213 ATA_CMD_DOWNLOAD_MICRO = 0x92,
214 ATA_CMD_NOP = 0x00,
213 ATA_CMD_FLUSH = 0xE7, 215 ATA_CMD_FLUSH = 0xE7,
214 ATA_CMD_FLUSH_EXT = 0xEA, 216 ATA_CMD_FLUSH_EXT = 0xEA,
215 ATA_CMD_ID_ATA = 0xEC, 217 ATA_CMD_ID_ATA = 0xEC,
216 ATA_CMD_ID_ATAPI = 0xA1, 218 ATA_CMD_ID_ATAPI = 0xA1,
219 ATA_CMD_SERVICE = 0xA2,
217 ATA_CMD_READ = 0xC8, 220 ATA_CMD_READ = 0xC8,
218 ATA_CMD_READ_EXT = 0x25, 221 ATA_CMD_READ_EXT = 0x25,
222 ATA_CMD_READ_QUEUED = 0x26,
223 ATA_CMD_READ_STREAM_EXT = 0x2B,
224 ATA_CMD_READ_STREAM_DMA_EXT = 0x2A,
219 ATA_CMD_WRITE = 0xCA, 225 ATA_CMD_WRITE = 0xCA,
220 ATA_CMD_WRITE_EXT = 0x35, 226 ATA_CMD_WRITE_EXT = 0x35,
227 ATA_CMD_WRITE_QUEUED = 0x36,
228 ATA_CMD_WRITE_STREAM_EXT = 0x3B,
229 ATA_CMD_WRITE_STREAM_DMA_EXT = 0x3A,
221 ATA_CMD_WRITE_FUA_EXT = 0x3D, 230 ATA_CMD_WRITE_FUA_EXT = 0x3D,
231 ATA_CMD_WRITE_QUEUED_FUA_EXT = 0x3E,
222 ATA_CMD_FPDMA_READ = 0x60, 232 ATA_CMD_FPDMA_READ = 0x60,
223 ATA_CMD_FPDMA_WRITE = 0x61, 233 ATA_CMD_FPDMA_WRITE = 0x61,
224 ATA_CMD_PIO_READ = 0x20, 234 ATA_CMD_PIO_READ = 0x20,
@@ -235,6 +245,7 @@ enum {
235 ATA_CMD_PACKET = 0xA0, 245 ATA_CMD_PACKET = 0xA0,
236 ATA_CMD_VERIFY = 0x40, 246 ATA_CMD_VERIFY = 0x40,
237 ATA_CMD_VERIFY_EXT = 0x42, 247 ATA_CMD_VERIFY_EXT = 0x42,
248 ATA_CMD_WRITE_UNCORR_EXT = 0x45,
238 ATA_CMD_STANDBYNOW1 = 0xE0, 249 ATA_CMD_STANDBYNOW1 = 0xE0,
239 ATA_CMD_IDLEIMMEDIATE = 0xE1, 250 ATA_CMD_IDLEIMMEDIATE = 0xE1,
240 ATA_CMD_SLEEP = 0xE6, 251 ATA_CMD_SLEEP = 0xE6,
@@ -243,15 +254,34 @@ enum {
243 ATA_CMD_READ_NATIVE_MAX_EXT = 0x27, 254 ATA_CMD_READ_NATIVE_MAX_EXT = 0x27,
244 ATA_CMD_SET_MAX = 0xF9, 255 ATA_CMD_SET_MAX = 0xF9,
245 ATA_CMD_SET_MAX_EXT = 0x37, 256 ATA_CMD_SET_MAX_EXT = 0x37,
246 ATA_CMD_READ_LOG_EXT = 0x2f, 257 ATA_CMD_READ_LOG_EXT = 0x2F,
258 ATA_CMD_WRITE_LOG_EXT = 0x3F,
259 ATA_CMD_READ_LOG_DMA_EXT = 0x47,
260 ATA_CMD_WRITE_LOG_DMA_EXT = 0x57,
261 ATA_CMD_TRUSTED_RCV = 0x5C,
262 ATA_CMD_TRUSTED_RCV_DMA = 0x5D,
263 ATA_CMD_TRUSTED_SND = 0x5E,
264 ATA_CMD_TRUSTED_SND_DMA = 0x5F,
247 ATA_CMD_PMP_READ = 0xE4, 265 ATA_CMD_PMP_READ = 0xE4,
248 ATA_CMD_PMP_WRITE = 0xE8, 266 ATA_CMD_PMP_WRITE = 0xE8,
249 ATA_CMD_CONF_OVERLAY = 0xB1, 267 ATA_CMD_CONF_OVERLAY = 0xB1,
268 ATA_CMD_SEC_SET_PASS = 0xF1,
269 ATA_CMD_SEC_UNLOCK = 0xF2,
270 ATA_CMD_SEC_ERASE_PREP = 0xF3,
271 ATA_CMD_SEC_ERASE_UNIT = 0xF4,
250 ATA_CMD_SEC_FREEZE_LOCK = 0xF5, 272 ATA_CMD_SEC_FREEZE_LOCK = 0xF5,
273 ATA_CMD_SEC_DISABLE_PASS = 0xF6,
274 ATA_CMD_CONFIG_STREAM = 0x51,
251 ATA_CMD_SMART = 0xB0, 275 ATA_CMD_SMART = 0xB0,
252 ATA_CMD_MEDIA_LOCK = 0xDE, 276 ATA_CMD_MEDIA_LOCK = 0xDE,
253 ATA_CMD_MEDIA_UNLOCK = 0xDF, 277 ATA_CMD_MEDIA_UNLOCK = 0xDF,
254 ATA_CMD_DSM = 0x06, 278 ATA_CMD_DSM = 0x06,
279 ATA_CMD_CHK_MED_CRD_TYP = 0xD1,
280 ATA_CMD_CFA_REQ_EXT_ERR = 0x03,
281 ATA_CMD_CFA_WRITE_NE = 0x38,
282 ATA_CMD_CFA_TRANS_SECT = 0x87,
283 ATA_CMD_CFA_ERASE = 0xC0,
284 ATA_CMD_CFA_WRITE_MULT_NE = 0xCD,
255 /* marked obsolete in the ATA/ATAPI-7 spec */ 285 /* marked obsolete in the ATA/ATAPI-7 spec */
256 ATA_CMD_RESTORE = 0x10, 286 ATA_CMD_RESTORE = 0x10,
257 287
@@ -306,6 +336,7 @@ enum {
306 /* SETFEATURE Sector counts for SATA features */ 336 /* SETFEATURE Sector counts for SATA features */
307 SATA_AN = 0x05, /* Asynchronous Notification */ 337 SATA_AN = 0x05, /* Asynchronous Notification */
308 SATA_DIPM = 0x03, /* Device Initiated Power Management */ 338 SATA_DIPM = 0x03, /* Device Initiated Power Management */
339 SATA_FPDMA_AA = 0x02, /* DMA Setup FIS Auto-Activate */
309 340
310 /* feature values for SET_MAX */ 341 /* feature values for SET_MAX */
311 ATA_SET_MAX_ADDR = 0x00, 342 ATA_SET_MAX_ADDR = 0x00,
@@ -525,6 +556,9 @@ static inline int ata_is_data(u8 prot)
525#define ata_id_has_atapi_AN(id) \ 556#define ata_id_has_atapi_AN(id) \
526 ( (((id)[76] != 0x0000) && ((id)[76] != 0xffff)) && \ 557 ( (((id)[76] != 0x0000) && ((id)[76] != 0xffff)) && \
527 ((id)[78] & (1 << 5)) ) 558 ((id)[78] & (1 << 5)) )
559#define ata_id_has_fpdma_aa(id) \
560 ( (((id)[76] != 0x0000) && ((id)[76] != 0xffff)) && \
561 ((id)[78] & (1 << 2)) )
528#define ata_id_iordy_disable(id) ((id)[ATA_ID_CAPABILITY] & (1 << 10)) 562#define ata_id_iordy_disable(id) ((id)[ATA_ID_CAPABILITY] & (1 << 10))
529#define ata_id_has_iordy(id) ((id)[ATA_ID_CAPABILITY] & (1 << 11)) 563#define ata_id_has_iordy(id) ((id)[ATA_ID_CAPABILITY] & (1 << 11))
530#define ata_id_u32(id,n) \ 564#define ata_id_u32(id,n) \
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4d668e05d458..47536197ffdd 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -48,6 +48,15 @@ struct notifier_block;
48 48
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50/* Need to know about CPUs going up/down? */ 50/* Need to know about CPUs going up/down? */
51#if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)
52#define cpu_notifier(fn, pri) { \
53 static struct notifier_block fn##_nb __cpuinitdata = \
54 { .notifier_call = fn, .priority = pri }; \
55 register_cpu_notifier(&fn##_nb); \
56}
57#else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */
58#define cpu_notifier(fn, pri) do { (void)(fn); } while (0)
59#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */
51#ifdef CONFIG_HOTPLUG_CPU 60#ifdef CONFIG_HOTPLUG_CPU
52extern int register_cpu_notifier(struct notifier_block *nb); 61extern int register_cpu_notifier(struct notifier_block *nb);
53extern void unregister_cpu_notifier(struct notifier_block *nb); 62extern void unregister_cpu_notifier(struct notifier_block *nb);
@@ -74,6 +83,8 @@ extern void cpu_maps_update_done(void);
74 83
75#else /* CONFIG_SMP */ 84#else /* CONFIG_SMP */
76 85
86#define cpu_notifier(fn, pri) do { (void)(fn); } while (0)
87
77static inline int register_cpu_notifier(struct notifier_block *nb) 88static inline int register_cpu_notifier(struct notifier_block *nb)
78{ 89{
79 return 0; 90 return 0;
@@ -99,11 +110,7 @@ extern struct sysdev_class cpu_sysdev_class;
99 110
100extern void get_online_cpus(void); 111extern void get_online_cpus(void);
101extern void put_online_cpus(void); 112extern void put_online_cpus(void);
102#define hotcpu_notifier(fn, pri) { \ 113#define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri)
103 static struct notifier_block fn##_nb __cpuinitdata = \
104 { .notifier_call = fn, .priority = pri }; \
105 register_cpu_notifier(&fn##_nb); \
106}
107#define register_hotcpu_notifier(nb) register_cpu_notifier(nb) 114#define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
108#define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) 115#define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb)
109int cpu_down(unsigned int cpu); 116int cpu_down(unsigned int cpu);
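
A minimal sketch of the new cpu_notifier() convenience macro; foo_cpu_callback() and the per-cpu setup it would perform are placeholders. In a modular build without CPU hotplug the macro compiles away, which is the case the #else branch above covers:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/init.h>

/* Hypothetical callback reacting to CPUs coming online. */
static int __cpuinit foo_cpu_callback(struct notifier_block *nb,
				      unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		/* per-cpu setup for (unsigned long)hcpu would go here */
		break;
	}
	return NOTIFY_OK;
}

static int __init foo_init(void)
{
	/* declares a static notifier_block and registers it */
	cpu_notifier(foo_cpu_callback, 0);
	return 0;
}
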
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 07dfd460d286..c0f6c3cd788c 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -98,11 +98,6 @@ static inline int is_device_dma_capable(struct device *dev)
98 return dev->dma_mask != NULL && *dev->dma_mask != DMA_MASK_NONE; 98 return dev->dma_mask != NULL && *dev->dma_mask != DMA_MASK_NONE;
99} 99}
100 100
101static inline int is_buffer_dma_capable(u64 mask, dma_addr_t addr, size_t size)
102{
103 return addr + size <= mask;
104}
105
106#ifdef CONFIG_HAS_DMA 101#ifdef CONFIG_HAS_DMA
107#include <asm/dma-mapping.h> 102#include <asm/dma-mapping.h>
108#else 103#else
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index bb5489c82c99..a8a3e1ac281d 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -43,7 +43,7 @@ extern const char * dmi_get_system_info(int field);
43extern const struct dmi_device * dmi_find_device(int type, const char *name, 43extern const struct dmi_device * dmi_find_device(int type, const char *name,
44 const struct dmi_device *from); 44 const struct dmi_device *from);
45extern void dmi_scan_machine(void); 45extern void dmi_scan_machine(void);
46extern int dmi_get_year(int field); 46extern bool dmi_get_date(int field, int *yearp, int *monthp, int *dayp);
47extern int dmi_name_in_vendors(const char *str); 47extern int dmi_name_in_vendors(const char *str);
48extern int dmi_name_in_serial(const char *str); 48extern int dmi_name_in_serial(const char *str);
49extern int dmi_available; 49extern int dmi_available;
@@ -58,7 +58,16 @@ static inline const char * dmi_get_system_info(int field) { return NULL; }
58static inline const struct dmi_device * dmi_find_device(int type, const char *name, 58static inline const struct dmi_device * dmi_find_device(int type, const char *name,
59 const struct dmi_device *from) { return NULL; } 59 const struct dmi_device *from) { return NULL; }
60static inline void dmi_scan_machine(void) { return; } 60static inline void dmi_scan_machine(void) { return; }
61static inline int dmi_get_year(int year) { return 0; } 61static inline bool dmi_get_date(int field, int *yearp, int *monthp, int *dayp)
62{
63 if (yearp)
64 *yearp = 0;
65 if (monthp)
66 *monthp = 0;
67 if (dayp)
68 *dayp = 0;
69 return false;
70}
62static inline int dmi_name_in_vendors(const char *s) { return 0; } 71static inline int dmi_name_in_vendors(const char *s) { return 0; }
63static inline int dmi_name_in_serial(const char *s) { return 0; } 72static inline int dmi_name_in_serial(const char *s) { return 0; }
64#define dmi_available 0 73#define dmi_available 0
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index a81170de7f6b..23f7179bf74e 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -93,16 +93,22 @@ void tracing_generic_entry_update(struct trace_entry *entry,
93 unsigned long flags, 93 unsigned long flags,
94 int pc); 94 int pc);
95struct ring_buffer_event * 95struct ring_buffer_event *
96trace_current_buffer_lock_reserve(int type, unsigned long len, 96trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
97 int type, unsigned long len,
97 unsigned long flags, int pc); 98 unsigned long flags, int pc);
98void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 99void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
100 struct ring_buffer_event *event,
99 unsigned long flags, int pc); 101 unsigned long flags, int pc);
100void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 102void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
103 struct ring_buffer_event *event,
101 unsigned long flags, int pc); 104 unsigned long flags, int pc);
102void trace_current_buffer_discard_commit(struct ring_buffer_event *event); 105void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
106 struct ring_buffer_event *event);
103 107
104void tracing_record_cmdline(struct task_struct *tsk); 108void tracing_record_cmdline(struct task_struct *tsk);
105 109
110struct event_filter;
111
106struct ftrace_event_call { 112struct ftrace_event_call {
107 struct list_head list; 113 struct list_head list;
108 char *name; 114 char *name;
@@ -110,16 +116,18 @@ struct ftrace_event_call {
110 struct dentry *dir; 116 struct dentry *dir;
111 struct trace_event *event; 117 struct trace_event *event;
112 int enabled; 118 int enabled;
113 int (*regfunc)(void); 119 int (*regfunc)(void *);
114 void (*unregfunc)(void); 120 void (*unregfunc)(void *);
115 int id; 121 int id;
116 int (*raw_init)(void); 122 int (*raw_init)(void);
117 int (*show_format)(struct trace_seq *s); 123 int (*show_format)(struct ftrace_event_call *call,
118 int (*define_fields)(void); 124 struct trace_seq *s);
125 int (*define_fields)(struct ftrace_event_call *);
119 struct list_head fields; 126 struct list_head fields;
120 int filter_active; 127 int filter_active;
121 void *filter; 128 struct event_filter *filter;
122 void *mod; 129 void *mod;
130 void *data;
123 131
124 atomic_t profile_count; 132 atomic_t profile_count;
125 int (*profile_enable)(struct ftrace_event_call *); 133 int (*profile_enable)(struct ftrace_event_call *);
@@ -129,15 +137,25 @@ struct ftrace_event_call {
129#define MAX_FILTER_PRED 32 137#define MAX_FILTER_PRED 32
130#define MAX_FILTER_STR_VAL 128 138#define MAX_FILTER_STR_VAL 128
131 139
132extern int init_preds(struct ftrace_event_call *call);
133extern void destroy_preds(struct ftrace_event_call *call); 140extern void destroy_preds(struct ftrace_event_call *call);
134extern int filter_match_preds(struct ftrace_event_call *call, void *rec); 141extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
135extern int filter_current_check_discard(struct ftrace_event_call *call, 142extern int filter_current_check_discard(struct ring_buffer *buffer,
143 struct ftrace_event_call *call,
136 void *rec, 144 void *rec,
137 struct ring_buffer_event *event); 145 struct ring_buffer_event *event);
138 146
139extern int trace_define_field(struct ftrace_event_call *call, char *type, 147enum {
140 char *name, int offset, int size, int is_signed); 148 FILTER_OTHER = 0,
149 FILTER_STATIC_STRING,
150 FILTER_DYN_STRING,
151 FILTER_PTR_STRING,
152};
153
154extern int trace_define_field(struct ftrace_event_call *call,
155 const char *type, const char *name,
156 int offset, int size, int is_signed,
157 int filter_type);
158extern int trace_define_common_fields(struct ftrace_event_call *call);
141 159
142#define is_signed_type(type) (((type)(-1)) < 0) 160#define is_signed_type(type) (((type)(-1)) < 0)
143 161
@@ -162,11 +180,4 @@ do { \
162 __trace_printk(ip, fmt, ##args); \ 180 __trace_printk(ip, fmt, ##args); \
163} while (0) 181} while (0)
164 182
165#define __common_field(type, item, is_signed) \
166 ret = trace_define_field(event_call, #type, "common_" #item, \
167 offsetof(typeof(field.ent), item), \
168 sizeof(field.ent.item), is_signed); \
169 if (ret) \
170 return ret;
171
172#endif /* _LINUX_FTRACE_EVENT_H */ 183#endif /* _LINUX_FTRACE_EVENT_H */
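
A minimal sketch of a define_fields callback under the reworked prototypes above, i.e. trace_define_common_fields() plus the extra filter_type argument. The struct foo_entry layout and field names are invented; real events have this generated by the TRACE_EVENT() macros:

#include <linux/ftrace_event.h>
#include <linux/stddef.h>

/* Hypothetical event record: the common header plus one address field. */
struct foo_entry {
	struct trace_entry	ent;
	unsigned long		addr;
};

static int foo_event_define_fields(struct ftrace_event_call *call)
{
	struct foo_entry field;
	int ret;

	/* common fields are no longer open-coded via __common_field */
	ret = trace_define_common_fields(call);
	if (ret)
		return ret;

	return trace_define_field(call, "unsigned long", "addr",
				  offsetof(typeof(field), addr),
				  sizeof(field.addr),
				  is_signed_type(unsigned long),
				  FILTER_OTHER);
}
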
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 8246c697863d..6d527ee82b2b 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -64,6 +64,12 @@
64#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) 64#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
65#define NMI_OFFSET (1UL << NMI_SHIFT) 65#define NMI_OFFSET (1UL << NMI_SHIFT)
66 66
67#ifndef PREEMPT_ACTIVE
68#define PREEMPT_ACTIVE_BITS 1
69#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
70#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
71#endif
72
67#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) 73#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
68#error PREEMPT_ACTIVE is too low! 74#error PREEMPT_ACTIVE is too low!
69#endif 75#endif
@@ -132,7 +138,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
132} 138}
133#endif 139#endif
134 140
135#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) 141#if defined(CONFIG_NO_HZ)
136extern void rcu_irq_enter(void); 142extern void rcu_irq_enter(void);
137extern void rcu_irq_exit(void); 143extern void rcu_irq_exit(void);
138extern void rcu_nmi_enter(void); 144extern void rcu_nmi_enter(void);
@@ -142,7 +148,7 @@ extern void rcu_nmi_exit(void);
142# define rcu_irq_exit() do { } while (0) 148# define rcu_irq_exit() do { } while (0)
143# define rcu_nmi_enter() do { } while (0) 149# define rcu_nmi_enter() do { } while (0)
144# define rcu_nmi_exit() do { } while (0) 150# define rcu_nmi_exit() do { } while (0)
145#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */ 151#endif /* #if defined(CONFIG_NO_HZ) */
146 152
147/* 153/*
148 * It is safe to do non-atomic ops on ->hardirq_context, 154 * It is safe to do non-atomic ops on ->hardirq_context,
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 7fc01b13be43..9e7f2e8fc66e 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -94,6 +94,16 @@ extern struct group_info init_groups;
94# define CAP_INIT_BSET CAP_INIT_EFF_SET 94# define CAP_INIT_BSET CAP_INIT_EFF_SET
95#endif 95#endif
96 96
97#ifdef CONFIG_TREE_PREEMPT_RCU
98#define INIT_TASK_RCU_PREEMPT(tsk) \
99 .rcu_read_lock_nesting = 0, \
100 .rcu_read_unlock_special = 0, \
101 .rcu_blocked_node = NULL, \
102 .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),
103#else
104#define INIT_TASK_RCU_PREEMPT(tsk)
105#endif
106
97extern struct cred init_cred; 107extern struct cred init_cred;
98 108
99#ifdef CONFIG_PERF_COUNTERS 109#ifdef CONFIG_PERF_COUNTERS
@@ -173,6 +183,7 @@ extern struct cred init_cred;
173 INIT_LOCKDEP \ 183 INIT_LOCKDEP \
174 INIT_FTRACE_GRAPH \ 184 INIT_FTRACE_GRAPH \
175 INIT_TRACE_RECURSION \ 185 INIT_TRACE_RECURSION \
186 INIT_TASK_RCU_PREEMPT(tsk) \
176} 187}
177 188
178 189
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 35e7df1e9f30..1ac57e522a1f 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -50,6 +50,9 @@
50 * IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is 50 * IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is
51 * registered first in an shared interrupt is considered for 51 * registered first in an shared interrupt is considered for
52 * performance reasons) 52 * performance reasons)
53 * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished.
54 * Used by threaded interrupts which need to keep the
55 * irq line disabled until the threaded handler has been run.
53 */ 56 */
54#define IRQF_DISABLED 0x00000020 57#define IRQF_DISABLED 0x00000020
55#define IRQF_SAMPLE_RANDOM 0x00000040 58#define IRQF_SAMPLE_RANDOM 0x00000040
@@ -59,6 +62,7 @@
59#define IRQF_PERCPU 0x00000400 62#define IRQF_PERCPU 0x00000400
60#define IRQF_NOBALANCING 0x00000800 63#define IRQF_NOBALANCING 0x00000800
61#define IRQF_IRQPOLL 0x00001000 64#define IRQF_IRQPOLL 0x00001000
65#define IRQF_ONESHOT 0x00002000
62 66
63/* 67/*
64 * Bits used by threaded handlers: 68 * Bits used by threaded handlers:
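
A minimal sketch of the registration pattern the new IRQF_ONESHOT flag is meant for: the irq line stays masked from the hard handler until the threaded handler returns. foo_quick_check(), foo_irq_thread() and the "foo" name are placeholders:

#include <linux/interrupt.h>

/* Hard handler: runs in irq context, only kicks the thread. */
static irqreturn_t foo_quick_check(int irq, void *dev_id)
{
	return IRQ_WAKE_THREAD;
}

/* Threaded handler: may sleep, e.g. to talk to an i2c device; the line
 * is unmasked only after this returns because of IRQF_ONESHOT. */
static irqreturn_t foo_irq_thread(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int foo_request(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, foo_quick_check, foo_irq_thread,
				    IRQF_ONESHOT, "foo", dev);
}
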
diff --git a/include/linux/irq.h b/include/linux/irq.h
index cb2e77a3f7f7..ae9653dbcd78 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -69,6 +69,8 @@ typedef void (*irq_flow_handler_t)(unsigned int irq,
69#define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ 69#define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */
70#define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ 70#define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/
71#define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */ 71#define IRQ_SUSPENDED 0x04000000 /* IRQ has gone through suspend sequence */
72#define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */
73#define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */
72 74
73#ifdef CONFIG_IRQ_PER_CPU 75#ifdef CONFIG_IRQ_PER_CPU
74# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) 76# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
@@ -100,6 +102,9 @@ struct msi_desc;
100 * @set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ 102 * @set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ
101 * @set_wake: enable/disable power-management wake-on of an IRQ 103 * @set_wake: enable/disable power-management wake-on of an IRQ
102 * 104 *
105 * @bus_lock: function to lock access to slow bus (i2c) chips
106 * @bus_sync_unlock: function to sync and unlock slow bus (i2c) chips
107 *
103 * @release: release function solely used by UML 108 * @release: release function solely used by UML
104 * @typename: obsoleted by name, kept as migration helper 109 * @typename: obsoleted by name, kept as migration helper
105 */ 110 */
@@ -123,6 +128,9 @@ struct irq_chip {
123 int (*set_type)(unsigned int irq, unsigned int flow_type); 128 int (*set_type)(unsigned int irq, unsigned int flow_type);
124 int (*set_wake)(unsigned int irq, unsigned int on); 129 int (*set_wake)(unsigned int irq, unsigned int on);
125 130
131 void (*bus_lock)(unsigned int irq);
132 void (*bus_sync_unlock)(unsigned int irq);
133
126 /* Currently used only by UML, might disappear one day.*/ 134 /* Currently used only by UML, might disappear one day.*/
127#ifdef CONFIG_IRQ_RELEASE_METHOD 135#ifdef CONFIG_IRQ_RELEASE_METHOD
128 void (*release)(unsigned int irq, void *dev_id); 136 void (*release)(unsigned int irq, void *dev_id);
@@ -220,13 +228,6 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
220extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); 228extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
221 229
222/* 230/*
223 * Migration helpers for obsolete names, they will go away:
224 */
225#define hw_interrupt_type irq_chip
226#define no_irq_type no_irq_chip
227typedef struct irq_desc irq_desc_t;
228
229/*
230 * Pick up the arch-dependent methods: 231 * Pick up the arch-dependent methods:
231 */ 232 */
232#include <asm/hw_irq.h> 233#include <asm/hw_irq.h>
@@ -289,6 +290,7 @@ extern void handle_edge_irq(unsigned int irq, struct irq_desc *desc);
289extern void handle_simple_irq(unsigned int irq, struct irq_desc *desc); 290extern void handle_simple_irq(unsigned int irq, struct irq_desc *desc);
290extern void handle_percpu_irq(unsigned int irq, struct irq_desc *desc); 291extern void handle_percpu_irq(unsigned int irq, struct irq_desc *desc);
291extern void handle_bad_irq(unsigned int irq, struct irq_desc *desc); 292extern void handle_bad_irq(unsigned int irq, struct irq_desc *desc);
293extern void handle_nested_irq(unsigned int irq);
292 294
293/* 295/*
294 * Monolithic do_IRQ implementation. 296 * Monolithic do_IRQ implementation.
@@ -379,6 +381,8 @@ set_irq_chained_handler(unsigned int irq,
379 __set_irq_handler(irq, handle, 1, NULL); 381 __set_irq_handler(irq, handle, 1, NULL);
380} 382}
381 383
384extern void set_irq_nested_thread(unsigned int irq, int nest);
385
382extern void set_irq_noprobe(unsigned int irq); 386extern void set_irq_noprobe(unsigned int irq);
383extern void set_irq_probe(unsigned int irq); 387extern void set_irq_probe(unsigned int irq);
384 388
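The bus_lock/bus_sync_unlock hooks and the nested-thread helpers above are aimed at interrupt chips sitting behind slow buses. A rough sketch, with every "foo" name hypothetical, of how an i2c expander driver might wire them up:

	static DEFINE_MUTEX(foo_bus_mutex);

	static void foo_bus_lock(unsigned int irq)
	{
		/* Called outside the descriptor lock, so sleeping is allowed. */
		mutex_lock(&foo_bus_mutex);
	}

	static void foo_bus_sync_unlock(unsigned int irq)
	{
		/* Push cached mask/trigger state out over i2c, then drop the mutex. */
		mutex_unlock(&foo_bus_mutex);
	}

	static struct irq_chip foo_irq_chip = {
		.name			= "foo-expander",
		.bus_lock		= foo_bus_lock,
		.bus_sync_unlock	= foo_bus_sync_unlock,
	};

	/* Child interrupts are marked nested at setup time ... */
	set_irq_nested_thread(child_irq, 1);
	/* ... and invoked from the expander's own threaded handler: */
	handle_nested_irq(child_irq);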
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index ec87b212ff7d..7bf89bc8cbca 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -41,6 +41,12 @@ extern struct irq_desc *irq_to_desc(unsigned int irq);
41 ; \ 41 ; \
42 else 42 else
43 43
44#ifdef CONFIG_SMP
45#define irq_node(irq) (irq_to_desc(irq)->node)
46#else
47#define irq_node(irq) 0
48#endif
49
44#endif /* CONFIG_GENERIC_HARDIRQS */ 50#endif /* CONFIG_GENERIC_HARDIRQS */
45 51
46#define for_each_irq_nr(irq) \ 52#define for_each_irq_nr(irq) \
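The new irq_node() helper exposes the NUMA node of an IRQ's descriptor. A one-line sketch of the intended use, with struct foo_irq_data purely illustrative:

	struct foo_irq_data *d = kzalloc_node(sizeof(*d), GFP_KERNEL,
					      irq_node(irq));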
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d6320a3e8def..2b5b1e0899a8 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -125,7 +125,7 @@ extern int _cond_resched(void);
125#endif 125#endif
126 126
127#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 127#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
128 void __might_sleep(char *file, int line); 128 void __might_sleep(char *file, int line, int preempt_offset);
129/** 129/**
130 * might_sleep - annotation for functions that can sleep 130 * might_sleep - annotation for functions that can sleep
131 * 131 *
@@ -137,8 +137,9 @@ extern int _cond_resched(void);
137 * supposed to. 137 * supposed to.
138 */ 138 */
139# define might_sleep() \ 139# define might_sleep() \
140 do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) 140 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
141#else 141#else
142 static inline void __might_sleep(char *file, int line, int preempt_offset) { }
142# define might_sleep() do { might_resched(); } while (0) 143# define might_sleep() do { might_resched(); } while (0)
143#endif 144#endif
144 145
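__might_sleep() now takes an explicit preempt-count offset, with plain might_sleep() passing 0. A hedged sketch of what the extra argument allows: a core helper that legitimately runs with one spinlock held can tolerate exactly that much preempt count (the constant name below is illustrative, not defined by this patch):

	#define FOO_ONE_SPINLOCK_OFFSET 1	/* one held spinlock under CONFIG_PREEMPT */
	__might_sleep(__FILE__, __LINE__, FOO_ONE_SPINLOCK_OFFSET);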
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e5b6e33c6571..76319bf03e37 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -143,7 +143,6 @@ enum {
143 143
144 ATA_DFLAG_PIO = (1 << 12), /* device limited to PIO mode */ 144 ATA_DFLAG_PIO = (1 << 12), /* device limited to PIO mode */
145 ATA_DFLAG_NCQ_OFF = (1 << 13), /* device limited to non-NCQ mode */ 145 ATA_DFLAG_NCQ_OFF = (1 << 13), /* device limited to non-NCQ mode */
146 ATA_DFLAG_SPUNDOWN = (1 << 14), /* XXX: for spindown_compat */
147 ATA_DFLAG_SLEEPING = (1 << 15), /* device is sleeping */ 146 ATA_DFLAG_SLEEPING = (1 << 15), /* device is sleeping */
148 ATA_DFLAG_DUBIOUS_XFER = (1 << 16), /* data transfer not verified */ 147 ATA_DFLAG_DUBIOUS_XFER = (1 << 16), /* data transfer not verified */
149 ATA_DFLAG_NO_UNLOAD = (1 << 17), /* device doesn't support unload */ 148 ATA_DFLAG_NO_UNLOAD = (1 << 17), /* device doesn't support unload */
@@ -190,6 +189,7 @@ enum {
190 ATA_FLAG_NO_POWEROFF_SPINDOWN = (1 << 11), /* don't spindown before poweroff */ 189 ATA_FLAG_NO_POWEROFF_SPINDOWN = (1 << 11), /* don't spindown before poweroff */
191 ATA_FLAG_NO_HIBERNATE_SPINDOWN = (1 << 12), /* don't spindown before hibernation */ 190 ATA_FLAG_NO_HIBERNATE_SPINDOWN = (1 << 12), /* don't spindown before hibernation */
192 ATA_FLAG_DEBUGMSG = (1 << 13), 191 ATA_FLAG_DEBUGMSG = (1 << 13),
192 ATA_FLAG_FPDMA_AA = (1 << 14), /* driver supports Auto-Activate */
193 ATA_FLAG_IGN_SIMPLEX = (1 << 15), /* ignore SIMPLEX */ 193 ATA_FLAG_IGN_SIMPLEX = (1 << 15), /* ignore SIMPLEX */
194 ATA_FLAG_NO_IORDY = (1 << 16), /* controller lacks iordy */ 194 ATA_FLAG_NO_IORDY = (1 << 16), /* controller lacks iordy */
195 ATA_FLAG_ACPI_SATA = (1 << 17), /* need native SATA ACPI layout */ 195 ATA_FLAG_ACPI_SATA = (1 << 17), /* need native SATA ACPI layout */
@@ -386,6 +386,7 @@ enum {
386 ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firmware update warning */ 386 ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firmware update warning */
387 ATA_HORKAGE_1_5_GBPS = (1 << 13), /* force 1.5 Gbps */ 387 ATA_HORKAGE_1_5_GBPS = (1 << 13), /* force 1.5 Gbps */
388 ATA_HORKAGE_NOSETXFER = (1 << 14), /* skip SETXFER, SATA only */ 388 ATA_HORKAGE_NOSETXFER = (1 << 14), /* skip SETXFER, SATA only */
389 ATA_HORKAGE_BROKEN_FPDMA_AA = (1 << 15), /* skip AA */
389 390
390 /* DMA mask for user DMA control: User visible values; DO NOT 391 /* DMA mask for user DMA control: User visible values; DO NOT
391 renumber */ 392 renumber */
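The FPDMA Auto-Activate additions pair a capability flag on the port with a blacklist bit on the device. A sketch of the check a driver or the core might perform before enabling AA (the field names follow existing libata conventions, the logic is illustrative):

	bool want_aa = (ap->flags & ATA_FLAG_FPDMA_AA) &&
		       !(dev->horkage & ATA_HORKAGE_BROKEN_FPDMA_AA);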
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index b25d1b53df0d..9ccf0e286b2a 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -149,6 +149,12 @@ struct lock_list {
149 struct lock_class *class; 149 struct lock_class *class;
150 struct stack_trace trace; 150 struct stack_trace trace;
151 int distance; 151 int distance;
152
153 /*
154 * The parent field is used to implement breadth-first search, and the
155 * bit 0 is reused to indicate if the lock has been accessed in BFS.
156 */
157 struct lock_list *parent;
152}; 158};
153 159
154/* 160/*
@@ -208,10 +214,12 @@ struct held_lock {
208 * interrupt context: 214 * interrupt context:
209 */ 215 */
210 unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */ 216 unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
211 unsigned int trylock:1; 217 unsigned int trylock:1; /* 16 bits */
218
212 unsigned int read:2; /* see lock_acquire() comment */ 219 unsigned int read:2; /* see lock_acquire() comment */
213 unsigned int check:2; /* see lock_acquire() comment */ 220 unsigned int check:2; /* see lock_acquire() comment */
214 unsigned int hardirqs_off:1; 221 unsigned int hardirqs_off:1;
222 unsigned int references:11; /* 32 bits */
215}; 223};
216 224
217/* 225/*
@@ -291,6 +299,10 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
291extern void lock_release(struct lockdep_map *lock, int nested, 299extern void lock_release(struct lockdep_map *lock, int nested,
292 unsigned long ip); 300 unsigned long ip);
293 301
302#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map)
303
304extern int lock_is_held(struct lockdep_map *lock);
305
294extern void lock_set_class(struct lockdep_map *lock, const char *name, 306extern void lock_set_class(struct lockdep_map *lock, const char *name,
295 struct lock_class_key *key, unsigned int subclass, 307 struct lock_class_key *key, unsigned int subclass,
296 unsigned long ip); 308 unsigned long ip);
@@ -309,6 +321,8 @@ extern void lockdep_trace_alloc(gfp_t mask);
309 321
310#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) 322#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0)
311 323
324#define lockdep_assert_held(l) WARN_ON(debug_locks && !lockdep_is_held(l))
325
312#else /* !LOCKDEP */ 326#else /* !LOCKDEP */
313 327
314static inline void lockdep_off(void) 328static inline void lockdep_off(void)
@@ -353,6 +367,8 @@ struct lock_class_key { };
353 367
354#define lockdep_depth(tsk) (0) 368#define lockdep_depth(tsk) (0)
355 369
370#define lockdep_assert_held(l) do { } while (0)
371
356#endif /* !LOCKDEP */ 372#endif /* !LOCKDEP */
357 373
358#ifdef CONFIG_LOCK_STAT 374#ifdef CONFIG_LOCK_STAT
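lockdep_is_held() and lockdep_assert_held() let code document and verify its locking rules at runtime when lockdep is enabled. A small sketch, with struct foo and its spinlock hypothetical:

	static void foo_update(struct foo *f)
	{
		lockdep_assert_held(&f->lock);	/* caller must hold f->lock */
		f->counter++;
	}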
diff --git a/include/linux/module.h b/include/linux/module.h
index 098bdb7bfacf..f8f92d015efe 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -17,10 +17,12 @@
17#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
18#include <linux/marker.h> 18#include <linux/marker.h>
19#include <linux/tracepoint.h> 19#include <linux/tracepoint.h>
20#include <asm/local.h>
21 20
21#include <asm/local.h>
22#include <asm/module.h> 22#include <asm/module.h>
23 23
24#include <trace/events/module.h>
25
24/* Not Yet Implemented */ 26/* Not Yet Implemented */
25#define MODULE_SUPPORTED_DEVICE(name) 27#define MODULE_SUPPORTED_DEVICE(name)
26 28
@@ -462,7 +464,10 @@ static inline local_t *__module_ref_addr(struct module *mod, int cpu)
462static inline void __module_get(struct module *module) 464static inline void __module_get(struct module *module)
463{ 465{
464 if (module) { 466 if (module) {
465 local_inc(__module_ref_addr(module, get_cpu())); 467 unsigned int cpu = get_cpu();
468 local_inc(__module_ref_addr(module, cpu));
469 trace_module_get(module, _THIS_IP_,
470 local_read(__module_ref_addr(module, cpu)));
466 put_cpu(); 471 put_cpu();
467 } 472 }
468} 473}
@@ -473,8 +478,11 @@ static inline int try_module_get(struct module *module)
473 478
474 if (module) { 479 if (module) {
475 unsigned int cpu = get_cpu(); 480 unsigned int cpu = get_cpu();
476 if (likely(module_is_live(module))) 481 if (likely(module_is_live(module))) {
477 local_inc(__module_ref_addr(module, cpu)); 482 local_inc(__module_ref_addr(module, cpu));
483 trace_module_get(module, _THIS_IP_,
484 local_read(__module_ref_addr(module, cpu)));
485 }
478 else 486 else
479 ret = 0; 487 ret = 0;
480 put_cpu(); 488 put_cpu();
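The module reference helpers now emit a module_get trace event on every successful increment; the calling convention itself is unchanged. The usual pattern still looks like this sketch, with owner being some struct module pointer:

	if (!try_module_get(owner))
		return -ENODEV;
	/* ... use the module-provided facilities ... */
	module_put(owner);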
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 29af2d5df097..b752e807adde 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -28,8 +28,23 @@ static inline void acpi_nmi_disable(void) { }
28static inline void acpi_nmi_enable(void) { } 28static inline void acpi_nmi_enable(void) { }
29#endif 29#endif
30 30
31#ifndef trigger_all_cpu_backtrace 31/*
32#define trigger_all_cpu_backtrace() do { } while (0) 32 * Create trigger_all_cpu_backtrace() out of the arch-provided
33 * base function. Return whether such support was available,
34 * to allow calling code to fall back to some other mechanism:
35 */
36#ifdef arch_trigger_all_cpu_backtrace
37static inline bool trigger_all_cpu_backtrace(void)
38{
39 arch_trigger_all_cpu_backtrace();
40
41 return true;
42}
43#else
44static inline bool trigger_all_cpu_backtrace(void)
45{
46 return false;
47}
33#endif 48#endif
34 49
35#endif 50#endif
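trigger_all_cpu_backtrace() now reports whether the architecture actually provides NMI backtraces, so callers can fall back to something weaker. A sketch of the intended calling pattern:

	if (!trigger_all_cpu_backtrace())
		dump_stack();	/* at least show the current CPU's stack */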
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
index 1d9518bc4c58..5171639ecf0f 100644
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -67,6 +67,9 @@ struct oprofile_operations {
67 67
68 /* Initiate a stack backtrace. Optional. */ 68 /* Initiate a stack backtrace. Optional. */
69 void (*backtrace)(struct pt_regs * const regs, unsigned int depth); 69 void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
70
71 /* Multiplex between different events. Optional. */
72 int (*switch_events)(void);
70 /* CPU identification string. */ 73 /* CPU identification string. */
71 char * cpu_type; 74 char * cpu_type;
72}; 75};
@@ -171,7 +174,6 @@ struct op_sample;
171struct op_entry { 174struct op_entry {
172 struct ring_buffer_event *event; 175 struct ring_buffer_event *event;
173 struct op_sample *sample; 176 struct op_sample *sample;
174 unsigned long irq_flags;
175 unsigned long size; 177 unsigned long size;
176 unsigned long *data; 178 unsigned long *data;
177}; 179};
@@ -180,6 +182,7 @@ void oprofile_write_reserve(struct op_entry *entry,
180 struct pt_regs * const regs, 182 struct pt_regs * const regs,
181 unsigned long pc, int code, int size); 183 unsigned long pc, int code, int size);
182int oprofile_add_data(struct op_entry *entry, unsigned long val); 184int oprofile_add_data(struct op_entry *entry, unsigned long val);
185int oprofile_add_data64(struct op_entry *entry, u64 val);
183int oprofile_write_commit(struct op_entry *entry); 186int oprofile_write_commit(struct op_entry *entry);
184 187
185#endif /* OPROFILE_H */ 188#endif /* OPROFILE_H */
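oprofile_add_data64() saves callers from splitting 64-bit sample values by hand. A rough sketch of the write path, with the size argument and msr_value purely illustrative:

	oprofile_write_reserve(&entry, regs, pc, code, 2);
	oprofile_add_data64(&entry, msr_value);	/* previously two oprofile_add_data() calls */
	oprofile_write_commit(&entry);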
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index aec3252afcf5..ed5d7501e181 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -132,7 +132,7 @@ static inline int page_cache_get_speculative(struct page *page)
132{ 132{
133 VM_BUG_ON(in_interrupt()); 133 VM_BUG_ON(in_interrupt());
134 134
135#if !defined(CONFIG_SMP) && defined(CONFIG_CLASSIC_RCU) 135#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
136# ifdef CONFIG_PREEMPT 136# ifdef CONFIG_PREEMPT
137 VM_BUG_ON(!in_atomic()); 137 VM_BUG_ON(!in_atomic());
138# endif 138# endif
@@ -170,7 +170,7 @@ static inline int page_cache_add_speculative(struct page *page, int count)
170{ 170{
171 VM_BUG_ON(in_interrupt()); 171 VM_BUG_ON(in_interrupt());
172 172
173#if !defined(CONFIG_SMP) && defined(CONFIG_CLASSIC_RCU) 173#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
174# ifdef CONFIG_PREEMPT 174# ifdef CONFIG_PREEMPT
175 VM_BUG_ON(!in_atomic()); 175 VM_BUG_ON(!in_atomic());
176# endif 176# endif
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 73b46b6b904f..c8fdcadce437 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -376,6 +376,9 @@
376#define PCI_DEVICE_ID_ATI_IXP600_IDE 0x438c 376#define PCI_DEVICE_ID_ATI_IXP600_IDE 0x438c
377#define PCI_DEVICE_ID_ATI_IXP700_SATA 0x4390 377#define PCI_DEVICE_ID_ATI_IXP700_SATA 0x4390
378#define PCI_DEVICE_ID_ATI_IXP700_IDE 0x439c 378#define PCI_DEVICE_ID_ATI_IXP700_IDE 0x439c
379/* AMD SB Chipset */
380#define PCI_DEVICE_ID_AMD_SB900_IDE 0x780c
381#define PCI_DEVICE_ID_AMD_SB900_SATA_IDE 0x7800
379 382
380#define PCI_VENDOR_ID_VLSI 0x1004 383#define PCI_VENDOR_ID_VLSI 0x1004
381#define PCI_DEVICE_ID_VLSI_82C592 0x0005 384#define PCI_DEVICE_ID_VLSI_82C592 0x0005
@@ -537,6 +540,7 @@
537#define PCI_DEVICE_ID_AMD_8131_BRIDGE 0x7450 540#define PCI_DEVICE_ID_AMD_8131_BRIDGE 0x7450
538#define PCI_DEVICE_ID_AMD_8131_APIC 0x7451 541#define PCI_DEVICE_ID_AMD_8131_APIC 0x7451
539#define PCI_DEVICE_ID_AMD_8132_BRIDGE 0x7458 542#define PCI_DEVICE_ID_AMD_8132_BRIDGE 0x7458
543#define PCI_DEVICE_ID_AMD_CS5535_IDE 0x208F
540#define PCI_DEVICE_ID_AMD_CS5536_ISA 0x2090 544#define PCI_DEVICE_ID_AMD_CS5536_ISA 0x2090
541#define PCI_DEVICE_ID_AMD_CS5536_FLASH 0x2091 545#define PCI_DEVICE_ID_AMD_CS5536_FLASH 0x2091
542#define PCI_DEVICE_ID_AMD_CS5536_AUDIO 0x2093 546#define PCI_DEVICE_ID_AMD_CS5536_AUDIO 0x2093
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index b53f7006cc4e..972f90d7a32f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -216,6 +216,7 @@ struct perf_counter_attr {
216#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) 216#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2)
217#define PERF_COUNTER_IOC_RESET _IO ('$', 3) 217#define PERF_COUNTER_IOC_RESET _IO ('$', 3)
218#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) 218#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64)
219#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5)
219 220
220enum perf_counter_ioc_flags { 221enum perf_counter_ioc_flags {
221 PERF_IOC_FLAG_GROUP = 1U << 0, 222 PERF_IOC_FLAG_GROUP = 1U << 0,
@@ -415,6 +416,9 @@ enum perf_callchain_context {
415 PERF_CONTEXT_MAX = (__u64)-4095, 416 PERF_CONTEXT_MAX = (__u64)-4095,
416}; 417};
417 418
419#define PERF_FLAG_FD_NO_GROUP (1U << 0)
420#define PERF_FLAG_FD_OUTPUT (1U << 1)
421
418#ifdef __KERNEL__ 422#ifdef __KERNEL__
419/* 423/*
420 * Kernel-internal data types and definitions: 424 * Kernel-internal data types and definitions:
@@ -536,6 +540,7 @@ struct perf_counter {
536 struct list_head sibling_list; 540 struct list_head sibling_list;
537 int nr_siblings; 541 int nr_siblings;
538 struct perf_counter *group_leader; 542 struct perf_counter *group_leader;
543 struct perf_counter *output;
539 const struct pmu *pmu; 544 const struct pmu *pmu;
540 545
541 enum perf_counter_active_state state; 546 enum perf_counter_active_state state;
@@ -761,6 +766,8 @@ extern int sysctl_perf_counter_mlock;
761extern int sysctl_perf_counter_sample_rate; 766extern int sysctl_perf_counter_sample_rate;
762 767
763extern void perf_counter_init(void); 768extern void perf_counter_init(void);
769extern void perf_tpcounter_event(int event_id, u64 addr, u64 count,
770 void *record, int entry_size);
764 771
765#ifndef perf_misc_flags 772#ifndef perf_misc_flags
766#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \ 773#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \
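PERF_COUNTER_IOC_SET_OUTPUT and PERF_FLAG_FD_OUTPUT let one counter's events be steered into another counter's mmap buffer. A userspace sketch, with the attrs, pid and cpu variables illustrative:

	int fd1 = syscall(__NR_perf_counter_open, &attr1, pid, cpu, -1, 0);
	int fd2 = syscall(__NR_perf_counter_open, &attr2, pid, cpu, -1, 0);

	/* fd2's samples now land in fd1's ring buffer. */
	ioctl(fd2, PERF_COUNTER_IOC_SET_OUTPUT, fd1);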
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
deleted file mode 100644
index bfd92e1e5d2c..000000000000
--- a/include/linux/rcuclassic.h
+++ /dev/null
@@ -1,178 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (classic version)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Author: Dipankar Sarma <dipankar@in.ibm.com>
21 *
22 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
23 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
24 * Papers:
25 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
26 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
27 *
28 * For detailed explanation of Read-Copy Update mechanism see -
29 * Documentation/RCU
30 *
31 */
32
33#ifndef __LINUX_RCUCLASSIC_H
34#define __LINUX_RCUCLASSIC_H
35
36#include <linux/cache.h>
37#include <linux/spinlock.h>
38#include <linux/threads.h>
39#include <linux/cpumask.h>
40#include <linux/seqlock.h>
41
42#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
43#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rcp->jiffies_stall */
44#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */
45#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
46
47/* Global control variables for rcupdate callback mechanism. */
48struct rcu_ctrlblk {
49 long cur; /* Current batch number. */
50 long completed; /* Number of the last completed batch */
51 long pending; /* Number of the last pending batch */
52#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
53 unsigned long gp_start; /* Time at which GP started in jiffies. */
54 unsigned long jiffies_stall;
55 /* Time at which to check for CPU stalls. */
56#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
57
58 int signaled;
59
60 spinlock_t lock ____cacheline_internodealigned_in_smp;
61 DECLARE_BITMAP(cpumask, NR_CPUS); /* CPUs that need to switch for */
62 /* current batch to proceed. */
63} ____cacheline_internodealigned_in_smp;
64
65/* Is batch a before batch b ? */
66static inline int rcu_batch_before(long a, long b)
67{
68 return (a - b) < 0;
69}
70
71/* Is batch a after batch b ? */
72static inline int rcu_batch_after(long a, long b)
73{
74 return (a - b) > 0;
75}
76
77/* Per-CPU data for Read-Copy UPdate. */
78struct rcu_data {
79 /* 1) quiescent state handling : */
80 long quiescbatch; /* Batch # for grace period */
81 int passed_quiesc; /* User-mode/idle loop etc. */
82 int qs_pending; /* core waits for quiesc state */
83
84 /* 2) batch handling */
85 /*
86 * if nxtlist is not NULL, then:
87 * batch:
88 * The batch # for the last entry of nxtlist
89 * [*nxttail[1], NULL = *nxttail[2]):
90 * Entries that batch # <= batch
91 * [*nxttail[0], *nxttail[1]):
92 * Entries that batch # <= batch - 1
93 * [nxtlist, *nxttail[0]):
94 * Entries that batch # <= batch - 2
95 * The grace period for these entries has completed, and
96 * the other grace-period-completed entries may be moved
97 * here temporarily in rcu_process_callbacks().
98 */
99 long batch;
100 struct rcu_head *nxtlist;
101 struct rcu_head **nxttail[3];
102 long qlen; /* # of queued callbacks */
103 struct rcu_head *donelist;
104 struct rcu_head **donetail;
105 long blimit; /* Upper limit on a processed batch */
106 int cpu;
107 struct rcu_head barrier;
108};
109
110/*
111 * Increment the quiescent state counter.
112 * The counter is a bit degenerated: We do not need to know
113 * how many quiescent states passed, just if there was at least
114 * one since the start of the grace period. Thus just a flag.
115 */
116extern void rcu_qsctr_inc(int cpu);
117extern void rcu_bh_qsctr_inc(int cpu);
118
119extern int rcu_pending(int cpu);
120extern int rcu_needs_cpu(int cpu);
121
122#ifdef CONFIG_DEBUG_LOCK_ALLOC
123extern struct lockdep_map rcu_lock_map;
124# define rcu_read_acquire() \
125 lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
126# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_)
127#else
128# define rcu_read_acquire() do { } while (0)
129# define rcu_read_release() do { } while (0)
130#endif
131
132#define __rcu_read_lock() \
133 do { \
134 preempt_disable(); \
135 __acquire(RCU); \
136 rcu_read_acquire(); \
137 } while (0)
138#define __rcu_read_unlock() \
139 do { \
140 rcu_read_release(); \
141 __release(RCU); \
142 preempt_enable(); \
143 } while (0)
144#define __rcu_read_lock_bh() \
145 do { \
146 local_bh_disable(); \
147 __acquire(RCU_BH); \
148 rcu_read_acquire(); \
149 } while (0)
150#define __rcu_read_unlock_bh() \
151 do { \
152 rcu_read_release(); \
153 __release(RCU_BH); \
154 local_bh_enable(); \
155 } while (0)
156
157#define __synchronize_sched() synchronize_rcu()
158
159#define call_rcu_sched(head, func) call_rcu(head, func)
160
161extern void __rcu_init(void);
162#define rcu_init_sched() do { } while (0)
163extern void rcu_check_callbacks(int cpu, int user);
164extern void rcu_restart_cpu(int cpu);
165
166extern long rcu_batches_completed(void);
167extern long rcu_batches_completed_bh(void);
168
169#define rcu_enter_nohz() do { } while (0)
170#define rcu_exit_nohz() do { } while (0)
171
172/* A context switch is a grace period for rcuclassic. */
173static inline int rcu_blocking_is_gp(void)
174{
175 return num_online_cpus() == 1;
176}
177
178#endif /* __LINUX_RCUCLASSIC_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 15fbb3ca634d..95e0615f4d75 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -51,18 +51,26 @@ struct rcu_head {
51 void (*func)(struct rcu_head *head); 51 void (*func)(struct rcu_head *head);
52}; 52};
53 53
54/* Internal to kernel, but needed by rcupreempt.h. */ 54/* Exported common interfaces */
55extern void synchronize_rcu(void);
56extern void synchronize_rcu_bh(void);
57extern void rcu_barrier(void);
58extern void rcu_barrier_bh(void);
59extern void rcu_barrier_sched(void);
60extern void synchronize_sched_expedited(void);
61extern int sched_expedited_torture_stats(char *page);
62
63/* Internal to kernel */
64extern void rcu_init(void);
65extern void rcu_scheduler_starting(void);
66extern int rcu_needs_cpu(int cpu);
55extern int rcu_scheduler_active; 67extern int rcu_scheduler_active;
56 68
57#if defined(CONFIG_CLASSIC_RCU) 69#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
58#include <linux/rcuclassic.h>
59#elif defined(CONFIG_TREE_RCU)
60#include <linux/rcutree.h> 70#include <linux/rcutree.h>
61#elif defined(CONFIG_PREEMPT_RCU)
62#include <linux/rcupreempt.h>
63#else 71#else
64#error "Unknown RCU implementation specified to kernel configuration" 72#error "Unknown RCU implementation specified to kernel configuration"
65#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */ 73#endif
66 74
67#define RCU_HEAD_INIT { .next = NULL, .func = NULL } 75#define RCU_HEAD_INIT { .next = NULL, .func = NULL }
68#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT 76#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
@@ -70,6 +78,16 @@ extern int rcu_scheduler_active;
70 (ptr)->next = NULL; (ptr)->func = NULL; \ 78 (ptr)->next = NULL; (ptr)->func = NULL; \
71} while (0) 79} while (0)
72 80
81#ifdef CONFIG_DEBUG_LOCK_ALLOC
82extern struct lockdep_map rcu_lock_map;
83# define rcu_read_acquire() \
84 lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
85# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_)
86#else
87# define rcu_read_acquire() do { } while (0)
88# define rcu_read_release() do { } while (0)
89#endif
90
73/** 91/**
74 * rcu_read_lock - mark the beginning of an RCU read-side critical section. 92 * rcu_read_lock - mark the beginning of an RCU read-side critical section.
75 * 93 *
@@ -99,7 +117,12 @@ extern int rcu_scheduler_active;
99 * 117 *
100 * It is illegal to block while in an RCU read-side critical section. 118 * It is illegal to block while in an RCU read-side critical section.
101 */ 119 */
102#define rcu_read_lock() __rcu_read_lock() 120static inline void rcu_read_lock(void)
121{
122 __rcu_read_lock();
123 __acquire(RCU);
124 rcu_read_acquire();
125}
103 126
104/** 127/**
105 * rcu_read_unlock - marks the end of an RCU read-side critical section. 128 * rcu_read_unlock - marks the end of an RCU read-side critical section.
@@ -116,7 +139,12 @@ extern int rcu_scheduler_active;
116 * used as well. RCU does not care how the writers keep out of each 139 * used as well. RCU does not care how the writers keep out of each
117 * others' way, as long as they do so. 140 * others' way, as long as they do so.
118 */ 141 */
119#define rcu_read_unlock() __rcu_read_unlock() 142static inline void rcu_read_unlock(void)
143{
144 rcu_read_release();
145 __release(RCU);
146 __rcu_read_unlock();
147}
120 148
121/** 149/**
122 * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section 150 * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section
@@ -129,14 +157,24 @@ extern int rcu_scheduler_active;
129 * can use just rcu_read_lock(). 157 * can use just rcu_read_lock().
130 * 158 *
131 */ 159 */
132#define rcu_read_lock_bh() __rcu_read_lock_bh() 160static inline void rcu_read_lock_bh(void)
161{
162 __rcu_read_lock_bh();
163 __acquire(RCU_BH);
164 rcu_read_acquire();
165}
133 166
134/* 167/*
135 * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section 168 * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section
136 * 169 *
137 * See rcu_read_lock_bh() for more information. 170 * See rcu_read_lock_bh() for more information.
138 */ 171 */
139#define rcu_read_unlock_bh() __rcu_read_unlock_bh() 172static inline void rcu_read_unlock_bh(void)
173{
174 rcu_read_release();
175 __release(RCU_BH);
176 __rcu_read_unlock_bh();
177}
140 178
141/** 179/**
142 * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section 180 * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section
@@ -147,17 +185,34 @@ extern int rcu_scheduler_active;
147 * - call_rcu_sched() and rcu_barrier_sched() 185 * - call_rcu_sched() and rcu_barrier_sched()
148 * on the write-side to insure proper synchronization. 186 * on the write-side to insure proper synchronization.
149 */ 187 */
150#define rcu_read_lock_sched() preempt_disable() 188static inline void rcu_read_lock_sched(void)
151#define rcu_read_lock_sched_notrace() preempt_disable_notrace() 189{
190 preempt_disable();
191 __acquire(RCU_SCHED);
192 rcu_read_acquire();
193}
194static inline notrace void rcu_read_lock_sched_notrace(void)
195{
196 preempt_disable_notrace();
197 __acquire(RCU_SCHED);
198}
152 199
153/* 200/*
154 * rcu_read_unlock_sched - marks the end of a RCU-classic critical section 201 * rcu_read_unlock_sched - marks the end of a RCU-classic critical section
155 * 202 *
156 * See rcu_read_lock_sched for more information. 203 * See rcu_read_lock_sched for more information.
157 */ 204 */
158#define rcu_read_unlock_sched() preempt_enable() 205static inline void rcu_read_unlock_sched(void)
159#define rcu_read_unlock_sched_notrace() preempt_enable_notrace() 206{
160 207 rcu_read_release();
208 __release(RCU_SCHED);
209 preempt_enable();
210}
211static inline notrace void rcu_read_unlock_sched_notrace(void)
212{
213 __release(RCU_SCHED);
214 preempt_enable_notrace();
215}
161 216
162 217
163/** 218/**
@@ -259,15 +314,4 @@ extern void call_rcu(struct rcu_head *head,
259extern void call_rcu_bh(struct rcu_head *head, 314extern void call_rcu_bh(struct rcu_head *head,
260 void (*func)(struct rcu_head *head)); 315 void (*func)(struct rcu_head *head));
261 316
262/* Exported common interfaces */
263extern void synchronize_rcu(void);
264extern void rcu_barrier(void);
265extern void rcu_barrier_bh(void);
266extern void rcu_barrier_sched(void);
267
268/* Internal to kernel */
269extern void rcu_init(void);
270extern void rcu_scheduler_starting(void);
271extern int rcu_needs_cpu(int cpu);
272
273#endif /* __LINUX_RCUPDATE_H */ 317#endif /* __LINUX_RCUPDATE_H */
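With rcu_read_lock() and friends turned into inline functions, readers are also annotated for sparse and lockdep when CONFIG_DEBUG_LOCK_ALLOC is set. The reader-side pattern itself is unchanged; a canonical sketch, where gp and do_something_with() are hypothetical:

	rcu_read_lock();
	p = rcu_dereference(gp);
	if (p)
		do_something_with(p);
	rcu_read_unlock();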
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
deleted file mode 100644
index fce522782ffa..000000000000
--- a/include/linux/rcupreempt.h
+++ /dev/null
@@ -1,127 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (RT implementation)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2006
19 *
20 * Author: Paul McKenney <paulmck@us.ibm.com>
21 *
22 * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
23 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
24 * Papers:
25 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
26 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
27 *
28 * For detailed explanation of Read-Copy Update mechanism see -
29 * Documentation/RCU
30 *
31 */
32
33#ifndef __LINUX_RCUPREEMPT_H
34#define __LINUX_RCUPREEMPT_H
35
36#include <linux/cache.h>
37#include <linux/spinlock.h>
38#include <linux/threads.h>
39#include <linux/smp.h>
40#include <linux/cpumask.h>
41#include <linux/seqlock.h>
42
43extern void rcu_qsctr_inc(int cpu);
44static inline void rcu_bh_qsctr_inc(int cpu) { }
45
46/*
47 * Someone might want to pass call_rcu_bh as a function pointer.
48 * So this needs to just be a rename and not a macro function.
49 * (no parentheses)
50 */
51#define call_rcu_bh call_rcu
52
53/**
54 * call_rcu_sched - Queue RCU callback for invocation after sched grace period.
55 * @head: structure to be used for queueing the RCU updates.
56 * @func: actual update function to be invoked after the grace period
57 *
58 * The update function will be invoked some time after a full
59 * synchronize_sched()-style grace period elapses, in other words after
60 * all currently executing preempt-disabled sections of code (including
61 * hardirq handlers, NMI handlers, and local_irq_save() blocks) have
62 * completed.
63 */
64extern void call_rcu_sched(struct rcu_head *head,
65 void (*func)(struct rcu_head *head));
66
67extern void __rcu_read_lock(void) __acquires(RCU);
68extern void __rcu_read_unlock(void) __releases(RCU);
69extern int rcu_pending(int cpu);
70extern int rcu_needs_cpu(int cpu);
71
72#define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); }
73#define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); }
74
75extern void __synchronize_sched(void);
76
77extern void __rcu_init(void);
78extern void rcu_init_sched(void);
79extern void rcu_check_callbacks(int cpu, int user);
80extern void rcu_restart_cpu(int cpu);
81extern long rcu_batches_completed(void);
82
83/*
84 * Return the number of RCU batches processed thus far. Useful for debug
85 * and statistic. The _bh variant is identifcal to straight RCU
86 */
87static inline long rcu_batches_completed_bh(void)
88{
89 return rcu_batches_completed();
90}
91
92#ifdef CONFIG_RCU_TRACE
93struct rcupreempt_trace;
94extern long *rcupreempt_flipctr(int cpu);
95extern long rcupreempt_data_completed(void);
96extern int rcupreempt_flip_flag(int cpu);
97extern int rcupreempt_mb_flag(int cpu);
98extern char *rcupreempt_try_flip_state_name(void);
99extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu);
100#endif
101
102struct softirq_action;
103
104#ifdef CONFIG_NO_HZ
105extern void rcu_enter_nohz(void);
106extern void rcu_exit_nohz(void);
107#else
108# define rcu_enter_nohz() do { } while (0)
109# define rcu_exit_nohz() do { } while (0)
110#endif
111
112/*
113 * A context switch is a grace period for rcupreempt synchronize_rcu()
114 * only during early boot, before the scheduler has been initialized.
115 * So, how the heck do we get a context switch? Well, if the caller
116 * invokes synchronize_rcu(), they are willing to accept a context
117 * switch, so we simply pretend that one happened.
118 *
119 * After boot, there might be a blocked or preempted task in an RCU
120 * read-side critical section, so we cannot then take the fastpath.
121 */
122static inline int rcu_blocking_is_gp(void)
123{
124 return num_online_cpus() == 1 && !rcu_scheduler_active;
125}
126
127#endif /* __LINUX_RCUPREEMPT_H */
diff --git a/include/linux/rcupreempt_trace.h b/include/linux/rcupreempt_trace.h
deleted file mode 100644
index b99ae073192a..000000000000
--- a/include/linux/rcupreempt_trace.h
+++ /dev/null
@@ -1,97 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (RT implementation)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2006
19 *
20 * Author: Paul McKenney <paulmck@us.ibm.com>
21 *
22 * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
23 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
24 * Papers:
25 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
26 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
27 *
28 * For detailed explanation of the Preemptible Read-Copy Update mechanism see -
29 * http://lwn.net/Articles/253651/
30 */
31
32#ifndef __LINUX_RCUPREEMPT_TRACE_H
33#define __LINUX_RCUPREEMPT_TRACE_H
34
35#include <linux/types.h>
36#include <linux/kernel.h>
37
38#include <asm/atomic.h>
39
40/*
41 * PREEMPT_RCU data structures.
42 */
43
44struct rcupreempt_trace {
45 long next_length;
46 long next_add;
47 long wait_length;
48 long wait_add;
49 long done_length;
50 long done_add;
51 long done_remove;
52 atomic_t done_invoked;
53 long rcu_check_callbacks;
54 atomic_t rcu_try_flip_1;
55 atomic_t rcu_try_flip_e1;
56 long rcu_try_flip_i1;
57 long rcu_try_flip_ie1;
58 long rcu_try_flip_g1;
59 long rcu_try_flip_a1;
60 long rcu_try_flip_ae1;
61 long rcu_try_flip_a2;
62 long rcu_try_flip_z1;
63 long rcu_try_flip_ze1;
64 long rcu_try_flip_z2;
65 long rcu_try_flip_m1;
66 long rcu_try_flip_me1;
67 long rcu_try_flip_m2;
68};
69
70#ifdef CONFIG_RCU_TRACE
71#define RCU_TRACE(fn, arg) fn(arg);
72#else
73#define RCU_TRACE(fn, arg)
74#endif
75
76extern void rcupreempt_trace_move2done(struct rcupreempt_trace *trace);
77extern void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace);
78extern void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace);
79extern void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace);
80extern void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace);
81extern void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace);
82extern void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace);
83extern void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace);
84extern void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace);
85extern void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace);
86extern void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace);
87extern void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace);
88extern void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace);
89extern void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace);
90extern void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace);
91extern void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace);
92extern void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace);
93extern void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace);
94extern void rcupreempt_trace_invoke(struct rcupreempt_trace *trace);
95extern void rcupreempt_trace_next_add(struct rcupreempt_trace *trace);
96
97#endif /* __LINUX_RCUPREEMPT_TRACE_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 5a5153806c42..a89307717825 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,264 +30,57 @@
30#ifndef __LINUX_RCUTREE_H 30#ifndef __LINUX_RCUTREE_H
31#define __LINUX_RCUTREE_H 31#define __LINUX_RCUTREE_H
32 32
33#include <linux/cache.h> 33extern void rcu_sched_qs(int cpu);
34#include <linux/spinlock.h> 34extern void rcu_bh_qs(int cpu);
35#include <linux/threads.h>
36#include <linux/cpumask.h>
37#include <linux/seqlock.h>
38 35
39/* 36extern int rcu_needs_cpu(int cpu);
40 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
41 * In theory, it should be possible to add more levels straightforwardly.
42 * In practice, this has not been tested, so there is probably some
43 * bug somewhere.
44 */
45#define MAX_RCU_LVLS 3
46#define RCU_FANOUT (CONFIG_RCU_FANOUT)
47#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
48#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
49
50#if NR_CPUS <= RCU_FANOUT
51# define NUM_RCU_LVLS 1
52# define NUM_RCU_LVL_0 1
53# define NUM_RCU_LVL_1 (NR_CPUS)
54# define NUM_RCU_LVL_2 0
55# define NUM_RCU_LVL_3 0
56#elif NR_CPUS <= RCU_FANOUT_SQ
57# define NUM_RCU_LVLS 2
58# define NUM_RCU_LVL_0 1
59# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
60# define NUM_RCU_LVL_2 (NR_CPUS)
61# define NUM_RCU_LVL_3 0
62#elif NR_CPUS <= RCU_FANOUT_CUBE
63# define NUM_RCU_LVLS 3
64# define NUM_RCU_LVL_0 1
65# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
66# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
67# define NUM_RCU_LVL_3 NR_CPUS
68#else
69# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
70#endif /* #if (NR_CPUS) <= RCU_FANOUT */
71
72#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
73#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
74
75/*
76 * Dynticks per-CPU state.
77 */
78struct rcu_dynticks {
79 int dynticks_nesting; /* Track nesting level, sort of. */
80 int dynticks; /* Even value for dynticks-idle, else odd. */
81 int dynticks_nmi; /* Even value for either dynticks-idle or */
82 /* not in nmi handler, else odd. So this */
83 /* remains even for nmi from irq handler. */
84};
85
86/*
87 * Definition for node within the RCU grace-period-detection hierarchy.
88 */
89struct rcu_node {
90 spinlock_t lock;
91 unsigned long qsmask; /* CPUs or groups that need to switch in */
92 /* order for current grace period to proceed.*/
93 unsigned long qsmaskinit;
94 /* Per-GP initialization for qsmask. */
95 unsigned long grpmask; /* Mask to apply to parent qsmask. */
96 int grplo; /* lowest-numbered CPU or group here. */
97 int grphi; /* highest-numbered CPU or group here. */
98 u8 grpnum; /* CPU/group number for next level up. */
99 u8 level; /* root is at level 0. */
100 struct rcu_node *parent;
101} ____cacheline_internodealigned_in_smp;
102
103/* Index values for nxttail array in struct rcu_data. */
104#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
105#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
106#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
107#define RCU_NEXT_TAIL 3
108#define RCU_NEXT_SIZE 4
109
110/* Per-CPU data for read-copy update. */
111struct rcu_data {
112 /* 1) quiescent-state and grace-period handling : */
113 long completed; /* Track rsp->completed gp number */
114 /* in order to detect GP end. */
115 long gpnum; /* Highest gp number that this CPU */
116 /* is aware of having started. */
117 long passed_quiesc_completed;
118 /* Value of completed at time of qs. */
119 bool passed_quiesc; /* User-mode/idle loop etc. */
120 bool qs_pending; /* Core waits for quiesc state. */
121 bool beenonline; /* CPU online at least once. */
122 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
123 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
124
125 /* 2) batch handling */
126 /*
127 * If nxtlist is not NULL, it is partitioned as follows.
128 * Any of the partitions might be empty, in which case the
129 * pointer to that partition will be equal to the pointer for
130 * the following partition. When the list is empty, all of
131 * the nxttail elements point to nxtlist, which is NULL.
132 *
133 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
134 * Entries that might have arrived after current GP ended
135 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
136 * Entries known to have arrived before current GP ended
137 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
138 * Entries that batch # <= ->completed - 1: waiting for current GP
139 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
140 * Entries that batch # <= ->completed
141 * The grace period for these entries has completed, and
142 * the other grace-period-completed entries may be moved
143 * here temporarily in rcu_process_callbacks().
144 */
145 struct rcu_head *nxtlist;
146 struct rcu_head **nxttail[RCU_NEXT_SIZE];
147 long qlen; /* # of queued callbacks */
148 long blimit; /* Upper limit on a processed batch */
149
150#ifdef CONFIG_NO_HZ
151 /* 3) dynticks interface. */
152 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
153 int dynticks_snap; /* Per-GP tracking for dynticks. */
154 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
155#endif /* #ifdef CONFIG_NO_HZ */
156
157 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
158#ifdef CONFIG_NO_HZ
159 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
160#endif /* #ifdef CONFIG_NO_HZ */
161 unsigned long offline_fqs; /* Kicked due to being offline. */
162 unsigned long resched_ipi; /* Sent a resched IPI. */
163
164 /* 5) __rcu_pending() statistics. */
165 long n_rcu_pending; /* rcu_pending() calls since boot. */
166 long n_rp_qs_pending;
167 long n_rp_cb_ready;
168 long n_rp_cpu_needs_gp;
169 long n_rp_gp_completed;
170 long n_rp_gp_started;
171 long n_rp_need_fqs;
172 long n_rp_need_nothing;
173
174 int cpu;
175};
176
177/* Values for signaled field in struct rcu_state. */
178#define RCU_GP_INIT 0 /* Grace period being initialized. */
179#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
180#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
181#ifdef CONFIG_NO_HZ
182#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
183#else /* #ifdef CONFIG_NO_HZ */
184#define RCU_SIGNAL_INIT RCU_FORCE_QS
185#endif /* #else #ifdef CONFIG_NO_HZ */
186
187#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
188#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
189#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
190#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
191#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
192 /* to take at least one */
193 /* scheduling clock irq */
194 /* before ratting on them. */
195
196#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
197
198/*
199 * RCU global state, including node hierarchy. This hierarchy is
200 * represented in "heap" form in a dense array. The root (first level)
201 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
202 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
203 * and the third level in ->node[m+1] and following (->node[m+1] referenced
204 * by ->level[2]). The number of levels is determined by the number of
205 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
206 * consisting of a single rcu_node.
207 */
208struct rcu_state {
209 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
210 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
211 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
212 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
213 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
214
215 /* The following fields are guarded by the root rcu_node's lock. */
216
217 u8 signaled ____cacheline_internodealigned_in_smp;
218 /* Force QS state. */
219 long gpnum; /* Current gp number. */
220 long completed; /* # of last completed gp. */
221 spinlock_t onofflock; /* exclude on/offline and */
222 /* starting new GP. */
223 spinlock_t fqslock; /* Only one task forcing */
224 /* quiescent states. */
225 unsigned long jiffies_force_qs; /* Time at which to invoke */
226 /* force_quiescent_state(). */
227 unsigned long n_force_qs; /* Number of calls to */
228 /* force_quiescent_state(). */
229 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
230 /* due to lock unavailable. */
231 unsigned long n_force_qs_ngp; /* Number of calls leaving */
232 /* due to no GP active. */
233#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
234 unsigned long gp_start; /* Time at which GP started, */
235 /* but in jiffies. */
236 unsigned long jiffies_stall; /* Time at which to check */
237 /* for CPU stalls. */
238#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
239#ifdef CONFIG_NO_HZ
240 long dynticks_completed; /* Value of completed @ snap. */
241#endif /* #ifdef CONFIG_NO_HZ */
242};
243 37
244extern void rcu_qsctr_inc(int cpu); 38#ifdef CONFIG_TREE_PREEMPT_RCU
245extern void rcu_bh_qsctr_inc(int cpu);
246 39
247extern int rcu_pending(int cpu); 40extern void __rcu_read_lock(void);
248extern int rcu_needs_cpu(int cpu); 41extern void __rcu_read_unlock(void);
42extern void exit_rcu(void);
249 43
250#ifdef CONFIG_DEBUG_LOCK_ALLOC 44#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
251extern struct lockdep_map rcu_lock_map;
252# define rcu_read_acquire() \
253 lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
254# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_)
255#else
256# define rcu_read_acquire() do { } while (0)
257# define rcu_read_release() do { } while (0)
258#endif
259 45
260static inline void __rcu_read_lock(void) 46static inline void __rcu_read_lock(void)
261{ 47{
262 preempt_disable(); 48 preempt_disable();
263 __acquire(RCU);
264 rcu_read_acquire();
265} 49}
50
266static inline void __rcu_read_unlock(void) 51static inline void __rcu_read_unlock(void)
267{ 52{
268 rcu_read_release();
269 __release(RCU);
270 preempt_enable(); 53 preempt_enable();
271} 54}
55
56static inline void exit_rcu(void)
57{
58}
59
60#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
61
272static inline void __rcu_read_lock_bh(void) 62static inline void __rcu_read_lock_bh(void)
273{ 63{
274 local_bh_disable(); 64 local_bh_disable();
275 __acquire(RCU_BH);
276 rcu_read_acquire();
277} 65}
278static inline void __rcu_read_unlock_bh(void) 66static inline void __rcu_read_unlock_bh(void)
279{ 67{
280 rcu_read_release();
281 __release(RCU_BH);
282 local_bh_enable(); 68 local_bh_enable();
283} 69}
284 70
285#define __synchronize_sched() synchronize_rcu() 71#define __synchronize_sched() synchronize_rcu()
286 72
287#define call_rcu_sched(head, func) call_rcu(head, func) 73extern void call_rcu_sched(struct rcu_head *head,
74 void (*func)(struct rcu_head *rcu));
288 75
289static inline void rcu_init_sched(void) 76static inline void synchronize_rcu_expedited(void)
290{ 77{
78 synchronize_sched_expedited();
79}
80
81static inline void synchronize_rcu_bh_expedited(void)
82{
83 synchronize_sched_expedited();
291} 84}
292 85
293extern void __rcu_init(void); 86extern void __rcu_init(void);
@@ -296,6 +89,11 @@ extern void rcu_restart_cpu(int cpu);
296 89
297extern long rcu_batches_completed(void); 90extern long rcu_batches_completed(void);
298extern long rcu_batches_completed_bh(void); 91extern long rcu_batches_completed_bh(void);
92extern long rcu_batches_completed_sched(void);
93
94static inline void rcu_init_sched(void)
95{
96}
299 97
300#ifdef CONFIG_NO_HZ 98#ifdef CONFIG_NO_HZ
301void rcu_enter_nohz(void); 99void rcu_enter_nohz(void);
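The slimmed-down rcutree.h also maps the new expedited primitives onto synchronize_sched_expedited(). A sketch of an update-side path that cannot tolerate normal grace-period latency (the list element p and its freeing are illustrative):

	list_del_rcu(&p->list);
	synchronize_rcu_expedited();	/* typically returns much sooner than synchronize_rcu() */
	kfree(p);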
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 29f8599e6bea..5fcc31ed5771 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -75,20 +75,6 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event)
75} 75}
76 76
77/* 77/*
78 * ring_buffer_event_discard can discard any event in the ring buffer.
79 * it is up to the caller to protect against a reader from
80 * consuming it or a writer from wrapping and replacing it.
81 *
82 * No external protection is needed if this is called before
83 * the event is commited. But in that case it would be better to
84 * use ring_buffer_discard_commit.
85 *
86 * Note, if an event that has not been committed is discarded
87 * with ring_buffer_event_discard, it must still be committed.
88 */
89void ring_buffer_event_discard(struct ring_buffer_event *event);
90
91/*
92 * ring_buffer_discard_commit will remove an event that has not 78 * ring_buffer_discard_commit will remove an event that has not
 93 * been committed yet. If this is used, then ring_buffer_unlock_commit 79 * been committed yet. If this is used, then ring_buffer_unlock_commit
94 * must not be called on the discarded event. This function 80 * must not be called on the discarded event. This function
@@ -154,8 +140,17 @@ unsigned long ring_buffer_size(struct ring_buffer *buffer);
154void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu); 140void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
155void ring_buffer_reset(struct ring_buffer *buffer); 141void ring_buffer_reset(struct ring_buffer *buffer);
156 142
143#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
157int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, 144int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
158 struct ring_buffer *buffer_b, int cpu); 145 struct ring_buffer *buffer_b, int cpu);
146#else
147static inline int
148ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
149 struct ring_buffer *buffer_b, int cpu)
150{
151 return -ENODEV;
152}
153#endif
159 154
160int ring_buffer_empty(struct ring_buffer *buffer); 155int ring_buffer_empty(struct ring_buffer *buffer);
161int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu); 156int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
@@ -170,7 +165,6 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
170unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); 165unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
171unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); 166unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
172unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); 167unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu);
173unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu);
174 168
175u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); 169u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
176void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, 170void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
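With ring_buffer_swap_cpu() stubbed out to -ENODEV when CONFIG_RING_BUFFER_ALLOW_SWAP is unset, callers need an error path. A sketch, buffer names illustrative:

	ret = ring_buffer_swap_cpu(snapshot_buffer, trace_buffer, cpu);
	if (ret == -ENODEV)
		return ret;	/* per-CPU snapshotting not built into this kernel */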
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9304027673b0..f3d74bd04d18 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -38,6 +38,8 @@
38#define SCHED_BATCH 3 38#define SCHED_BATCH 3
39/* SCHED_ISO: reserved but not implemented yet */ 39/* SCHED_ISO: reserved but not implemented yet */
40#define SCHED_IDLE 5 40#define SCHED_IDLE 5
41/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
42#define SCHED_RESET_ON_FORK 0x40000000
41 43
42#ifdef __KERNEL__ 44#ifdef __KERNEL__
43 45
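SCHED_RESET_ON_FORK is a userspace-visible modifier: it is ORed into the policy passed to sched_setscheduler() so that children revert to SCHED_NORMAL. A userspace sketch:

	struct sched_param sp = { .sched_priority = 10 };

	/* The current task runs FIFO, but its future children start as SCHED_NORMAL. */
	sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp);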
@@ -796,18 +798,19 @@ enum cpu_idle_type {
796#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE 798#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE
797 799
798#ifdef CONFIG_SMP 800#ifdef CONFIG_SMP
799#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ 801#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
800#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ 802#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
801#define SD_BALANCE_EXEC 4 /* Balance on exec */ 803#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
802#define SD_BALANCE_FORK 8 /* Balance on fork, clone */ 804#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
803#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ 805#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */
804#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ 806#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
805#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ 807#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */
806#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ 808#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
807#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ 809#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
808#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ 810#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
809#define SD_SERIALIZE 1024 /* Only a single load balancing instance */ 811#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
810#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ 812#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */
813#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
811 814
812enum powersavings_balance_level { 815enum powersavings_balance_level {
813 POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ 816 POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
@@ -827,7 +830,7 @@ static inline int sd_balance_for_mc_power(void)
827 if (sched_smt_power_savings) 830 if (sched_smt_power_savings)
828 return SD_POWERSAVINGS_BALANCE; 831 return SD_POWERSAVINGS_BALANCE;
829 832
830 return 0; 833 return SD_PREFER_SIBLING;
831} 834}
832 835
833static inline int sd_balance_for_package_power(void) 836static inline int sd_balance_for_package_power(void)
@@ -835,7 +838,7 @@ static inline int sd_balance_for_package_power(void)
835 if (sched_mc_power_savings | sched_smt_power_savings) 838 if (sched_mc_power_savings | sched_smt_power_savings)
836 return SD_POWERSAVINGS_BALANCE; 839 return SD_POWERSAVINGS_BALANCE;
837 840
838 return 0; 841 return SD_PREFER_SIBLING;
839} 842}
840 843
841/* 844/*
@@ -857,15 +860,9 @@ struct sched_group {
857 860
858 /* 861 /*
859 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 862 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
860 * single CPU. This is read only (except for setup, hotplug CPU). 863 * single CPU.
861 * Note : Never change cpu_power without recompute its reciprocal
862 */ 864 */
863 unsigned int __cpu_power; 865 unsigned int cpu_power;
864 /*
865 * reciprocal value of cpu_power to avoid expensive divides
866 * (see include/linux/reciprocal_div.h)
867 */
868 u32 reciprocal_cpu_power;
869 866
870 /* 867 /*
871 * The CPUs this group covers. 868 * The CPUs this group covers.
@@ -918,6 +915,7 @@ struct sched_domain {
918 unsigned int newidle_idx; 915 unsigned int newidle_idx;
919 unsigned int wake_idx; 916 unsigned int wake_idx;
920 unsigned int forkexec_idx; 917 unsigned int forkexec_idx;
918 unsigned int smt_gain;
921 int flags; /* See SD_* */ 919 int flags; /* See SD_* */
922 enum sched_domain_level level; 920 enum sched_domain_level level;
923 921
@@ -1045,7 +1043,6 @@ struct sched_class {
1045 struct rq *busiest, struct sched_domain *sd, 1043 struct rq *busiest, struct sched_domain *sd,
1046 enum cpu_idle_type idle); 1044 enum cpu_idle_type idle);
1047 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1045 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1048 int (*needs_post_schedule) (struct rq *this_rq);
1049 void (*post_schedule) (struct rq *this_rq); 1046 void (*post_schedule) (struct rq *this_rq);
1050 void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); 1047 void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
1051 1048
@@ -1110,6 +1107,8 @@ struct sched_entity {
1110 u64 wait_max; 1107 u64 wait_max;
1111 u64 wait_count; 1108 u64 wait_count;
1112 u64 wait_sum; 1109 u64 wait_sum;
1110 u64 iowait_count;
1111 u64 iowait_sum;
1113 1112
1114 u64 sleep_start; 1113 u64 sleep_start;
1115 u64 sleep_max; 1114 u64 sleep_max;
@@ -1163,6 +1162,8 @@ struct sched_rt_entity {
1163#endif 1162#endif
1164}; 1163};
1165 1164
1165struct rcu_node;
1166
1166struct task_struct { 1167struct task_struct {
1167 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1168 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
1168 void *stack; 1169 void *stack;
@@ -1206,10 +1207,12 @@ struct task_struct {
1206 unsigned int policy; 1207 unsigned int policy;
1207 cpumask_t cpus_allowed; 1208 cpumask_t cpus_allowed;
1208 1209
1209#ifdef CONFIG_PREEMPT_RCU 1210#ifdef CONFIG_TREE_PREEMPT_RCU
1210 int rcu_read_lock_nesting; 1211 int rcu_read_lock_nesting;
1211 int rcu_flipctr_idx; 1212 char rcu_read_unlock_special;
1212#endif /* #ifdef CONFIG_PREEMPT_RCU */ 1213 struct rcu_node *rcu_blocked_node;
1214 struct list_head rcu_node_entry;
1215#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1213 1216
1214#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1217#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1215 struct sched_info sched_info; 1218 struct sched_info sched_info;
@@ -1230,11 +1233,19 @@ struct task_struct {
1230 unsigned did_exec:1; 1233 unsigned did_exec:1;
1231 unsigned in_execve:1; /* Tell the LSMs that the process is doing an 1234 unsigned in_execve:1; /* Tell the LSMs that the process is doing an
1232 * execve */ 1235 * execve */
1236 unsigned in_iowait:1;
1237
1238
1239 /* Revert to default priority/policy when forking */
1240 unsigned sched_reset_on_fork:1;
1241
1233 pid_t pid; 1242 pid_t pid;
1234 pid_t tgid; 1243 pid_t tgid;
1235 1244
1245#ifdef CONFIG_CC_STACKPROTECTOR
1236 /* Canary value for the -fstack-protector gcc feature */ 1246 /* Canary value for the -fstack-protector gcc feature */
1237 unsigned long stack_canary; 1247 unsigned long stack_canary;
1248#endif
1238 1249
1239 /* 1250 /*
1240 * pointers to (original) parent process, youngest child, younger sibling, 1251 * pointers to (original) parent process, youngest child, younger sibling,
@@ -1725,6 +1736,28 @@ extern cputime_t task_gtime(struct task_struct *p);
1725#define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1736#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1726#define used_math() tsk_used_math(current) 1737#define used_math() tsk_used_math(current)
1727 1738
1739#ifdef CONFIG_TREE_PREEMPT_RCU
1740
1741#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1742#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
1743#define RCU_READ_UNLOCK_GOT_QS (1 << 2) /* CPU has responded to RCU core. */
1744
1745static inline void rcu_copy_process(struct task_struct *p)
1746{
1747 p->rcu_read_lock_nesting = 0;
1748 p->rcu_read_unlock_special = 0;
1749 p->rcu_blocked_node = NULL;
1750 INIT_LIST_HEAD(&p->rcu_node_entry);
1751}
1752
1753#else
1754
1755static inline void rcu_copy_process(struct task_struct *p)
1756{
1757}
1758
1759#endif
1760
1728#ifdef CONFIG_SMP 1761#ifdef CONFIG_SMP
1729extern int set_cpus_allowed_ptr(struct task_struct *p, 1762extern int set_cpus_allowed_ptr(struct task_struct *p,
1730 const struct cpumask *new_mask); 1763 const struct cpumask *new_mask);
@@ -1814,11 +1847,12 @@ extern unsigned int sysctl_sched_min_granularity;
1814extern unsigned int sysctl_sched_wakeup_granularity; 1847extern unsigned int sysctl_sched_wakeup_granularity;
1815extern unsigned int sysctl_sched_shares_ratelimit; 1848extern unsigned int sysctl_sched_shares_ratelimit;
1816extern unsigned int sysctl_sched_shares_thresh; 1849extern unsigned int sysctl_sched_shares_thresh;
1817#ifdef CONFIG_SCHED_DEBUG
1818extern unsigned int sysctl_sched_child_runs_first; 1850extern unsigned int sysctl_sched_child_runs_first;
1851#ifdef CONFIG_SCHED_DEBUG
1819extern unsigned int sysctl_sched_features; 1852extern unsigned int sysctl_sched_features;
1820extern unsigned int sysctl_sched_migration_cost; 1853extern unsigned int sysctl_sched_migration_cost;
1821extern unsigned int sysctl_sched_nr_migrate; 1854extern unsigned int sysctl_sched_nr_migrate;
1855extern unsigned int sysctl_sched_time_avg;
1822extern unsigned int sysctl_timer_migration; 1856extern unsigned int sysctl_timer_migration;
1823 1857
1824int sched_nr_latency_handler(struct ctl_table *table, int write, 1858int sched_nr_latency_handler(struct ctl_table *table, int write,
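
With sysctl_sched_child_runs_first moved out of the CONFIG_SCHED_DEBUG block, the knob should be reachable on every build. A quick userspace check, assuming the conventional procfs path for the kernel.sched_child_runs_first sysctl:

/*
 * Read the sched_child_runs_first sysctl through procfs.  The path is
 * an assumption based on the usual sysctl naming, not taken from this hunk.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/sched_child_runs_first", "r");
	int val;

	if (!f) {
		perror("sched_child_runs_first");
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("sched_child_runs_first = %d\n", val);
	fclose(f);
	return 0;
}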
@@ -2282,23 +2316,31 @@ static inline int need_resched(void)
2282 * cond_resched_softirq() will enable bhs before scheduling. 2316 * cond_resched_softirq() will enable bhs before scheduling.
2283 */ 2317 */
2284extern int _cond_resched(void); 2318extern int _cond_resched(void);
2285#ifdef CONFIG_PREEMPT_BKL 2319
2286static inline int cond_resched(void) 2320#define cond_resched() ({ \
2287{ 2321 __might_sleep(__FILE__, __LINE__, 0); \
2288 return 0; 2322 _cond_resched(); \
2289} 2323})
2324
2325extern int __cond_resched_lock(spinlock_t *lock);
2326
2327#ifdef CONFIG_PREEMPT
2328#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
2290#else 2329#else
2291static inline int cond_resched(void) 2330#define PREEMPT_LOCK_OFFSET 0
2292{
2293 return _cond_resched();
2294}
2295#endif 2331#endif
2296extern int cond_resched_lock(spinlock_t * lock); 2332
2297extern int cond_resched_softirq(void); 2333#define cond_resched_lock(lock) ({ \
2298static inline int cond_resched_bkl(void) 2334 __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
2299{ 2335 __cond_resched_lock(lock); \
2300 return _cond_resched(); 2336})
2301} 2337
2338extern int __cond_resched_softirq(void);
2339
2340#define cond_resched_softirq() ({ \
2341 __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \
2342 __cond_resched_softirq(); \
2343})
2302 2344
2303/* 2345/*
2304 * Does a critical section need to be broken due to another 2346 * Does a critical section need to be broken due to another
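
The cond_resched() family is reshaped here: each caller-facing name becomes a macro that runs the __might_sleep() debug check with the caller's file/line and a preemption offset, then calls a single out-of-line helper. A toy, userspace-only mock of that shape (GCC statement expressions; the names mirror the hunk but nothing below is kernel code):

/*
 * Toy mock: the wrapper macro captures the caller's __FILE__/__LINE__
 * for the debug check, then invokes the real helper.  Compile with gcc.
 */
#include <stdio.h>

static void __might_sleep(const char *file, int line, int preempt_offset)
{
	printf("might_sleep check at %s:%d (offset %d)\n",
	       file, line, preempt_offset);
}

static int _cond_resched(void)
{
	return 0;	/* pretend no reschedule was needed */
}

#define cond_resched() ({				\
	__might_sleep(__FILE__, __LINE__, 0);		\
	_cond_resched();				\
})

int main(void)
{
	cond_resched();
	return 0;
}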
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 4be57ab03478..f0ca7a7a1757 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -143,15 +143,6 @@ static inline void smp_mb__after_lock(void) { smp_mb(); }
143 */ 143 */
144#define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) 144#define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock)
145 145
146/*
147 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
148 */
149#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
150# include <linux/spinlock_api_smp.h>
151#else
152# include <linux/spinlock_api_up.h>
153#endif
154
155#ifdef CONFIG_DEBUG_SPINLOCK 146#ifdef CONFIG_DEBUG_SPINLOCK
156 extern void _raw_spin_lock(spinlock_t *lock); 147 extern void _raw_spin_lock(spinlock_t *lock);
157#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) 148#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
@@ -268,50 +259,16 @@ static inline void smp_mb__after_lock(void) { smp_mb(); }
268 259
269#define spin_lock_irq(lock) _spin_lock_irq(lock) 260#define spin_lock_irq(lock) _spin_lock_irq(lock)
270#define spin_lock_bh(lock) _spin_lock_bh(lock) 261#define spin_lock_bh(lock) _spin_lock_bh(lock)
271
272#define read_lock_irq(lock) _read_lock_irq(lock) 262#define read_lock_irq(lock) _read_lock_irq(lock)
273#define read_lock_bh(lock) _read_lock_bh(lock) 263#define read_lock_bh(lock) _read_lock_bh(lock)
274
275#define write_lock_irq(lock) _write_lock_irq(lock) 264#define write_lock_irq(lock) _write_lock_irq(lock)
276#define write_lock_bh(lock) _write_lock_bh(lock) 265#define write_lock_bh(lock) _write_lock_bh(lock)
277 266#define spin_unlock(lock) _spin_unlock(lock)
278/* 267#define read_unlock(lock) _read_unlock(lock)
279 * We inline the unlock functions in the nondebug case: 268#define write_unlock(lock) _write_unlock(lock)
280 */ 269#define spin_unlock_irq(lock) _spin_unlock_irq(lock)
281#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ 270#define read_unlock_irq(lock) _read_unlock_irq(lock)
282 !defined(CONFIG_SMP) 271#define write_unlock_irq(lock) _write_unlock_irq(lock)
283# define spin_unlock(lock) _spin_unlock(lock)
284# define read_unlock(lock) _read_unlock(lock)
285# define write_unlock(lock) _write_unlock(lock)
286# define spin_unlock_irq(lock) _spin_unlock_irq(lock)
287# define read_unlock_irq(lock) _read_unlock_irq(lock)
288# define write_unlock_irq(lock) _write_unlock_irq(lock)
289#else
290# define spin_unlock(lock) \
291 do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0)
292# define read_unlock(lock) \
293 do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0)
294# define write_unlock(lock) \
295 do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0)
296# define spin_unlock_irq(lock) \
297do { \
298 __raw_spin_unlock(&(lock)->raw_lock); \
299 __release(lock); \
300 local_irq_enable(); \
301} while (0)
302# define read_unlock_irq(lock) \
303do { \
304 __raw_read_unlock(&(lock)->raw_lock); \
305 __release(lock); \
306 local_irq_enable(); \
307} while (0)
308# define write_unlock_irq(lock) \
309do { \
310 __raw_write_unlock(&(lock)->raw_lock); \
311 __release(lock); \
312 local_irq_enable(); \
313} while (0)
314#endif
315 272
316#define spin_unlock_irqrestore(lock, flags) \ 273#define spin_unlock_irqrestore(lock, flags) \
317 do { \ 274 do { \
@@ -380,4 +337,13 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
380 */ 337 */
381#define spin_can_lock(lock) (!spin_is_locked(lock)) 338#define spin_can_lock(lock) (!spin_is_locked(lock))
382 339
340/*
341 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
342 */
343#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
344# include <linux/spinlock_api_smp.h>
345#else
346# include <linux/spinlock_api_up.h>
347#endif
348
383#endif /* __LINUX_SPINLOCK_H */ 349#endif /* __LINUX_SPINLOCK_H */
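
The API-header include moves from the top of spinlock.h to the very bottom, presumably so that the static inline lock bodies now living in spinlock_api_smp.h (next diff) can see the wrappers and annotations that spinlock.h defines further up. A single-file illustration of that ordering constraint, with made-up names:

/*
 * Why include order matters: a static inline in an included header can
 * only use macros that are already defined at the point of inclusion.
 * The names here are invented; this is not the kernel's code.
 */
#include <stdio.h>

#define ACQUIRE_HOOK(name)	printf("acquire %s\n", name)

/* Stand-in for a helper that would live in the late-included API header:
 * it compiles only because ACQUIRE_HOOK is visible above it. */
static inline void api_lock(const char *name)
{
	ACQUIRE_HOOK(name);
}

int main(void)
{
	api_lock("demo");
	return 0;
}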
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index d79845d034b5..7a7e18fc2415 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -60,4 +60,398 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
60void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 60void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
61 __releases(lock); 61 __releases(lock);
62 62
63/*
64 * We inline the unlock functions in the nondebug case:
65 */
66#if !defined(CONFIG_DEBUG_SPINLOCK) && !defined(CONFIG_PREEMPT)
67#define __always_inline__spin_unlock
68#define __always_inline__read_unlock
69#define __always_inline__write_unlock
70#define __always_inline__spin_unlock_irq
71#define __always_inline__read_unlock_irq
72#define __always_inline__write_unlock_irq
73#endif
74
75#ifndef CONFIG_DEBUG_SPINLOCK
76#ifndef CONFIG_GENERIC_LOCKBREAK
77
78#ifdef __always_inline__spin_lock
79#define _spin_lock(lock) __spin_lock(lock)
80#endif
81
82#ifdef __always_inline__read_lock
83#define _read_lock(lock) __read_lock(lock)
84#endif
85
86#ifdef __always_inline__write_lock
87#define _write_lock(lock) __write_lock(lock)
88#endif
89
90#ifdef __always_inline__spin_lock_bh
91#define _spin_lock_bh(lock) __spin_lock_bh(lock)
92#endif
93
94#ifdef __always_inline__read_lock_bh
95#define _read_lock_bh(lock) __read_lock_bh(lock)
96#endif
97
98#ifdef __always_inline__write_lock_bh
99#define _write_lock_bh(lock) __write_lock_bh(lock)
100#endif
101
102#ifdef __always_inline__spin_lock_irq
103#define _spin_lock_irq(lock) __spin_lock_irq(lock)
104#endif
105
106#ifdef __always_inline__read_lock_irq
107#define _read_lock_irq(lock) __read_lock_irq(lock)
108#endif
109
110#ifdef __always_inline__write_lock_irq
111#define _write_lock_irq(lock) __write_lock_irq(lock)
112#endif
113
114#ifdef __always_inline__spin_lock_irqsave
115#define _spin_lock_irqsave(lock) __spin_lock_irqsave(lock)
116#endif
117
118#ifdef __always_inline__read_lock_irqsave
119#define _read_lock_irqsave(lock) __read_lock_irqsave(lock)
120#endif
121
122#ifdef __always_inline__write_lock_irqsave
123#define _write_lock_irqsave(lock) __write_lock_irqsave(lock)
124#endif
125
126#endif /* !CONFIG_GENERIC_LOCKBREAK */
127
128#ifdef __always_inline__spin_trylock
129#define _spin_trylock(lock) __spin_trylock(lock)
130#endif
131
132#ifdef __always_inline__read_trylock
133#define _read_trylock(lock) __read_trylock(lock)
134#endif
135
136#ifdef __always_inline__write_trylock
137#define _write_trylock(lock) __write_trylock(lock)
138#endif
139
140#ifdef __always_inline__spin_trylock_bh
141#define _spin_trylock_bh(lock) __spin_trylock_bh(lock)
142#endif
143
144#ifdef __always_inline__spin_unlock
145#define _spin_unlock(lock) __spin_unlock(lock)
146#endif
147
148#ifdef __always_inline__read_unlock
149#define _read_unlock(lock) __read_unlock(lock)
150#endif
151
152#ifdef __always_inline__write_unlock
153#define _write_unlock(lock) __write_unlock(lock)
154#endif
155
156#ifdef __always_inline__spin_unlock_bh
157#define _spin_unlock_bh(lock) __spin_unlock_bh(lock)
158#endif
159
160#ifdef __always_inline__read_unlock_bh
161#define _read_unlock_bh(lock) __read_unlock_bh(lock)
162#endif
163
164#ifdef __always_inline__write_unlock_bh
165#define _write_unlock_bh(lock) __write_unlock_bh(lock)
166#endif
167
168#ifdef __always_inline__spin_unlock_irq
169#define _spin_unlock_irq(lock) __spin_unlock_irq(lock)
170#endif
171
172#ifdef __always_inline__read_unlock_irq
173#define _read_unlock_irq(lock) __read_unlock_irq(lock)
174#endif
175
176#ifdef __always_inline__write_unlock_irq
177#define _write_unlock_irq(lock) __write_unlock_irq(lock)
178#endif
179
180#ifdef __always_inline__spin_unlock_irqrestore
181#define _spin_unlock_irqrestore(lock, flags) __spin_unlock_irqrestore(lock, flags)
182#endif
183
184#ifdef __always_inline__read_unlock_irqrestore
185#define _read_unlock_irqrestore(lock, flags) __read_unlock_irqrestore(lock, flags)
186#endif
187
188#ifdef __always_inline__write_unlock_irqrestore
189#define _write_unlock_irqrestore(lock, flags) __write_unlock_irqrestore(lock, flags)
190#endif
191
192#endif /* CONFIG_DEBUG_SPINLOCK */
193
194static inline int __spin_trylock(spinlock_t *lock)
195{
196 preempt_disable();
197 if (_raw_spin_trylock(lock)) {
198 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
199 return 1;
200 }
201 preempt_enable();
202 return 0;
203}
204
205static inline int __read_trylock(rwlock_t *lock)
206{
207 preempt_disable();
208 if (_raw_read_trylock(lock)) {
209 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
210 return 1;
211 }
212 preempt_enable();
213 return 0;
214}
215
216static inline int __write_trylock(rwlock_t *lock)
217{
218 preempt_disable();
219 if (_raw_write_trylock(lock)) {
220 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
221 return 1;
222 }
223 preempt_enable();
224 return 0;
225}
226
227/*
228 * If lockdep is enabled then we use the non-preemption spin-ops
229 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
230 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
231 */
232#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
233
234static inline void __read_lock(rwlock_t *lock)
235{
236 preempt_disable();
237 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
238 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
239}
240
241static inline unsigned long __spin_lock_irqsave(spinlock_t *lock)
242{
243 unsigned long flags;
244
245 local_irq_save(flags);
246 preempt_disable();
247 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
248 /*
249 * On lockdep we dont want the hand-coded irq-enable of
250 * _raw_spin_lock_flags() code, because lockdep assumes
251 * that interrupts are not re-enabled during lock-acquire:
252 */
253#ifdef CONFIG_LOCKDEP
254 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
255#else
256 _raw_spin_lock_flags(lock, &flags);
257#endif
258 return flags;
259}
260
261static inline void __spin_lock_irq(spinlock_t *lock)
262{
263 local_irq_disable();
264 preempt_disable();
265 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
266 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
267}
268
269static inline void __spin_lock_bh(spinlock_t *lock)
270{
271 local_bh_disable();
272 preempt_disable();
273 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
274 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
275}
276
277static inline unsigned long __read_lock_irqsave(rwlock_t *lock)
278{
279 unsigned long flags;
280
281 local_irq_save(flags);
282 preempt_disable();
283 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
284 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
285 _raw_read_lock_flags, &flags);
286 return flags;
287}
288
289static inline void __read_lock_irq(rwlock_t *lock)
290{
291 local_irq_disable();
292 preempt_disable();
293 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
294 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
295}
296
297static inline void __read_lock_bh(rwlock_t *lock)
298{
299 local_bh_disable();
300 preempt_disable();
301 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
302 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
303}
304
305static inline unsigned long __write_lock_irqsave(rwlock_t *lock)
306{
307 unsigned long flags;
308
309 local_irq_save(flags);
310 preempt_disable();
311 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
312 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
313 _raw_write_lock_flags, &flags);
314 return flags;
315}
316
317static inline void __write_lock_irq(rwlock_t *lock)
318{
319 local_irq_disable();
320 preempt_disable();
321 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
322 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
323}
324
325static inline void __write_lock_bh(rwlock_t *lock)
326{
327 local_bh_disable();
328 preempt_disable();
329 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
330 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
331}
332
333static inline void __spin_lock(spinlock_t *lock)
334{
335 preempt_disable();
336 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
337 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
338}
339
340static inline void __write_lock(rwlock_t *lock)
341{
342 preempt_disable();
343 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
344 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
345}
346
347#endif /* CONFIG_PREEMPT */
348
349static inline void __spin_unlock(spinlock_t *lock)
350{
351 spin_release(&lock->dep_map, 1, _RET_IP_);
352 _raw_spin_unlock(lock);
353 preempt_enable();
354}
355
356static inline void __write_unlock(rwlock_t *lock)
357{
358 rwlock_release(&lock->dep_map, 1, _RET_IP_);
359 _raw_write_unlock(lock);
360 preempt_enable();
361}
362
363static inline void __read_unlock(rwlock_t *lock)
364{
365 rwlock_release(&lock->dep_map, 1, _RET_IP_);
366 _raw_read_unlock(lock);
367 preempt_enable();
368}
369
370static inline void __spin_unlock_irqrestore(spinlock_t *lock,
371 unsigned long flags)
372{
373 spin_release(&lock->dep_map, 1, _RET_IP_);
374 _raw_spin_unlock(lock);
375 local_irq_restore(flags);
376 preempt_enable();
377}
378
379static inline void __spin_unlock_irq(spinlock_t *lock)
380{
381 spin_release(&lock->dep_map, 1, _RET_IP_);
382 _raw_spin_unlock(lock);
383 local_irq_enable();
384 preempt_enable();
385}
386
387static inline void __spin_unlock_bh(spinlock_t *lock)
388{
389 spin_release(&lock->dep_map, 1, _RET_IP_);
390 _raw_spin_unlock(lock);
391 preempt_enable_no_resched();
392 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
393}
394
395static inline void __read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
396{
397 rwlock_release(&lock->dep_map, 1, _RET_IP_);
398 _raw_read_unlock(lock);
399 local_irq_restore(flags);
400 preempt_enable();
401}
402
403static inline void __read_unlock_irq(rwlock_t *lock)
404{
405 rwlock_release(&lock->dep_map, 1, _RET_IP_);
406 _raw_read_unlock(lock);
407 local_irq_enable();
408 preempt_enable();
409}
410
411static inline void __read_unlock_bh(rwlock_t *lock)
412{
413 rwlock_release(&lock->dep_map, 1, _RET_IP_);
414 _raw_read_unlock(lock);
415 preempt_enable_no_resched();
416 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
417}
418
419static inline void __write_unlock_irqrestore(rwlock_t *lock,
420 unsigned long flags)
421{
422 rwlock_release(&lock->dep_map, 1, _RET_IP_);
423 _raw_write_unlock(lock);
424 local_irq_restore(flags);
425 preempt_enable();
426}
427
428static inline void __write_unlock_irq(rwlock_t *lock)
429{
430 rwlock_release(&lock->dep_map, 1, _RET_IP_);
431 _raw_write_unlock(lock);
432 local_irq_enable();
433 preempt_enable();
434}
435
436static inline void __write_unlock_bh(rwlock_t *lock)
437{
438 rwlock_release(&lock->dep_map, 1, _RET_IP_);
439 _raw_write_unlock(lock);
440 preempt_enable_no_resched();
441 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
442}
443
444static inline int __spin_trylock_bh(spinlock_t *lock)
445{
446 local_bh_disable();
447 preempt_disable();
448 if (_raw_spin_trylock(lock)) {
449 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
450 return 1;
451 }
452 preempt_enable_no_resched();
453 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
454 return 0;
455}
456
63#endif /* __LINUX_SPINLOCK_API_SMP_H */ 457#endif /* __LINUX_SPINLOCK_API_SMP_H */
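
The new block above works as an opt-in: when a __always_inline__<op> symbol is defined, the _<op>() entry point is aliased to the static inline __<op>() body, otherwise the out-of-line copy in kernel/spinlock.c is used. A standalone mock of the pattern, with invented names:

/*
 * Standalone mock of the opt-in inlining pattern: defining
 * __always_inline__answer maps the _answer() entry point onto the
 * inline body; without it, an out-of-line definition elsewhere would
 * be used instead.
 */
#include <stdio.h>

static inline int __answer(void)
{
	return 42;
}

#define __always_inline__answer		/* "configuration" picks the inline body */

#ifdef __always_inline__answer
#define _answer()	__answer()
#else
extern int _answer(void);		/* out-of-line copy would live in a .c file */
#endif

int main(void)
{
	printf("answer = %d\n", _answer());
	return 0;
}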
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index cb1a6631b8f4..73b1f1cec423 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -14,7 +14,6 @@ struct scatterlist;
14 */ 14 */
15#define IO_TLB_SEGSIZE 128 15#define IO_TLB_SEGSIZE 128
16 16
17
18/* 17/*
19 * log of the size of each IO TLB slab. The number of slabs is command line 18 * log of the size of each IO TLB slab. The number of slabs is command line
20 * controllable. 19 * controllable.
@@ -24,16 +23,6 @@ struct scatterlist;
24extern void 23extern void
25swiotlb_init(void); 24swiotlb_init(void);
26 25
27extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs);
28extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
29
30extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev,
31 phys_addr_t address);
32extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev,
33 dma_addr_t address);
34
35extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size);
36
37extern void 26extern void
38*swiotlb_alloc_coherent(struct device *hwdev, size_t size, 27*swiotlb_alloc_coherent(struct device *hwdev, size_t size,
39 dma_addr_t *dma_handle, gfp_t flags); 28 dma_addr_t *dma_handle, gfp_t flags);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 80de7003d8c2..a8e37821cc60 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -64,6 +64,7 @@ struct perf_counter_attr;
64#include <linux/sem.h> 64#include <linux/sem.h>
65#include <asm/siginfo.h> 65#include <asm/siginfo.h>
66#include <asm/signal.h> 66#include <asm/signal.h>
67#include <linux/unistd.h>
67#include <linux/quota.h> 68#include <linux/quota.h>
68#include <linux/key.h> 69#include <linux/key.h>
69#include <trace/syscall.h> 70#include <trace/syscall.h>
@@ -97,6 +98,53 @@ struct perf_counter_attr;
97#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) 98#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
98#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) 99#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
99 100
101#ifdef CONFIG_EVENT_PROFILE
102#define TRACE_SYS_ENTER_PROFILE(sname) \
103static int prof_sysenter_enable_##sname(struct ftrace_event_call *event_call) \
104{ \
105 int ret = 0; \
106 if (!atomic_inc_return(&event_enter_##sname.profile_count)) \
107 ret = reg_prof_syscall_enter("sys"#sname); \
108 return ret; \
109} \
110 \
111static void prof_sysenter_disable_##sname(struct ftrace_event_call *event_call)\
112{ \
113 if (atomic_add_negative(-1, &event_enter_##sname.profile_count)) \
114 unreg_prof_syscall_enter("sys"#sname); \
115}
116
117#define TRACE_SYS_EXIT_PROFILE(sname) \
118static int prof_sysexit_enable_##sname(struct ftrace_event_call *event_call) \
119{ \
120 int ret = 0; \
121 if (!atomic_inc_return(&event_exit_##sname.profile_count)) \
122 ret = reg_prof_syscall_exit("sys"#sname); \
123 return ret; \
124} \
125 \
126static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \
127{ \
128 if (atomic_add_negative(-1, &event_exit_##sname.profile_count)) \
129 unreg_prof_syscall_exit("sys"#sname); \
130}
131
132#define TRACE_SYS_ENTER_PROFILE_INIT(sname) \
133 .profile_count = ATOMIC_INIT(-1), \
134 .profile_enable = prof_sysenter_enable_##sname, \
135 .profile_disable = prof_sysenter_disable_##sname,
136
137#define TRACE_SYS_EXIT_PROFILE_INIT(sname) \
138 .profile_count = ATOMIC_INIT(-1), \
139 .profile_enable = prof_sysexit_enable_##sname, \
140 .profile_disable = prof_sysexit_disable_##sname,
141#else
142#define TRACE_SYS_ENTER_PROFILE(sname)
143#define TRACE_SYS_ENTER_PROFILE_INIT(sname)
144#define TRACE_SYS_EXIT_PROFILE(sname)
145#define TRACE_SYS_EXIT_PROFILE_INIT(sname)
146#endif
147
100#ifdef CONFIG_FTRACE_SYSCALLS 148#ifdef CONFIG_FTRACE_SYSCALLS
101#define __SC_STR_ADECL1(t, a) #a 149#define __SC_STR_ADECL1(t, a) #a
102#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) 150#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__)
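
The TRACE_SYS_*_PROFILE helpers above rely on a reference count that starts at -1: the increment that reaches zero registers the profiling hook and the decrement that goes negative unregisters it. A userspace sketch of that handshake, with GCC __atomic builtins standing in for the kernel's atomic_t:

/*
 * profile_count starts at -1 (ATOMIC_INIT(-1)): the first enabler
 * (increment to 0) registers, the last disabler (decrement below 0)
 * unregisters.  The printf calls stand in for the reg/unreg functions.
 */
#include <stdio.h>

static int profile_count = -1;

static void profile_enable(void)
{
	/* atomic_inc_return(): new value 0 means "first user". */
	if (!__atomic_add_fetch(&profile_count, 1, __ATOMIC_SEQ_CST))
		printf("register profiling hook\n");
}

static void profile_disable(void)
{
	/* atomic_add_negative(-1, ...): true when the last user goes away. */
	if (__atomic_add_fetch(&profile_count, -1, __ATOMIC_SEQ_CST) < 0)
		printf("unregister profiling hook\n");
}

int main(void)
{
	profile_enable();	/* registers */
	profile_enable();	/* already registered, just counts */
	profile_disable();	/* still one user left */
	profile_disable();	/* unregisters */
	return 0;
}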
@@ -112,7 +160,81 @@ struct perf_counter_attr;
112#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) 160#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__)
113#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__) 161#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__)
114 162
163#define SYSCALL_TRACE_ENTER_EVENT(sname) \
164 static struct ftrace_event_call event_enter_##sname; \
165 struct trace_event enter_syscall_print_##sname = { \
166 .trace = print_syscall_enter, \
167 }; \
168 static int init_enter_##sname(void) \
169 { \
170 int num, id; \
171 num = syscall_name_to_nr("sys"#sname); \
172 if (num < 0) \
173 return -ENOSYS; \
174 id = register_ftrace_event(&enter_syscall_print_##sname);\
175 if (!id) \
176 return -ENODEV; \
177 event_enter_##sname.id = id; \
178 set_syscall_enter_id(num, id); \
179 INIT_LIST_HEAD(&event_enter_##sname.fields); \
180 return 0; \
181 } \
182 TRACE_SYS_ENTER_PROFILE(sname); \
183 static struct ftrace_event_call __used \
184 __attribute__((__aligned__(4))) \
185 __attribute__((section("_ftrace_events"))) \
186 event_enter_##sname = { \
187 .name = "sys_enter"#sname, \
188 .system = "syscalls", \
189 .event = &event_syscall_enter, \
190 .raw_init = init_enter_##sname, \
191 .show_format = syscall_enter_format, \
192 .define_fields = syscall_enter_define_fields, \
193 .regfunc = reg_event_syscall_enter, \
194 .unregfunc = unreg_event_syscall_enter, \
195 .data = "sys"#sname, \
196 TRACE_SYS_ENTER_PROFILE_INIT(sname) \
197 }
198
199#define SYSCALL_TRACE_EXIT_EVENT(sname) \
200 static struct ftrace_event_call event_exit_##sname; \
201 struct trace_event exit_syscall_print_##sname = { \
202 .trace = print_syscall_exit, \
203 }; \
204 static int init_exit_##sname(void) \
205 { \
206 int num, id; \
207 num = syscall_name_to_nr("sys"#sname); \
208 if (num < 0) \
209 return -ENOSYS; \
210 id = register_ftrace_event(&exit_syscall_print_##sname);\
211 if (!id) \
212 return -ENODEV; \
213 event_exit_##sname.id = id; \
214 set_syscall_exit_id(num, id); \
215 INIT_LIST_HEAD(&event_exit_##sname.fields); \
216 return 0; \
217 } \
218 TRACE_SYS_EXIT_PROFILE(sname); \
219 static struct ftrace_event_call __used \
220 __attribute__((__aligned__(4))) \
221 __attribute__((section("_ftrace_events"))) \
222 event_exit_##sname = { \
223 .name = "sys_exit"#sname, \
224 .system = "syscalls", \
225 .event = &event_syscall_exit, \
226 .raw_init = init_exit_##sname, \
227 .show_format = syscall_exit_format, \
228 .define_fields = syscall_exit_define_fields, \
229 .regfunc = reg_event_syscall_exit, \
230 .unregfunc = unreg_event_syscall_exit, \
231 .data = "sys"#sname, \
232 TRACE_SYS_EXIT_PROFILE_INIT(sname) \
233 }
234
115#define SYSCALL_METADATA(sname, nb) \ 235#define SYSCALL_METADATA(sname, nb) \
236 SYSCALL_TRACE_ENTER_EVENT(sname); \
237 SYSCALL_TRACE_EXIT_EVENT(sname); \
116 static const struct syscall_metadata __used \ 238 static const struct syscall_metadata __used \
117 __attribute__((__aligned__(4))) \ 239 __attribute__((__aligned__(4))) \
118 __attribute__((section("__syscalls_metadata"))) \ 240 __attribute__((section("__syscalls_metadata"))) \
@@ -121,18 +243,23 @@ struct perf_counter_attr;
121 .nb_args = nb, \ 243 .nb_args = nb, \
122 .types = types_##sname, \ 244 .types = types_##sname, \
123 .args = args_##sname, \ 245 .args = args_##sname, \
124 } 246 .enter_event = &event_enter_##sname, \
247 .exit_event = &event_exit_##sname, \
248 };
125 249
126#define SYSCALL_DEFINE0(sname) \ 250#define SYSCALL_DEFINE0(sname) \
251 SYSCALL_TRACE_ENTER_EVENT(_##sname); \
252 SYSCALL_TRACE_EXIT_EVENT(_##sname); \
127 static const struct syscall_metadata __used \ 253 static const struct syscall_metadata __used \
128 __attribute__((__aligned__(4))) \ 254 __attribute__((__aligned__(4))) \
129 __attribute__((section("__syscalls_metadata"))) \ 255 __attribute__((section("__syscalls_metadata"))) \
130 __syscall_meta_##sname = { \ 256 __syscall_meta_##sname = { \
131 .name = "sys_"#sname, \ 257 .name = "sys_"#sname, \
132 .nb_args = 0, \ 258 .nb_args = 0, \
259 .enter_event = &event_enter__##sname, \
260 .exit_event = &event_exit__##sname, \
133 }; \ 261 }; \
134 asmlinkage long sys_##sname(void) 262 asmlinkage long sys_##sname(void)
135
136#else 263#else
137#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) 264#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void)
138#endif 265#endif
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7402c1a27c4f..85e8cf7d393c 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -85,20 +85,29 @@ int arch_update_cpu_topology(void);
85#define ARCH_HAS_SCHED_WAKE_IDLE 85#define ARCH_HAS_SCHED_WAKE_IDLE
86/* Common values for SMT siblings */ 86/* Common values for SMT siblings */
87#ifndef SD_SIBLING_INIT 87#ifndef SD_SIBLING_INIT
88#define SD_SIBLING_INIT (struct sched_domain) { \ 88#define SD_SIBLING_INIT (struct sched_domain) { \
89 .min_interval = 1, \ 89 .min_interval = 1, \
90 .max_interval = 2, \ 90 .max_interval = 2, \
91 .busy_factor = 64, \ 91 .busy_factor = 64, \
92 .imbalance_pct = 110, \ 92 .imbalance_pct = 110, \
93 .flags = SD_LOAD_BALANCE \ 93 \
94 | SD_BALANCE_NEWIDLE \ 94 .flags = 1*SD_LOAD_BALANCE \
95 | SD_BALANCE_FORK \ 95 | 1*SD_BALANCE_NEWIDLE \
96 | SD_BALANCE_EXEC \ 96 | 1*SD_BALANCE_EXEC \
97 | SD_WAKE_AFFINE \ 97 | 1*SD_BALANCE_FORK \
98 | SD_WAKE_BALANCE \ 98 | 0*SD_WAKE_IDLE \
99 | SD_SHARE_CPUPOWER, \ 99 | 1*SD_WAKE_AFFINE \
100 .last_balance = jiffies, \ 100 | 1*SD_WAKE_BALANCE \
101 .balance_interval = 1, \ 101 | 1*SD_SHARE_CPUPOWER \
102 | 0*SD_POWERSAVINGS_BALANCE \
103 | 0*SD_SHARE_PKG_RESOURCES \
104 | 0*SD_SERIALIZE \
105 | 0*SD_WAKE_IDLE_FAR \
106 | 0*SD_PREFER_SIBLING \
107 , \
108 .last_balance = jiffies, \
109 .balance_interval = 1, \
110 .smt_gain = 1178, /* 15% */ \
102} 111}
103#endif 112#endif
104#endif /* CONFIG_SCHED_SMT */ 113#endif /* CONFIG_SCHED_SMT */
@@ -106,69 +115,94 @@ int arch_update_cpu_topology(void);
106#ifdef CONFIG_SCHED_MC 115#ifdef CONFIG_SCHED_MC
107/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ 116/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
108#ifndef SD_MC_INIT 117#ifndef SD_MC_INIT
109#define SD_MC_INIT (struct sched_domain) { \ 118#define SD_MC_INIT (struct sched_domain) { \
110 .min_interval = 1, \ 119 .min_interval = 1, \
111 .max_interval = 4, \ 120 .max_interval = 4, \
112 .busy_factor = 64, \ 121 .busy_factor = 64, \
113 .imbalance_pct = 125, \ 122 .imbalance_pct = 125, \
114 .cache_nice_tries = 1, \ 123 .cache_nice_tries = 1, \
115 .busy_idx = 2, \ 124 .busy_idx = 2, \
116 .wake_idx = 1, \ 125 .wake_idx = 1, \
117 .forkexec_idx = 1, \ 126 .forkexec_idx = 1, \
118 .flags = SD_LOAD_BALANCE \ 127 \
119 | SD_BALANCE_FORK \ 128 .flags = 1*SD_LOAD_BALANCE \
120 | SD_BALANCE_EXEC \ 129 | 1*SD_BALANCE_NEWIDLE \
121 | SD_WAKE_AFFINE \ 130 | 1*SD_BALANCE_EXEC \
122 | SD_WAKE_BALANCE \ 131 | 1*SD_BALANCE_FORK \
123 | SD_SHARE_PKG_RESOURCES\ 132 | 1*SD_WAKE_IDLE \
124 | sd_balance_for_mc_power()\ 133 | 1*SD_WAKE_AFFINE \
125 | sd_power_saving_flags(),\ 134 | 1*SD_WAKE_BALANCE \
126 .last_balance = jiffies, \ 135 | 0*SD_SHARE_CPUPOWER \
127 .balance_interval = 1, \ 136 | 1*SD_SHARE_PKG_RESOURCES \
137 | 0*SD_SERIALIZE \
138 | 0*SD_WAKE_IDLE_FAR \
139 | sd_balance_for_mc_power() \
140 | sd_power_saving_flags() \
141 , \
142 .last_balance = jiffies, \
143 .balance_interval = 1, \
128} 144}
129#endif 145#endif
130#endif /* CONFIG_SCHED_MC */ 146#endif /* CONFIG_SCHED_MC */
131 147
132/* Common values for CPUs */ 148/* Common values for CPUs */
133#ifndef SD_CPU_INIT 149#ifndef SD_CPU_INIT
134#define SD_CPU_INIT (struct sched_domain) { \ 150#define SD_CPU_INIT (struct sched_domain) { \
135 .min_interval = 1, \ 151 .min_interval = 1, \
136 .max_interval = 4, \ 152 .max_interval = 4, \
137 .busy_factor = 64, \ 153 .busy_factor = 64, \
138 .imbalance_pct = 125, \ 154 .imbalance_pct = 125, \
139 .cache_nice_tries = 1, \ 155 .cache_nice_tries = 1, \
140 .busy_idx = 2, \ 156 .busy_idx = 2, \
141 .idle_idx = 1, \ 157 .idle_idx = 1, \
142 .newidle_idx = 2, \ 158 .newidle_idx = 2, \
143 .wake_idx = 1, \ 159 .wake_idx = 1, \
144 .forkexec_idx = 1, \ 160 .forkexec_idx = 1, \
145 .flags = SD_LOAD_BALANCE \ 161 \
146 | SD_BALANCE_EXEC \ 162 .flags = 1*SD_LOAD_BALANCE \
147 | SD_BALANCE_FORK \ 163 | 1*SD_BALANCE_NEWIDLE \
148 | SD_WAKE_AFFINE \ 164 | 1*SD_BALANCE_EXEC \
149 | SD_WAKE_BALANCE \ 165 | 1*SD_BALANCE_FORK \
150 | sd_balance_for_package_power()\ 166 | 1*SD_WAKE_IDLE \
151 | sd_power_saving_flags(),\ 167 | 0*SD_WAKE_AFFINE \
152 .last_balance = jiffies, \ 168 | 1*SD_WAKE_BALANCE \
153 .balance_interval = 1, \ 169 | 0*SD_SHARE_CPUPOWER \
170 | 0*SD_SHARE_PKG_RESOURCES \
171 | 0*SD_SERIALIZE \
172 | 0*SD_WAKE_IDLE_FAR \
173 | sd_balance_for_package_power() \
174 | sd_power_saving_flags() \
175 , \
176 .last_balance = jiffies, \
177 .balance_interval = 1, \
154} 178}
155#endif 179#endif
156 180
157/* sched_domains SD_ALLNODES_INIT for NUMA machines */ 181/* sched_domains SD_ALLNODES_INIT for NUMA machines */
158#define SD_ALLNODES_INIT (struct sched_domain) { \ 182#define SD_ALLNODES_INIT (struct sched_domain) { \
159 .min_interval = 64, \ 183 .min_interval = 64, \
160 .max_interval = 64*num_online_cpus(), \ 184 .max_interval = 64*num_online_cpus(), \
161 .busy_factor = 128, \ 185 .busy_factor = 128, \
162 .imbalance_pct = 133, \ 186 .imbalance_pct = 133, \
163 .cache_nice_tries = 1, \ 187 .cache_nice_tries = 1, \
164 .busy_idx = 3, \ 188 .busy_idx = 3, \
165 .idle_idx = 3, \ 189 .idle_idx = 3, \
166 .flags = SD_LOAD_BALANCE \ 190 .flags = 1*SD_LOAD_BALANCE \
167 | SD_BALANCE_NEWIDLE \ 191 | 1*SD_BALANCE_NEWIDLE \
168 | SD_WAKE_AFFINE \ 192 | 0*SD_BALANCE_EXEC \
169 | SD_SERIALIZE, \ 193 | 0*SD_BALANCE_FORK \
170 .last_balance = jiffies, \ 194 | 0*SD_WAKE_IDLE \
171 .balance_interval = 64, \ 195 | 1*SD_WAKE_AFFINE \
196 | 0*SD_WAKE_BALANCE \
197 | 0*SD_SHARE_CPUPOWER \
198 | 0*SD_POWERSAVINGS_BALANCE \
199 | 0*SD_SHARE_PKG_RESOURCES \
200 | 1*SD_SERIALIZE \
201 | 1*SD_WAKE_IDLE_FAR \
202 | 0*SD_PREFER_SIBLING \
203 , \
204 .last_balance = jiffies, \
205 .balance_interval = 64, \
172} 206}
173 207
174#ifdef CONFIG_NUMA 208#ifdef CONFIG_NUMA
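
Every SD_*_INIT initializer now lists all flags with an explicit 0 or 1 multiplier, so disabled flags are documented as visibly as enabled ones while the whole expression still folds to a compile-time constant. A standalone illustration using a small subset of the values from the sched.h hunk:

/*
 * The 0/1 multiplier style: each known flag appears once, and the
 * multiplier says whether it is set.  The compiler folds this to a
 * single constant, so there is no runtime cost.
 */
#include <stdio.h>

#define SD_LOAD_BALANCE     0x0001
#define SD_BALANCE_NEWIDLE  0x0002
#define SD_WAKE_AFFINE      0x0020
#define SD_SERIALIZE        0x0400

int main(void)
{
	const unsigned int flags = 1*SD_LOAD_BALANCE
				 | 1*SD_BALANCE_NEWIDLE
				 | 0*SD_WAKE_AFFINE	/* listed, but off */
				 | 1*SD_SERIALIZE;

	printf("flags = %#06x\n", flags);	/* prints 0x0403 */
	return 0;
}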
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index b9dc4ca0246f..63a3f7a80580 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -23,6 +23,8 @@ struct tracepoint;
23struct tracepoint { 23struct tracepoint {
24 const char *name; /* Tracepoint name */ 24 const char *name; /* Tracepoint name */
25 int state; /* State. */ 25 int state; /* State. */
26 void (*regfunc)(void);
27 void (*unregfunc)(void);
26 void **funcs; 28 void **funcs;
27} __attribute__((aligned(32))); /* 29} __attribute__((aligned(32))); /*
28 * Aligned on 32 bytes because it is 30 * Aligned on 32 bytes because it is
@@ -78,12 +80,16 @@ struct tracepoint {
78 return tracepoint_probe_unregister(#name, (void *)probe);\ 80 return tracepoint_probe_unregister(#name, (void *)probe);\
79 } 81 }
80 82
81#define DEFINE_TRACE(name) \ 83
84#define DEFINE_TRACE_FN(name, reg, unreg) \
82 static const char __tpstrtab_##name[] \ 85 static const char __tpstrtab_##name[] \
83 __attribute__((section("__tracepoints_strings"))) = #name; \ 86 __attribute__((section("__tracepoints_strings"))) = #name; \
84 struct tracepoint __tracepoint_##name \ 87 struct tracepoint __tracepoint_##name \
85 __attribute__((section("__tracepoints"), aligned(32))) = \ 88 __attribute__((section("__tracepoints"), aligned(32))) = \
86 { __tpstrtab_##name, 0, NULL } 89 { __tpstrtab_##name, 0, reg, unreg, NULL }
90
91#define DEFINE_TRACE(name) \
92 DEFINE_TRACE_FN(name, NULL, NULL);
87 93
88#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ 94#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \
89 EXPORT_SYMBOL_GPL(__tracepoint_##name) 95 EXPORT_SYMBOL_GPL(__tracepoint_##name)
@@ -108,6 +114,7 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
108 return -ENOSYS; \ 114 return -ENOSYS; \
109 } 115 }
110 116
117#define DEFINE_TRACE_FN(name, reg, unreg)
111#define DEFINE_TRACE(name) 118#define DEFINE_TRACE(name)
112#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) 119#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
113#define EXPORT_TRACEPOINT_SYMBOL(name) 120#define EXPORT_TRACEPOINT_SYMBOL(name)
@@ -158,6 +165,15 @@ static inline void tracepoint_synchronize_unregister(void)
158 165
159#define PARAMS(args...) args 166#define PARAMS(args...) args
160 167
168#endif /* _LINUX_TRACEPOINT_H */
169
170/*
171 * Note: we keep the TRACE_EVENT outside the include file ifdef protection.
172 * This is due to the way trace events work. If a file includes two
173 * trace event headers under one "CREATE_TRACE_POINTS" the first include
174 * will override the TRACE_EVENT and break the second include.
175 */
176
161#ifndef TRACE_EVENT 177#ifndef TRACE_EVENT
162/* 178/*
163 * For use with the TRACE_EVENT macro: 179 * For use with the TRACE_EVENT macro:
@@ -259,10 +275,15 @@ static inline void tracepoint_synchronize_unregister(void)
259 * can also by used by generic instrumentation like SystemTap), and 275 * can also by used by generic instrumentation like SystemTap), and
260 * it is also used to expose a structured trace record in 276 * it is also used to expose a structured trace record in
261 * /sys/kernel/debug/tracing/events/. 277 * /sys/kernel/debug/tracing/events/.
278 *
279 * A set of (un)registration functions can be passed to the variant
280 * TRACE_EVENT_FN to perform any (un)registration work.
262 */ 281 */
263 282
264#define TRACE_EVENT(name, proto, args, struct, assign, print) \ 283#define TRACE_EVENT(name, proto, args, struct, assign, print) \
265 DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) 284 DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
266#endif 285#define TRACE_EVENT_FN(name, proto, args, struct, \
286 assign, print, reg, unreg) \
287 DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
267 288
268#endif 289#endif /* ifdef TRACE_EVENT (see note above) */
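
struct tracepoint gains regfunc/unregfunc callbacks (first hunk of this file) so that a tracepoint such as sys_enter can arm extra machinery when its first probe is attached and tear it down when the last one goes away. A toy userspace model of that life cycle; the structure and helpers below are purely illustrative, not the kernel's implementation:

/*
 * Toy model: the first probe attached to a tracepoint triggers regfunc(),
 * removing the last probe triggers unregfunc().
 */
#include <stdio.h>

struct tracepoint {
	const char *name;
	int nr_probes;
	void (*regfunc)(void);
	void (*unregfunc)(void);
};

static void syscall_regfunc(void)   { printf("arm syscall tracing\n"); }
static void syscall_unregfunc(void) { printf("disarm syscall tracing\n"); }

static void probe_register(struct tracepoint *tp)
{
	if (tp->nr_probes++ == 0 && tp->regfunc)
		tp->regfunc();
}

static void probe_unregister(struct tracepoint *tp)
{
	if (--tp->nr_probes == 0 && tp->unregfunc)
		tp->unregfunc();
}

int main(void)
{
	struct tracepoint tp = { "sys_enter", 0,
				 syscall_regfunc, syscall_unregfunc };

	probe_register(&tp);	/* arms */
	probe_register(&tp);	/* already armed */
	probe_unregister(&tp);	/* still one probe left */
	probe_unregister(&tp);	/* disarms */
	return 0;
}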
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index f7a7ae1e8f90..2a4b3bf74033 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -26,6 +26,11 @@
26#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ 26#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
27 DEFINE_TRACE(name) 27 DEFINE_TRACE(name)
28 28
29#undef TRACE_EVENT_FN
30#define TRACE_EVENT_FN(name, proto, args, tstruct, \
31 assign, print, reg, unreg) \
32 DEFINE_TRACE_FN(name, reg, unreg)
33
29#undef DECLARE_TRACE 34#undef DECLARE_TRACE
30#define DECLARE_TRACE(name, proto, args) \ 35#define DECLARE_TRACE(name, proto, args) \
31 DEFINE_TRACE(name) 36 DEFINE_TRACE(name)
@@ -56,6 +61,8 @@
56#include <trace/ftrace.h> 61#include <trace/ftrace.h>
57#endif 62#endif
58 63
64#undef TRACE_EVENT
65#undef TRACE_EVENT_FN
59#undef TRACE_HEADER_MULTI_READ 66#undef TRACE_HEADER_MULTI_READ
60 67
61/* Only undef what we defined in this file */ 68/* Only undef what we defined in this file */
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
new file mode 100644
index 000000000000..84160fb18478
--- /dev/null
+++ b/include/trace/events/module.h
@@ -0,0 +1,126 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM module
3
4#if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_MODULE_H
6
7#include <linux/tracepoint.h>
8
9#ifdef CONFIG_MODULES
10
11struct module;
12
13#define show_module_flags(flags) __print_flags(flags, "", \
14 { (1UL << TAINT_PROPRIETARY_MODULE), "P" }, \
15 { (1UL << TAINT_FORCED_MODULE), "F" }, \
16 { (1UL << TAINT_CRAP), "C" })
17
18TRACE_EVENT(module_load,
19
20 TP_PROTO(struct module *mod),
21
22 TP_ARGS(mod),
23
24 TP_STRUCT__entry(
25 __field( unsigned int, taints )
26 __string( name, mod->name )
27 ),
28
29 TP_fast_assign(
30 __entry->taints = mod->taints;
31 __assign_str(name, mod->name);
32 ),
33
34 TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints))
35);
36
37TRACE_EVENT(module_free,
38
39 TP_PROTO(struct module *mod),
40
41 TP_ARGS(mod),
42
43 TP_STRUCT__entry(
44 __string( name, mod->name )
45 ),
46
47 TP_fast_assign(
48 __assign_str(name, mod->name);
49 ),
50
51 TP_printk("%s", __get_str(name))
52);
53
54TRACE_EVENT(module_get,
55
56 TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
57
58 TP_ARGS(mod, ip, refcnt),
59
60 TP_STRUCT__entry(
61 __field( unsigned long, ip )
62 __field( int, refcnt )
63 __string( name, mod->name )
64 ),
65
66 TP_fast_assign(
67 __entry->ip = ip;
68 __entry->refcnt = refcnt;
69 __assign_str(name, mod->name);
70 ),
71
72 TP_printk("%s call_site=%pf refcnt=%d",
73 __get_str(name), (void *)__entry->ip, __entry->refcnt)
74);
75
76TRACE_EVENT(module_put,
77
78 TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
79
80 TP_ARGS(mod, ip, refcnt),
81
82 TP_STRUCT__entry(
83 __field( unsigned long, ip )
84 __field( int, refcnt )
85 __string( name, mod->name )
86 ),
87
88 TP_fast_assign(
89 __entry->ip = ip;
90 __entry->refcnt = refcnt;
91 __assign_str(name, mod->name);
92 ),
93
94 TP_printk("%s call_site=%pf refcnt=%d",
95 __get_str(name), (void *)__entry->ip, __entry->refcnt)
96);
97
98TRACE_EVENT(module_request,
99
100 TP_PROTO(char *name, bool wait, unsigned long ip),
101
102 TP_ARGS(name, wait, ip),
103
104 TP_STRUCT__entry(
105 __field( bool, wait )
106 __field( unsigned long, ip )
107 __string( name, name )
108 ),
109
110 TP_fast_assign(
111 __entry->wait = wait;
112 __entry->ip = ip;
113 __assign_str(name, name);
114 ),
115
116 TP_printk("%s wait=%d call_site=%pf",
117 __get_str(name), (int)__entry->wait, (void *)__entry->ip)
118);
119
120#endif /* CONFIG_MODULES */
121
122#endif /* _TRACE_MODULE_H */
123
124/* This part must be outside protection */
125#include <trace/define_trace.h>
126
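
show_module_flags() renders the module taint bits as the familiar P/F/C letters. A standalone sketch of the same decoding; the TAINT_* bit numbers are assumptions for illustration and the helper is not the kernel's __print_flags():

/*
 * Decode a module's taint bits into P/F/C, mirroring what the
 * module_load event prints.  Bit numbers are assumed for the demo.
 */
#include <stdio.h>

#define TAINT_PROPRIETARY_MODULE 0	/* assumed bit numbers */
#define TAINT_FORCED_MODULE      1
#define TAINT_CRAP               10

static void show_module_flags(unsigned int taints)
{
	if (taints & (1U << TAINT_PROPRIETARY_MODULE)) putchar('P');
	if (taints & (1U << TAINT_FORCED_MODULE))      putchar('F');
	if (taints & (1U << TAINT_CRAP))               putchar('C');
	putchar('\n');
}

int main(void)
{
	show_module_flags((1U << TAINT_PROPRIETARY_MODULE) |
			  (1U << TAINT_CRAP));		/* prints "PC" */
	return 0;
}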
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 8949bb7eb082..b48f1ad7c946 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -94,6 +94,7 @@ TRACE_EVENT(sched_wakeup,
94 __field( pid_t, pid ) 94 __field( pid_t, pid )
95 __field( int, prio ) 95 __field( int, prio )
96 __field( int, success ) 96 __field( int, success )
97 __field( int, cpu )
97 ), 98 ),
98 99
99 TP_fast_assign( 100 TP_fast_assign(
@@ -101,11 +102,12 @@ TRACE_EVENT(sched_wakeup,
101 __entry->pid = p->pid; 102 __entry->pid = p->pid;
102 __entry->prio = p->prio; 103 __entry->prio = p->prio;
103 __entry->success = success; 104 __entry->success = success;
105 __entry->cpu = task_cpu(p);
104 ), 106 ),
105 107
106 TP_printk("task %s:%d [%d] success=%d", 108 TP_printk("task %s:%d [%d] success=%d [%03d]",
107 __entry->comm, __entry->pid, __entry->prio, 109 __entry->comm, __entry->pid, __entry->prio,
108 __entry->success) 110 __entry->success, __entry->cpu)
109); 111);
110 112
111/* 113/*
@@ -125,6 +127,7 @@ TRACE_EVENT(sched_wakeup_new,
125 __field( pid_t, pid ) 127 __field( pid_t, pid )
126 __field( int, prio ) 128 __field( int, prio )
127 __field( int, success ) 129 __field( int, success )
130 __field( int, cpu )
128 ), 131 ),
129 132
130 TP_fast_assign( 133 TP_fast_assign(
@@ -132,11 +135,12 @@ TRACE_EVENT(sched_wakeup_new,
132 __entry->pid = p->pid; 135 __entry->pid = p->pid;
133 __entry->prio = p->prio; 136 __entry->prio = p->prio;
134 __entry->success = success; 137 __entry->success = success;
138 __entry->cpu = task_cpu(p);
135 ), 139 ),
136 140
137 TP_printk("task %s:%d [%d] success=%d", 141 TP_printk("task %s:%d [%d] success=%d [%03d]",
138 __entry->comm, __entry->pid, __entry->prio, 142 __entry->comm, __entry->pid, __entry->prio,
139 __entry->success) 143 __entry->success, __entry->cpu)
140); 144);
141 145
142/* 146/*
@@ -340,6 +344,101 @@ TRACE_EVENT(sched_signal_send,
340 __entry->sig, __entry->comm, __entry->pid) 344 __entry->sig, __entry->comm, __entry->pid)
341); 345);
342 346
347/*
348 * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
349 * adding sched_stat support to SCHED_FIFO/RR would be welcome.
350 */
351
352/*
353 * Tracepoint for accounting wait time (time the task is runnable
354 * but not actually running due to scheduler contention).
355 */
356TRACE_EVENT(sched_stat_wait,
357
358 TP_PROTO(struct task_struct *tsk, u64 delay),
359
360 TP_ARGS(tsk, delay),
361
362 TP_STRUCT__entry(
363 __array( char, comm, TASK_COMM_LEN )
364 __field( pid_t, pid )
365 __field( u64, delay )
366 ),
367
368 TP_fast_assign(
369 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
370 __entry->pid = tsk->pid;
371 __entry->delay = delay;
372 )
373 TP_perf_assign(
374 __perf_count(delay);
375 ),
376
377 TP_printk("task: %s:%d wait: %Lu [ns]",
378 __entry->comm, __entry->pid,
379 (unsigned long long)__entry->delay)
380);
381
382/*
383 * Tracepoint for accounting sleep time (time the task is not runnable,
384 * including iowait, see below).
385 */
386TRACE_EVENT(sched_stat_sleep,
387
388 TP_PROTO(struct task_struct *tsk, u64 delay),
389
390 TP_ARGS(tsk, delay),
391
392 TP_STRUCT__entry(
393 __array( char, comm, TASK_COMM_LEN )
394 __field( pid_t, pid )
395 __field( u64, delay )
396 ),
397
398 TP_fast_assign(
399 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
400 __entry->pid = tsk->pid;
401 __entry->delay = delay;
402 )
403 TP_perf_assign(
404 __perf_count(delay);
405 ),
406
407 TP_printk("task: %s:%d sleep: %Lu [ns]",
408 __entry->comm, __entry->pid,
409 (unsigned long long)__entry->delay)
410);
411
412/*
413 * Tracepoint for accounting iowait time (time the task is not runnable
414 * due to waiting on IO to complete).
415 */
416TRACE_EVENT(sched_stat_iowait,
417
418 TP_PROTO(struct task_struct *tsk, u64 delay),
419
420 TP_ARGS(tsk, delay),
421
422 TP_STRUCT__entry(
423 __array( char, comm, TASK_COMM_LEN )
424 __field( pid_t, pid )
425 __field( u64, delay )
426 ),
427
428 TP_fast_assign(
429 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
430 __entry->pid = tsk->pid;
431 __entry->delay = delay;
432 )
433 TP_perf_assign(
434 __perf_count(delay);
435 ),
436
437 TP_printk("task: %s:%d iowait: %Lu [ns]",
438 __entry->comm, __entry->pid,
439 (unsigned long long)__entry->delay)
440);
441
343#endif /* _TRACE_SCHED_H */ 442#endif /* _TRACE_SCHED_H */
344 443
345/* This part must be outside protection */ 444/* This part must be outside protection */
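
Once these sched_stat events exist they show up under the tracing debugfs tree like any other TRACE_EVENT. A userspace sketch that switches sched_stat_wait on; the path assumes debugfs is mounted at /sys/kernel/debug:

/*
 * Enable the sched_stat_wait event through the tracing debugfs
 * interface.  Adjust the mount point for your system; needs root.
 */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/debug/tracing/events/sched/sched_stat_wait/enable";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);
	fclose(f);
	return 0;
}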
diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h
new file mode 100644
index 000000000000..397dff2dbd5a
--- /dev/null
+++ b/include/trace/events/syscalls.h
@@ -0,0 +1,70 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM syscalls
3
4#if !defined(_TRACE_EVENTS_SYSCALLS_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_EVENTS_SYSCALLS_H
6
7#include <linux/tracepoint.h>
8
9#include <asm/ptrace.h>
10#include <asm/syscall.h>
11
12
13#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
14
15extern void syscall_regfunc(void);
16extern void syscall_unregfunc(void);
17
18TRACE_EVENT_FN(sys_enter,
19
20 TP_PROTO(struct pt_regs *regs, long id),
21
22 TP_ARGS(regs, id),
23
24 TP_STRUCT__entry(
25 __field( long, id )
26 __array( unsigned long, args, 6 )
27 ),
28
29 TP_fast_assign(
30 __entry->id = id;
31 syscall_get_arguments(current, regs, 0, 6, __entry->args);
32 ),
33
34 TP_printk("NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)",
35 __entry->id,
36 __entry->args[0], __entry->args[1], __entry->args[2],
37 __entry->args[3], __entry->args[4], __entry->args[5]),
38
39 syscall_regfunc, syscall_unregfunc
40);
41
42TRACE_EVENT_FN(sys_exit,
43
44 TP_PROTO(struct pt_regs *regs, long ret),
45
46 TP_ARGS(regs, ret),
47
48 TP_STRUCT__entry(
49 __field( long, id )
50 __field( long, ret )
51 ),
52
53 TP_fast_assign(
54 __entry->id = syscall_get_nr(current, regs);
55 __entry->ret = ret;
56 ),
57
58 TP_printk("NR %ld = %ld",
59 __entry->id, __entry->ret),
60
61 syscall_regfunc, syscall_unregfunc
62);
63
64#endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */
65
66#endif /* _TRACE_EVENTS_SYSCALLS_H */
67
68/* This part must be outside protection */
69#include <trace/define_trace.h>
70
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index f64fbaae781a..308bafd93325 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -21,11 +21,14 @@
21#undef __field 21#undef __field
22#define __field(type, item) type item; 22#define __field(type, item) type item;
23 23
24#undef __field_ext
25#define __field_ext(type, item, filter_type) type item;
26
24#undef __array 27#undef __array
25#define __array(type, item, len) type item[len]; 28#define __array(type, item, len) type item[len];
26 29
27#undef __dynamic_array 30#undef __dynamic_array
28#define __dynamic_array(type, item, len) unsigned short __data_loc_##item; 31#define __dynamic_array(type, item, len) u32 __data_loc_##item;
29 32
30#undef __string 33#undef __string
31#define __string(item, src) __dynamic_array(char, item, -1) 34#define __string(item, src) __dynamic_array(char, item, -1)
@@ -42,6 +45,16 @@
42 }; \ 45 }; \
43 static struct ftrace_event_call event_##name 46 static struct ftrace_event_call event_##name
44 47
48#undef __cpparg
49#define __cpparg(arg...) arg
50
51/* Callbacks are meaningless to ftrace. */
52#undef TRACE_EVENT_FN
53#define TRACE_EVENT_FN(name, proto, args, tstruct, \
54 assign, print, reg, unreg) \
55 TRACE_EVENT(name, __cpparg(proto), __cpparg(args), \
56 __cpparg(tstruct), __cpparg(assign), __cpparg(print)) \
57
45#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 58#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
46 59
47 60
@@ -51,23 +64,27 @@
51 * Include the following: 64 * Include the following:
52 * 65 *
53 * struct ftrace_data_offsets_<call> { 66 * struct ftrace_data_offsets_<call> {
54 * int <item1>; 67 * u32 <item1>;
55 * int <item2>; 68 * u32 <item2>;
56 * [...] 69 * [...]
57 * }; 70 * };
58 * 71 *
59 * The __dynamic_array() macro will create each int <item>, this is 72 * The __dynamic_array() macro will create each u32 <item>, this is
60 * to keep the offset of each array from the beginning of the event. 73 * to keep the offset of each array from the beginning of the event.
74 * The size of an array is also encoded, in the higher 16 bits of <item>.
61 */ 75 */
62 76
63#undef __field 77#undef __field
64#define __field(type, item); 78#define __field(type, item)
79
80#undef __field_ext
81#define __field_ext(type, item, filter_type)
65 82
66#undef __array 83#undef __array
67#define __array(type, item, len) 84#define __array(type, item, len)
68 85
69#undef __dynamic_array 86#undef __dynamic_array
70#define __dynamic_array(type, item, len) int item; 87#define __dynamic_array(type, item, len) u32 item;
71 88
72#undef __string 89#undef __string
73#define __string(item, src) __dynamic_array(char, item, -1) 90#define __string(item, src) __dynamic_array(char, item, -1)
@@ -109,6 +126,9 @@
109 if (!ret) \ 126 if (!ret) \
110 return 0; 127 return 0;
111 128
129#undef __field_ext
130#define __field_ext(type, item, filter_type) __field(type, item)
131
112#undef __array 132#undef __array
113#define __array(type, item, len) \ 133#define __array(type, item, len) \
114 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 134 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
@@ -120,7 +140,7 @@
120 140
121#undef __dynamic_array 141#undef __dynamic_array
122#define __dynamic_array(type, item, len) \ 142#define __dynamic_array(type, item, len) \
123 ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \ 143 ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\
124 "offset:%u;\tsize:%u;\n", \ 144 "offset:%u;\tsize:%u;\n", \
125 (unsigned int)offsetof(typeof(field), \ 145 (unsigned int)offsetof(typeof(field), \
126 __data_loc_##item), \ 146 __data_loc_##item), \
@@ -150,7 +170,8 @@
150#undef TRACE_EVENT 170#undef TRACE_EVENT
151#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ 171#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
152static int \ 172static int \
153ftrace_format_##call(struct trace_seq *s) \ 173ftrace_format_##call(struct ftrace_event_call *unused, \
174 struct trace_seq *s) \
154{ \ 175{ \
155 struct ftrace_raw_##call field __attribute__((unused)); \ 176 struct ftrace_raw_##call field __attribute__((unused)); \
156 int ret = 0; \ 177 int ret = 0; \
@@ -210,7 +231,7 @@ ftrace_format_##call(struct trace_seq *s) \
210 231
211#undef __get_dynamic_array 232#undef __get_dynamic_array
212#define __get_dynamic_array(field) \ 233#define __get_dynamic_array(field) \
213 ((void *)__entry + __entry->__data_loc_##field) 234 ((void *)__entry + (__entry->__data_loc_##field & 0xffff))
214 235
215#undef __get_str 236#undef __get_str
216#define __get_str(field) (char *)__get_dynamic_array(field) 237#define __get_str(field) (char *)__get_dynamic_array(field)
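
The __data_loc word for a dynamic array is now a u32 that carries both pieces of information: the offset of the payload in the low 16 bits and its byte length in the high 16 bits (which is what the 0xffff mask above strips off). A standalone sketch of that packing:

/*
 * Pack/unpack a __data_loc-style word: offset in the low 16 bits,
 * byte length in the high 16 bits.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t pack_data_loc(uint16_t offset, uint16_t len)
{
	return (uint32_t)len << 16 | offset;
}

int main(void)
{
	uint32_t loc = pack_data_loc(24, 7);	/* 7-byte string at offset 24 */

	printf("offset = %u, len = %u\n",
	       (unsigned)(loc & 0xffff), (unsigned)(loc >> 16));
	return 0;
}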
@@ -263,28 +284,33 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
263 284
264#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 285#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
265 286
266#undef __field 287#undef __field_ext
267#define __field(type, item) \ 288#define __field_ext(type, item, filter_type) \
268 ret = trace_define_field(event_call, #type, #item, \ 289 ret = trace_define_field(event_call, #type, #item, \
269 offsetof(typeof(field), item), \ 290 offsetof(typeof(field), item), \
270 sizeof(field.item), is_signed_type(type)); \ 291 sizeof(field.item), \
292 is_signed_type(type), filter_type); \
271 if (ret) \ 293 if (ret) \
272 return ret; 294 return ret;
273 295
296#undef __field
297#define __field(type, item) __field_ext(type, item, FILTER_OTHER)
298
274#undef __array 299#undef __array
275#define __array(type, item, len) \ 300#define __array(type, item, len) \
276 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 301 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
277 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 302 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
278 offsetof(typeof(field), item), \ 303 offsetof(typeof(field), item), \
279 sizeof(field.item), 0); \ 304 sizeof(field.item), 0, FILTER_OTHER); \
280 if (ret) \ 305 if (ret) \
281 return ret; 306 return ret;
282 307
283#undef __dynamic_array 308#undef __dynamic_array
284#define __dynamic_array(type, item, len) \ 309#define __dynamic_array(type, item, len) \
285 ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\ 310 ret = trace_define_field(event_call, "__data_loc " #type "[]", #item, \
286 offsetof(typeof(field), __data_loc_##item), \ 311 offsetof(typeof(field), __data_loc_##item), \
287 sizeof(field.__data_loc_##item), 0); 312 sizeof(field.__data_loc_##item), 0, \
313 FILTER_OTHER);
288 314
289#undef __string 315#undef __string
290#define __string(item, src) __dynamic_array(char, item, -1) 316#define __string(item, src) __dynamic_array(char, item, -1)
@@ -292,17 +318,14 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
292#undef TRACE_EVENT 318#undef TRACE_EVENT
293#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ 319#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
294int \ 320int \
295ftrace_define_fields_##call(void) \ 321ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
296{ \ 322{ \
297 struct ftrace_raw_##call field; \ 323 struct ftrace_raw_##call field; \
298 struct ftrace_event_call *event_call = &event_##call; \
299 int ret; \ 324 int ret; \
300 \ 325 \
301 __common_field(int, type, 1); \ 326 ret = trace_define_common_fields(event_call); \
302 __common_field(unsigned char, flags, 0); \ 327 if (ret) \
303 __common_field(unsigned char, preempt_count, 0); \ 328 return ret; \
304 __common_field(int, pid, 1); \
305 __common_field(int, tgid, 1); \
306 \ 329 \
307 tstruct; \ 330 tstruct; \
308 \ 331 \
@@ -321,6 +344,9 @@ ftrace_define_fields_##call(void) \
321#undef __field 344#undef __field
322#define __field(type, item) 345#define __field(type, item)
323 346
347#undef __field_ext
348#define __field_ext(type, item, filter_type)
349
324#undef __array 350#undef __array
325#define __array(type, item, len) 351#define __array(type, item, len)
326 352
@@ -328,6 +354,7 @@ ftrace_define_fields_##call(void) \
328#define __dynamic_array(type, item, len) \ 354#define __dynamic_array(type, item, len) \
329 __data_offsets->item = __data_size + \ 355 __data_offsets->item = __data_size + \
330 offsetof(typeof(*entry), __data); \ 356 offsetof(typeof(*entry), __data); \
357 __data_offsets->item |= (len * sizeof(type)) << 16; \
331 __data_size += (len) * sizeof(type); 358 __data_size += (len) * sizeof(type);
332 359
333#undef __string 360#undef __string
@@ -433,13 +460,15 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
433 * { 460 * {
434 * struct ring_buffer_event *event; 461 * struct ring_buffer_event *event;
435 * struct ftrace_raw_<call> *entry; <-- defined in stage 1 462 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
463 * struct ring_buffer *buffer;
436 * unsigned long irq_flags; 464 * unsigned long irq_flags;
437 * int pc; 465 * int pc;
438 * 466 *
439 * local_save_flags(irq_flags); 467 * local_save_flags(irq_flags);
440 * pc = preempt_count(); 468 * pc = preempt_count();
441 * 469 *
442 * event = trace_current_buffer_lock_reserve(event_<call>.id, 470 * event = trace_current_buffer_lock_reserve(&buffer,
471 * event_<call>.id,
443 * sizeof(struct ftrace_raw_<call>), 472 * sizeof(struct ftrace_raw_<call>),
444 * irq_flags, pc); 473 * irq_flags, pc);
445 * if (!event) 474 * if (!event)
@@ -449,7 +478,7 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
449 * <assign>; <-- Here we assign the entries by the __field and 478 * <assign>; <-- Here we assign the entries by the __field and
450 * __array macros. 479 * __array macros.
451 * 480 *
452 * trace_current_buffer_unlock_commit(event, irq_flags, pc); 481 * trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
453 * } 482 * }
454 * 483 *
455 * static int ftrace_raw_reg_event_<call>(void) 484 * static int ftrace_raw_reg_event_<call>(void)
@@ -541,6 +570,7 @@ static void ftrace_raw_event_##call(proto) \
541 struct ftrace_event_call *event_call = &event_##call; \ 570 struct ftrace_event_call *event_call = &event_##call; \
542 struct ring_buffer_event *event; \ 571 struct ring_buffer_event *event; \
543 struct ftrace_raw_##call *entry; \ 572 struct ftrace_raw_##call *entry; \
573 struct ring_buffer *buffer; \
544 unsigned long irq_flags; \ 574 unsigned long irq_flags; \
545 int __data_size; \ 575 int __data_size; \
546 int pc; \ 576 int pc; \
@@ -550,7 +580,8 @@ static void ftrace_raw_event_##call(proto) \
550 \ 580 \
551 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ 581 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
552 \ 582 \
553 event = trace_current_buffer_lock_reserve(event_##call.id, \ 583 event = trace_current_buffer_lock_reserve(&buffer, \
584 event_##call.id, \
554 sizeof(*entry) + __data_size, \ 585 sizeof(*entry) + __data_size, \
555 irq_flags, pc); \ 586 irq_flags, pc); \
556 if (!event) \ 587 if (!event) \
@@ -562,11 +593,12 @@ static void ftrace_raw_event_##call(proto) \
562 \ 593 \
563 { assign; } \ 594 { assign; } \
564 \ 595 \
565 if (!filter_current_check_discard(event_call, entry, event)) \ 596 if (!filter_current_check_discard(buffer, event_call, entry, event)) \
566 trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ 597 trace_nowake_buffer_unlock_commit(buffer, \
598 event, irq_flags, pc); \
567} \ 599} \
568 \ 600 \
569static int ftrace_raw_reg_event_##call(void) \ 601static int ftrace_raw_reg_event_##call(void *ptr) \
570{ \ 602{ \
571 int ret; \ 603 int ret; \
572 \ 604 \
@@ -577,7 +609,7 @@ static int ftrace_raw_reg_event_##call(void) \
577 return ret; \ 609 return ret; \
578} \ 610} \
579 \ 611 \
580static void ftrace_raw_unreg_event_##call(void) \ 612static void ftrace_raw_unreg_event_##call(void *ptr) \
581{ \ 613{ \
582 unregister_trace_##call(ftrace_raw_event_##call); \ 614 unregister_trace_##call(ftrace_raw_event_##call); \
583} \ 615} \
@@ -595,7 +627,6 @@ static int ftrace_raw_init_event_##call(void) \
595 return -ENODEV; \ 627 return -ENODEV; \
596 event_##call.id = id; \ 628 event_##call.id = id; \
597 INIT_LIST_HEAD(&event_##call.fields); \ 629 INIT_LIST_HEAD(&event_##call.fields); \
598 init_preds(&event_##call); \
599 return 0; \ 630 return 0; \
600} \ 631} \
601 \ 632 \
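The __dynamic_array hunks above pack two values into the 32-bit __data_loc word: the offset of the data within the entry in the low 16 bits and its length in the high 16 bits, which is why __get_dynamic_array() now masks with 0xffff. A minimal user-space sketch of that packing, purely illustrative and not kernel code:

#include <stdio.h>
#include <stdint.h>

/* Pack a dynamic-array descriptor the way the macros above do:
 * offset in the low 16 bits, length in bytes in the high 16 bits. */
static uint32_t pack_data_loc(uint16_t offset, uint16_t len)
{
	return (uint32_t)offset | ((uint32_t)len << 16);
}

int main(void)
{
	uint32_t loc = pack_data_loc(40, 12);	/* 12 bytes stored at offset 40 */

	/* __get_dynamic_array() keeps only the offset: mask with 0xffff. */
	printf("offset=%u len=%u\n", loc & 0xffff, loc >> 16);
	return 0;
}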
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 8cfe515cbc47..5dc283ba5ae0 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -1,8 +1,13 @@
1#ifndef _TRACE_SYSCALL_H 1#ifndef _TRACE_SYSCALL_H
2#define _TRACE_SYSCALL_H 2#define _TRACE_SYSCALL_H
3 3
4#include <linux/tracepoint.h>
5#include <linux/unistd.h>
6#include <linux/ftrace_event.h>
7
4#include <asm/ptrace.h> 8#include <asm/ptrace.h>
5 9
10
6/* 11/*
7 * A syscall entry in the ftrace syscalls array. 12 * A syscall entry in the ftrace syscalls array.
8 * 13 *
@@ -10,26 +15,49 @@
10 * @nb_args: number of parameters it takes 15 * @nb_args: number of parameters it takes
11 * @types: list of types as strings 16 * @types: list of types as strings
12 * @args: list of args as strings (args[i] matches types[i]) 17 * @args: list of args as strings (args[i] matches types[i])
18 * @enter_id: associated ftrace enter event id
19 * @exit_id: associated ftrace exit event id
20 * @enter_event: associated syscall_enter trace event
21 * @exit_event: associated syscall_exit trace event
13 */ 22 */
14struct syscall_metadata { 23struct syscall_metadata {
15 const char *name; 24 const char *name;
16 int nb_args; 25 int nb_args;
17 const char **types; 26 const char **types;
18 const char **args; 27 const char **args;
28 int enter_id;
29 int exit_id;
30
31 struct ftrace_event_call *enter_event;
32 struct ftrace_event_call *exit_event;
19}; 33};
20 34
21#ifdef CONFIG_FTRACE_SYSCALLS 35#ifdef CONFIG_FTRACE_SYSCALLS
22extern void arch_init_ftrace_syscalls(void);
23extern struct syscall_metadata *syscall_nr_to_meta(int nr); 36extern struct syscall_metadata *syscall_nr_to_meta(int nr);
24extern void start_ftrace_syscalls(void); 37extern int syscall_name_to_nr(char *name);
25extern void stop_ftrace_syscalls(void); 38void set_syscall_enter_id(int num, int id);
26extern void ftrace_syscall_enter(struct pt_regs *regs); 39void set_syscall_exit_id(int num, int id);
27extern void ftrace_syscall_exit(struct pt_regs *regs); 40extern struct trace_event event_syscall_enter;
28#else 41extern struct trace_event event_syscall_exit;
29static inline void start_ftrace_syscalls(void) { } 42extern int reg_event_syscall_enter(void *ptr);
30static inline void stop_ftrace_syscalls(void) { } 43extern void unreg_event_syscall_enter(void *ptr);
31static inline void ftrace_syscall_enter(struct pt_regs *regs) { } 44extern int reg_event_syscall_exit(void *ptr);
32static inline void ftrace_syscall_exit(struct pt_regs *regs) { } 45extern void unreg_event_syscall_exit(void *ptr);
46extern int syscall_enter_format(struct ftrace_event_call *call,
47 struct trace_seq *s);
48extern int syscall_exit_format(struct ftrace_event_call *call,
49 struct trace_seq *s);
50extern int syscall_enter_define_fields(struct ftrace_event_call *call);
51extern int syscall_exit_define_fields(struct ftrace_event_call *call);
52enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags);
53enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags);
54#endif
55#ifdef CONFIG_EVENT_PROFILE
56int reg_prof_syscall_enter(char *name);
57void unreg_prof_syscall_enter(char *name);
58int reg_prof_syscall_exit(char *name);
59void unreg_prof_syscall_exit(char *name);
60
33#endif 61#endif
34 62
35#endif /* _TRACE_SYSCALL_H */ 63#endif /* _TRACE_SYSCALL_H */
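For orientation, the extended struct syscall_metadata above is what the syscall tracing core consults per system call. A hypothetical, hand-written entry for sys_close is sketched below; in the real kernel these tables are generated and the id/event fields are filled in at registration time, so the initializer is illustrative only:

/* Hypothetical, hand-written metadata entry for sys_close(int fd); real
 * kernels generate these tables, so this is only to show the fields. */
static const char *close_types[] = { "int" };
static const char *close_args[]  = { "fd" };

static struct syscall_metadata close_metadata = {
	.name    = "sys_close",
	.nb_args = 1,
	.types   = close_types,
	.args    = close_args,
	/* enter_id/exit_id and enter_event/exit_event are assigned when the
	 * corresponding ftrace events are registered. */
};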
diff --git a/init/Kconfig b/init/Kconfig
index 3f7e60995c80..8e8b76d8a272 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -316,38 +316,28 @@ choice
316 prompt "RCU Implementation" 316 prompt "RCU Implementation"
317 default TREE_RCU 317 default TREE_RCU
318 318
319config CLASSIC_RCU
320 bool "Classic RCU"
321 help
322 This option selects the classic RCU implementation that is
323 designed for best read-side performance on non-realtime
324 systems.
325
326 Select this option if you are unsure.
327
328config TREE_RCU 319config TREE_RCU
329 bool "Tree-based hierarchical RCU" 320 bool "Tree-based hierarchical RCU"
330 help 321 help
331 This option selects the RCU implementation that is 322 This option selects the RCU implementation that is
332 designed for very large SMP system with hundreds or 323 designed for very large SMP system with hundreds or
333 thousands of CPUs. 324 thousands of CPUs. It also scales down nicely to
325 smaller systems.
334 326
335config PREEMPT_RCU 327config TREE_PREEMPT_RCU
336 bool "Preemptible RCU" 328 bool "Preemptable tree-based hierarchical RCU"
337 depends on PREEMPT 329 depends on PREEMPT
338 help 330 help
339 This option reduces the latency of the kernel by making certain 331 This option selects the RCU implementation that is
340 RCU sections preemptible. Normally RCU code is non-preemptible, if 332 designed for very large SMP systems with hundreds or
341 this option is selected then read-only RCU sections become 333 thousands of CPUs, but for which real-time response
342 preemptible. This helps latency, but may expose bugs due to 334 is also required.
343 now-naive assumptions about each RCU read-side critical section
344 remaining on a given CPU through its execution.
345 335
346endchoice 336endchoice
347 337
348config RCU_TRACE 338config RCU_TRACE
349 bool "Enable tracing for RCU" 339 bool "Enable tracing for RCU"
350 depends on TREE_RCU || PREEMPT_RCU 340 depends on TREE_RCU || TREE_PREEMPT_RCU
351 help 341 help
352 This option provides tracing in RCU which presents stats 342 This option provides tracing in RCU which presents stats
353 in debugfs for debugging RCU implementation. 343 in debugfs for debugging RCU implementation.
@@ -359,7 +349,7 @@ config RCU_FANOUT
359 int "Tree-based hierarchical RCU fanout value" 349 int "Tree-based hierarchical RCU fanout value"
360 range 2 64 if 64BIT 350 range 2 64 if 64BIT
361 range 2 32 if !64BIT 351 range 2 32 if !64BIT
362 depends on TREE_RCU 352 depends on TREE_RCU || TREE_PREEMPT_RCU
363 default 64 if 64BIT 353 default 64 if 64BIT
364 default 32 if !64BIT 354 default 32 if !64BIT
365 help 355 help
@@ -374,7 +364,7 @@ config RCU_FANOUT
374 364
375config RCU_FANOUT_EXACT 365config RCU_FANOUT_EXACT
376 bool "Disable tree-based hierarchical RCU auto-balancing" 366 bool "Disable tree-based hierarchical RCU auto-balancing"
377 depends on TREE_RCU 367 depends on TREE_RCU || TREE_PREEMPT_RCU
378 default n 368 default n
379 help 369 help
380 This option forces use of the exact RCU_FANOUT value specified, 370 This option forces use of the exact RCU_FANOUT value specified,
@@ -387,18 +377,12 @@ config RCU_FANOUT_EXACT
387 Say N if unsure. 377 Say N if unsure.
388 378
389config TREE_RCU_TRACE 379config TREE_RCU_TRACE
390 def_bool RCU_TRACE && TREE_RCU 380 def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU )
391 select DEBUG_FS
392 help
393 This option provides tracing for the TREE_RCU implementation,
394 permitting Makefile to trivially select kernel/rcutree_trace.c.
395
396config PREEMPT_RCU_TRACE
397 def_bool RCU_TRACE && PREEMPT_RCU
398 select DEBUG_FS 381 select DEBUG_FS
399 help 382 help
400 This option provides tracing for the PREEMPT_RCU implementation, 383 This option provides tracing for the TREE_RCU and
401 permitting Makefile to trivially select kernel/rcupreempt_trace.c. 384 TREE_PREEMPT_RCU implementations, permitting Makefile to
385 trivially select kernel/rcutree_trace.c.
402 386
403endmenu # "RCU Subsystem" 387endmenu # "RCU Subsystem"
404 388
diff --git a/init/main.c b/init/main.c
index 11f4f145be3f..b34fd8e5edef 100644
--- a/init/main.c
+++ b/init/main.c
@@ -451,6 +451,7 @@ static noinline void __init_refok rest_init(void)
451{ 451{
452 int pid; 452 int pid;
453 453
454 rcu_scheduler_starting();
454 kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); 455 kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
455 numa_default_policy(); 456 numa_default_policy();
456 pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); 457 pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
@@ -462,7 +463,6 @@ static noinline void __init_refok rest_init(void)
462 * at least once to get things moving: 463 * at least once to get things moving:
463 */ 464 */
464 init_idle_bootup_task(current); 465 init_idle_bootup_task(current);
465 rcu_scheduler_starting();
466 preempt_enable_no_resched(); 466 preempt_enable_no_resched();
467 schedule(); 467 schedule();
468 preempt_disable(); 468 preempt_disable();
@@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void)
631 softirq_init(); 631 softirq_init();
632 timekeeping_init(); 632 timekeeping_init();
633 time_init(); 633 time_init();
634 sched_clock_init();
635 profile_init(); 634 profile_init();
636 if (!irqs_disabled()) 635 if (!irqs_disabled())
637 printk(KERN_CRIT "start_kernel(): bug: interrupts were " 636 printk(KERN_CRIT "start_kernel(): bug: interrupts were "
@@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void)
682 numa_policy_init(); 681 numa_policy_init();
683 if (late_time_init) 682 if (late_time_init)
684 late_time_init(); 683 late_time_init();
684 sched_clock_init();
685 calibrate_delay(); 685 calibrate_delay();
686 pidmap_init(); 686 pidmap_init();
687 anon_vma_init(); 687 anon_vma_init();
diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a691f1c2..b833bd5cc127 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -80,11 +80,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
81obj-$(CONFIG_SECCOMP) += seccomp.o 81obj-$(CONFIG_SECCOMP) += seccomp.o
82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
83obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
84obj-$(CONFIG_TREE_RCU) += rcutree.o 83obj-$(CONFIG_TREE_RCU) += rcutree.o
85obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
86obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
87obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
88obj-$(CONFIG_RELAY) += relay.o 86obj-$(CONFIG_RELAY) += relay.o
89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
90obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/exit.c b/kernel/exit.c
index c98ff7a8025f..ae5d8660ddff 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1014,6 +1014,7 @@ NORET_TYPE void do_exit(long code)
1014 validate_creds_for_do_exit(tsk); 1014 validate_creds_for_do_exit(tsk);
1015 1015
1016 preempt_disable(); 1016 preempt_disable();
1017 exit_rcu();
1017 /* causes final put_task_struct in finish_task_switch(). */ 1018 /* causes final put_task_struct in finish_task_switch(). */
1018 tsk->state = TASK_DEAD; 1019 tsk->state = TASK_DEAD;
1019 schedule(); 1020 schedule();
diff --git a/kernel/fork.c b/kernel/fork.c
index aab8579c6093..bfee931ee3fb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1007,10 +1007,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1007 copy_flags(clone_flags, p); 1007 copy_flags(clone_flags, p);
1008 INIT_LIST_HEAD(&p->children); 1008 INIT_LIST_HEAD(&p->children);
1009 INIT_LIST_HEAD(&p->sibling); 1009 INIT_LIST_HEAD(&p->sibling);
1010#ifdef CONFIG_PREEMPT_RCU 1010 rcu_copy_process(p);
1011 p->rcu_read_lock_nesting = 0;
1012 p->rcu_flipctr_idx = 0;
1013#endif /* #ifdef CONFIG_PREEMPT_RCU */
1014 p->vfork_done = NULL; 1011 p->vfork_done = NULL;
1015 spin_lock_init(&p->alloc_lock); 1012 spin_lock_init(&p->alloc_lock);
1016 1013
diff --git a/kernel/futex.c b/kernel/futex.c
index e18cfbdc7190..248dd119a86e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -115,6 +115,9 @@ struct futex_q {
115 /* rt_waiter storage for requeue_pi: */ 115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter; 116 struct rt_mutex_waiter *rt_waiter;
117 117
118 /* The expected requeue pi target futex key: */
119 union futex_key *requeue_pi_key;
120
118 /* Bitset for the optional bitmasked wakeup */ 121 /* Bitset for the optional bitmasked wakeup */
119 u32 bitset; 122 u32 bitset;
120}; 123};
@@ -1089,6 +1092,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1089 if (!top_waiter) 1092 if (!top_waiter)
1090 return 0; 1093 return 0;
1091 1094
1095 /* Ensure we requeue to the expected futex. */
1096 if (!match_futex(top_waiter->requeue_pi_key, key2))
1097 return -EINVAL;
1098
1092 /* 1099 /*
1093 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in 1100 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1094 * the contended case or if set_waiters is 1. The pi_state is returned 1101 * the contended case or if set_waiters is 1. The pi_state is returned
@@ -1276,6 +1283,12 @@ retry_private:
1276 continue; 1283 continue;
1277 } 1284 }
1278 1285
1286 /* Ensure we requeue to the expected futex for requeue_pi. */
1287 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1288 ret = -EINVAL;
1289 break;
1290 }
1291
1279 /* 1292 /*
1280 * Requeue nr_requeue waiters and possibly one more in the case 1293 * Requeue nr_requeue waiters and possibly one more in the case
1281 * of requeue_pi if we couldn't acquire the lock atomically. 1294 * of requeue_pi if we couldn't acquire the lock atomically.
@@ -1751,6 +1764,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1751 q.pi_state = NULL; 1764 q.pi_state = NULL;
1752 q.bitset = bitset; 1765 q.bitset = bitset;
1753 q.rt_waiter = NULL; 1766 q.rt_waiter = NULL;
1767 q.requeue_pi_key = NULL;
1754 1768
1755 if (abs_time) { 1769 if (abs_time) {
1756 to = &timeout; 1770 to = &timeout;
@@ -1858,6 +1872,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1858 1872
1859 q.pi_state = NULL; 1873 q.pi_state = NULL;
1860 q.rt_waiter = NULL; 1874 q.rt_waiter = NULL;
1875 q.requeue_pi_key = NULL;
1861retry: 1876retry:
1862 q.key = FUTEX_KEY_INIT; 1877 q.key = FUTEX_KEY_INIT;
1863 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1878 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -2118,11 +2133,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2118 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2133 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2119 * via the following: 2134 * via the following:
2120 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2135 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2121 * 2) wakeup on uaddr2 after a requeue and subsequent unlock 2136 * 2) wakeup on uaddr2 after a requeue
2122 * 3) signal (before or after requeue) 2137 * 3) signal
2123 * 4) timeout (before or after requeue) 2138 * 4) timeout
2124 * 2139 *
2125 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. 2140 * If 3, cleanup and return -ERESTARTNOINTR.
2126 * 2141 *
2127 * If 2, we may then block on trying to take the rt_mutex and return via: 2142 * If 2, we may then block on trying to take the rt_mutex and return via:
2128 * 5) successful lock 2143 * 5) successful lock
@@ -2130,7 +2145,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2130 * 7) timeout 2145 * 7) timeout
2131 * 8) other lock acquisition failure 2146 * 8) other lock acquisition failure
2132 * 2147 *
2133 * If 6, we setup a restart_block with futex_lock_pi() as the function. 2148 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2134 * 2149 *
2135 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2150 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2136 * 2151 *
@@ -2169,15 +2184,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2169 debug_rt_mutex_init_waiter(&rt_waiter); 2184 debug_rt_mutex_init_waiter(&rt_waiter);
2170 rt_waiter.task = NULL; 2185 rt_waiter.task = NULL;
2171 2186
2172 q.pi_state = NULL;
2173 q.bitset = bitset;
2174 q.rt_waiter = &rt_waiter;
2175
2176 key2 = FUTEX_KEY_INIT; 2187 key2 = FUTEX_KEY_INIT;
2177 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2188 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2178 if (unlikely(ret != 0)) 2189 if (unlikely(ret != 0))
2179 goto out; 2190 goto out;
2180 2191
2192 q.pi_state = NULL;
2193 q.bitset = bitset;
2194 q.rt_waiter = &rt_waiter;
2195 q.requeue_pi_key = &key2;
2196
2181 /* Prepare to wait on uaddr. */ 2197 /* Prepare to wait on uaddr. */
2182 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2198 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2183 if (ret) 2199 if (ret)
@@ -2248,14 +2264,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2248 rt_mutex_unlock(pi_mutex); 2264 rt_mutex_unlock(pi_mutex);
2249 } else if (ret == -EINTR) { 2265 } else if (ret == -EINTR) {
2250 /* 2266 /*
2251 * We've already been requeued, but we have no way to 2267 * We've already been requeued, but cannot restart by calling
2252 * restart by calling futex_lock_pi() directly. We 2268 * futex_lock_pi() directly. We could restart this syscall, but
2253 * could restart the syscall, but that will look at 2269 * it would detect that the user space "val" changed and return
2254 * the user space value and return right away. So we 2270 * -EWOULDBLOCK. Save the overhead of the restart and return
2255 * drop back with EWOULDBLOCK to tell user space that 2271 * -EWOULDBLOCK directly.
2256 * "val" has been changed. That's the same what the
2257 * restart of the syscall would do in
2258 * futex_wait_setup().
2259 */ 2272 */
2260 ret = -EWOULDBLOCK; 2273 ret = -EWOULDBLOCK;
2261 } 2274 }
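The futex changes above make each requeue_pi waiter record the key of the futex it expects to be requeued to (q.requeue_pi_key = &key2), and both futex_proxy_trylock_atomic() and the requeue loop now reject a waiter whose stored key does not match the actual target with -EINVAL. A toy user-space model of that guard, with invented types standing in for union futex_key and struct futex_q:

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

struct toy_key { unsigned long word; };		/* stand-in for union futex_key */

struct toy_waiter {
	struct toy_key *requeue_pi_key;		/* expected requeue target */
};

static bool match_key(const struct toy_key *a, const struct toy_key *b)
{
	return a && b && a->word == b->word;
}

static int toy_requeue(struct toy_waiter *w, struct toy_key *key2)
{
	if (!match_key(w->requeue_pi_key, key2))
		return -EINVAL;			/* same rejection as futex_requeue() */
	/* ... move the waiter onto key2's hash bucket ... */
	return 0;
}

int main(void)
{
	struct toy_key k1 = { .word = 1 }, k2 = { .word = 2 };
	struct toy_waiter w = { .requeue_pi_key = &k2 };

	printf("requeue to k2: %d\n", toy_requeue(&w, &k2));	/* 0 */
	printf("requeue to k1: %d\n", toy_requeue(&w, &k1));	/* -EINVAL */
	return 0;
}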
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 13c68e71b726..c1660194d115 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data)
222} 222}
223EXPORT_SYMBOL(set_irq_chip_data); 223EXPORT_SYMBOL(set_irq_chip_data);
224 224
225/**
226 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
227 *
228 * @irq: Interrupt number
229 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
230 *
231 * The IRQ_NESTED_THREAD flag indicates that on
232 * request_threaded_irq() no separate interrupt thread should be
233 * created for the irq as the handler are called nested in the
234 * context of a demultiplexing interrupt handler thread.
235 */
236void set_irq_nested_thread(unsigned int irq, int nest)
237{
238 struct irq_desc *desc = irq_to_desc(irq);
239 unsigned long flags;
240
241 if (!desc)
242 return;
243
244 spin_lock_irqsave(&desc->lock, flags);
245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD;
247 else
248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags);
250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252
225/* 253/*
226 * default enable function 254 * default enable function
227 */ 255 */
@@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
299 } 327 }
300} 328}
301 329
330/*
331 * handle_nested_irq - Handle a nested irq from a irq thread
332 * @irq: the interrupt number
333 *
334 * Handle interrupts which are nested into a threaded interrupt
335 * handler. The handler function is called inside the calling
336 * thread's context.
337 */
338void handle_nested_irq(unsigned int irq)
339{
340 struct irq_desc *desc = irq_to_desc(irq);
341 struct irqaction *action;
342 irqreturn_t action_ret;
343
344 might_sleep();
345
346 spin_lock_irq(&desc->lock);
347
348 kstat_incr_irqs_this_cpu(irq, desc);
349
350 action = desc->action;
351 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
352 goto out_unlock;
353
354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock);
356
357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret);
360
361 spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS;
363
364out_unlock:
365 spin_unlock_irq(&desc->lock);
366}
367EXPORT_SYMBOL_GPL(handle_nested_irq);
368
302/** 369/**
303 * handle_simple_irq - Simple and software-decoded IRQs. 370 * handle_simple_irq - Simple and software-decoded IRQs.
304 * @irq: the interrupt number 371 * @irq: the interrupt number
@@ -382,7 +449,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
382 449
383 spin_lock(&desc->lock); 450 spin_lock(&desc->lock);
384 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
385 if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 452
453 if (unlikely(desc->status & IRQ_ONESHOT))
454 desc->status |= IRQ_MASKED;
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
386 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
387out_unlock: 457out_unlock:
388 spin_unlock(&desc->lock); 458 spin_unlock(&desc->lock);
@@ -572,6 +642,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
572 desc->chip = &dummy_irq_chip; 642 desc->chip = &dummy_irq_chip;
573 } 643 }
574 644
645 chip_bus_lock(irq, desc);
575 spin_lock_irqsave(&desc->lock, flags); 646 spin_lock_irqsave(&desc->lock, flags);
576 647
577 /* Uninstall? */ 648 /* Uninstall? */
@@ -591,6 +662,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
591 desc->chip->startup(irq); 662 desc->chip->startup(irq);
592 } 663 }
593 spin_unlock_irqrestore(&desc->lock, flags); 664 spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc);
594} 666}
595EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
596 668
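Taken together, set_irq_nested_thread() and handle_nested_irq() let a demultiplexing interrupt controller (an I2C or SPI expander, say) dispatch its child interrupts from its own threaded handler instead of spawning one thread per child. A hedged sketch of how a driver might wire this up; the expander structure, its status-read helper and the irq numbering are invented:

/* Mark the child irqs as nested: no per-child irq thread is created. */
static void expander_setup_child_irqs(struct expander *chip)
{
	int i;

	for (i = 0; i < chip->nr_children; i++) {
		unsigned int irq = chip->irq_base + i;

		set_irq_chip_data(irq, chip);
		set_irq_chip_and_handler(irq, &expander_irq_chip,
					 handle_simple_irq);
		set_irq_nested_thread(irq, 1);
	}
}

/* Threaded handler of the parent interrupt (request_threaded_irq()).
 * It runs in process context, so the nested handlers may sleep. */
static irqreturn_t expander_irq_thread(int irq, void *data)
{
	struct expander *chip = data;
	unsigned int status = expander_read_status(chip);	/* hypothetical bus read */
	int bit;

	for (bit = 0; bit < chip->nr_children; bit++)
		if (status & (1u << bit))
			handle_nested_irq(chip->irq_base + bit);

	return IRQ_HANDLED;
}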
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 065205bdd920..a81cf80554db 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -161,7 +161,7 @@ int __init early_irq_init(void)
161 161
162 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
163 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node; 164 node = first_online_node;
165 165
166 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
@@ -172,6 +172,9 @@ int __init early_irq_init(void)
172 172
173 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
174 desc[i].irq = i; 174 desc[i].irq = i;
175#ifdef CONFIG_SMP
176 desc[i].node = node;
177#endif
175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 178 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 179 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
177 alloc_desc_masks(&desc[i], node, true); 180 alloc_desc_masks(&desc[i], node, true);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e70ed5592eb9..1b5d742c6a77 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -44,6 +44,19 @@ extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void irq_set_thread_affinity(struct irq_desc *desc); 45extern void irq_set_thread_affinity(struct irq_desc *desc);
46 46
47/* Inline functions for support of irq chips on slow busses */
48static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
49{
50 if (unlikely(desc->chip->bus_lock))
51 desc->chip->bus_lock(irq);
52}
53
54static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
55{
56 if (unlikely(desc->chip->bus_sync_unlock))
57 desc->chip->bus_sync_unlock(irq);
58}
59
47/* 60/*
48 * Debugging printout: 61 * Debugging printout:
49 */ 62 */
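chip_bus_lock()/chip_bus_sync_unlock() are no-ops unless the irq_chip provides bus_lock/bus_sync_unlock callbacks, the hook intended for chips behind slow buses that cannot be reprogrammed with interrupts disabled. A hedged sketch of such a chip; the callback members match the fields dereferenced above, while the expander structure and its cache accessors are invented:

/* mask/unmask only touch a cached copy; bus_lock() takes a mutex and
 * bus_sync_unlock() pushes the cached mask out over the bus. */
static void expander_bus_lock(unsigned int irq)
{
	struct expander *chip = get_irq_chip_data(irq);

	mutex_lock(&chip->buslock);
}

static void expander_bus_sync_unlock(unsigned int irq)
{
	struct expander *chip = get_irq_chip_data(irq);

	expander_write_irq_mask(chip);		/* hypothetical I2C transfer */
	mutex_unlock(&chip->buslock);
}

static struct irq_chip expander_irq_chip = {
	.name			= "expander",
	.mask			= expander_irq_mask,	/* update cache only */
	.unmask			= expander_irq_unmask,	/* update cache only */
	.bus_lock		= expander_bus_lock,
	.bus_sync_unlock	= expander_bus_sync_unlock,
};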
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0ec9ed831737..bde4c667d24d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -230,9 +230,11 @@ void disable_irq_nosync(unsigned int irq)
230 if (!desc) 230 if (!desc)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc);
233 spin_lock_irqsave(&desc->lock, flags); 234 spin_lock_irqsave(&desc->lock, flags);
234 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
235 spin_unlock_irqrestore(&desc->lock, flags); 236 spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc);
236} 238}
237EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
238 240
@@ -294,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
294 * matches the last disable, processing of interrupts on this 296 * matches the last disable, processing of interrupts on this
295 * IRQ line is re-enabled. 297 * IRQ line is re-enabled.
296 * 298 *
297 * This function may be called from IRQ context. 299 * This function may be called from IRQ context only when
300 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
298 */ 301 */
299void enable_irq(unsigned int irq) 302void enable_irq(unsigned int irq)
300{ 303{
@@ -304,9 +307,11 @@ void enable_irq(unsigned int irq)
304 if (!desc) 307 if (!desc)
305 return; 308 return;
306 309
310 chip_bus_lock(irq, desc);
307 spin_lock_irqsave(&desc->lock, flags); 311 spin_lock_irqsave(&desc->lock, flags);
308 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
309 spin_unlock_irqrestore(&desc->lock, flags); 313 spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc);
310} 315}
311EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
312 317
@@ -436,6 +441,26 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
436 return ret; 441 return ret;
437} 442}
438 443
444/*
445 * Default primary interrupt handler for threaded interrupts. Is
446 * assigned as primary handler when request_threaded_irq is called
447 * with handler == NULL. Useful for oneshot interrupts.
448 */
449static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
450{
451 return IRQ_WAKE_THREAD;
452}
453
454/*
455 * Primary handler for nested threaded interrupts. Should never be
456 * called.
457 */
458static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
459{
460 WARN(1, "Primary handler called for nested irq %d\n", irq);
461 return IRQ_NONE;
462}
463
439static int irq_wait_for_interrupt(struct irqaction *action) 464static int irq_wait_for_interrupt(struct irqaction *action)
440{ 465{
441 while (!kthread_should_stop()) { 466 while (!kthread_should_stop()) {
@@ -451,6 +476,23 @@ static int irq_wait_for_interrupt(struct irqaction *action)
451 return -1; 476 return -1;
452} 477}
453 478
479/*
480 * Oneshot interrupts keep the irq line masked until the threaded
481 * handler has finished. Unmask it if the interrupt has not been disabled and
482 * is marked MASKED.
483 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{
486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq);
491 }
492 spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc);
494}
495
454#ifdef CONFIG_SMP 496#ifdef CONFIG_SMP
455/* 497/*
456 * Check whether we need to change the affinity of the interrupt thread. 498 * Check whether we need to change the affinity of the interrupt thread.
@@ -492,7 +534,7 @@ static int irq_thread(void *data)
492 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 534 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
493 struct irqaction *action = data; 535 struct irqaction *action = data;
494 struct irq_desc *desc = irq_to_desc(action->irq); 536 struct irq_desc *desc = irq_to_desc(action->irq);
495 int wake; 537 int wake, oneshot = desc->status & IRQ_ONESHOT;
496 538
497 sched_setscheduler(current, SCHED_FIFO, &param); 539 sched_setscheduler(current, SCHED_FIFO, &param);
498 current->irqaction = action; 540 current->irqaction = action;
@@ -518,6 +560,9 @@ static int irq_thread(void *data)
518 spin_unlock_irq(&desc->lock); 560 spin_unlock_irq(&desc->lock);
519 561
520 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563
564 if (oneshot)
565 irq_finalize_oneshot(action->irq, desc);
521 } 566 }
522 567
523 wake = atomic_dec_and_test(&desc->threads_active); 568 wake = atomic_dec_and_test(&desc->threads_active);
@@ -565,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
565 struct irqaction *old, **old_ptr; 610 struct irqaction *old, **old_ptr;
566 const char *old_name = NULL; 611 const char *old_name = NULL;
567 unsigned long flags; 612 unsigned long flags;
568 int shared = 0; 613 int nested, shared = 0;
569 int ret; 614 int ret;
570 615
571 if (!desc) 616 if (!desc)
@@ -590,10 +635,32 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
590 rand_initialize_irq(irq); 635 rand_initialize_irq(irq);
591 } 636 }
592 637
638 /* Oneshot interrupts are not allowed with shared */
639 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
640 return -EINVAL;
641
642 /*
643 * Check whether the interrupt nests into another interrupt
644 * thread.
645 */
646 nested = desc->status & IRQ_NESTED_THREAD;
647 if (nested) {
648 if (!new->thread_fn)
649 return -EINVAL;
650 /*
651 * Replace the primary handler which was provided by
652 * the driver for non-nested interrupt handling with the
653 * dummy function which warns when called.
654 */
655 new->handler = irq_nested_primary_handler;
656 }
657
593 /* 658 /*
594 * Threaded handler ? 659 * Create a handler thread when a thread function is supplied
660 * and the interrupt does not nest into another interrupt
661 * thread.
595 */ 662 */
596 if (new->thread_fn) { 663 if (new->thread_fn && !nested) {
597 struct task_struct *t; 664 struct task_struct *t;
598 665
599 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 666 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -662,9 +729,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
662 desc->status |= IRQ_PER_CPU; 729 desc->status |= IRQ_PER_CPU;
663#endif 730#endif
664 731
665 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 732 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
666 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 733 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
667 734
735 if (new->flags & IRQF_ONESHOT)
736 desc->status |= IRQ_ONESHOT;
737
668 if (!(desc->status & IRQ_NOAUTOEN)) { 738 if (!(desc->status & IRQ_NOAUTOEN)) {
669 desc->depth = 0; 739 desc->depth = 0;
670 desc->status &= ~IRQ_DISABLED; 740 desc->status &= ~IRQ_DISABLED;
@@ -875,7 +945,14 @@ EXPORT_SYMBOL_GPL(remove_irq);
875 */ 945 */
876void free_irq(unsigned int irq, void *dev_id) 946void free_irq(unsigned int irq, void *dev_id)
877{ 947{
948 struct irq_desc *desc = irq_to_desc(irq);
949
950 if (!desc)
951 return;
952
953 chip_bus_lock(irq, desc);
878 kfree(__free_irq(irq, dev_id)); 954 kfree(__free_irq(irq, dev_id));
955 chip_bus_sync_unlock(irq, desc);
879} 956}
880EXPORT_SYMBOL(free_irq); 957EXPORT_SYMBOL(free_irq);
881 958
@@ -884,6 +961,8 @@ EXPORT_SYMBOL(free_irq);
884 * @irq: Interrupt line to allocate 961 * @irq: Interrupt line to allocate
885 * @handler: Function to be called when the IRQ occurs. 962 * @handler: Function to be called when the IRQ occurs.
886 * Primary handler for threaded interrupts 963 * Primary handler for threaded interrupts
964 * If NULL and thread_fn != NULL the default
965 * primary handler is installed
887 * @thread_fn: Function called from the irq handler thread 966 * @thread_fn: Function called from the irq handler thread
888 * If NULL, no irq thread is created 967 * If NULL, no irq thread is created
889 * @irqflags: Interrupt type flags 968 * @irqflags: Interrupt type flags
@@ -963,8 +1042,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
963 1042
964 if (desc->status & IRQ_NOREQUEST) 1043 if (desc->status & IRQ_NOREQUEST)
965 return -EINVAL; 1044 return -EINVAL;
966 if (!handler) 1045
967 return -EINVAL; 1046 if (!handler) {
1047 if (!thread_fn)
1048 return -EINVAL;
1049 handler = irq_default_primary_handler;
1050 }
968 1051
969 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); 1052 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
970 if (!action) 1053 if (!action)
@@ -976,7 +1059,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
976 action->name = devname; 1059 action->name = devname;
977 action->dev_id = dev_id; 1060 action->dev_id = dev_id;
978 1061
1062 chip_bus_lock(irq, desc);
979 retval = __setup_irq(irq, desc, action); 1063 retval = __setup_irq(irq, desc, action);
1064 chip_bus_sync_unlock(irq, desc);
1065
980 if (retval) 1066 if (retval)
981 kfree(action); 1067 kfree(action);
982 1068
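With the changes above, a driver may pass handler == NULL to request_threaded_irq() together with IRQF_ONESHOT: the core substitutes irq_default_primary_handler(), which only returns IRQ_WAKE_THREAD, and irq_finalize_oneshot() keeps the line masked until the threaded handler has run. A hedged sketch of such a request; the sensor driver, its irq number and device pointer are invented:

static irqreturn_t sensor_irq_thread(int irq, void *dev_id)
{
	struct sensor *s = dev_id;

	sensor_read_and_clear_status(s);	/* hypothetical, may sleep */
	return IRQ_HANDLED;
}

static int sensor_setup_irq(struct sensor *s)
{
	return request_threaded_irq(s->irq,
				    NULL,		/* default primary handler */
				    sensor_irq_thread,
				    IRQF_ONESHOT,	/* not allowed with IRQF_SHARED */
				    "sensor", s);
}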
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 638d8bedec14..a0bb09e79867 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -15,10 +15,10 @@
15/** 15/**
16 * suspend_device_irqs - disable all currently enabled interrupt lines 16 * suspend_device_irqs - disable all currently enabled interrupt lines
17 * 17 *
18 * During system-wide suspend or hibernation device interrupts need to be 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * disabled at the chip level and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It disables all interrupt lines that are enabled at the moment and sets the 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * IRQ_SUSPENDED flag for them. 21 * and sets the IRQ_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 89c7117acf2b..090c3763f3a2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip || !desc->chip->retrigger || 73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
74 !desc->chip->retrigger(irq)) {
75#ifdef CONFIG_HARDIRQS_SW_RESEND 74#ifdef CONFIG_HARDIRQS_SW_RESEND
76 /* Set it pending and activate the softirq: */ 75 /* Set it pending and activate the softirq: */
77 set_bit(irq, irqs_resend); 76 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 4d568294de3e..114e704760fe 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str)
297 297
298__setup("irqfixup", irqfixup_setup); 298__setup("irqfixup", irqfixup_setup);
299module_param(irqfixup, int, 0644); 299module_param(irqfixup, int, 0644);
300MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
301 300
302static int __init irqpoll_setup(char *str) 301static int __init irqpoll_setup(char *str)
303{ 302{
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 4e8cae2e9148..9fcb53a11f87 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,8 @@
37#include <linux/suspend.h> 37#include <linux/suspend.h>
38#include <asm/uaccess.h> 38#include <asm/uaccess.h>
39 39
40#include <trace/events/module.h>
41
40extern int max_threads; 42extern int max_threads;
41 43
42static struct workqueue_struct *khelper_wq; 44static struct workqueue_struct *khelper_wq;
@@ -112,6 +114,8 @@ int __request_module(bool wait, const char *fmt, ...)
112 return -ENOMEM; 114 return -ENOMEM;
113 } 115 }
114 116
117 trace_module_request(module_name, wait, _RET_IP_);
118
115 ret = call_usermodehelper(modprobe_path, argv, envp, 119 ret = call_usermodehelper(modprobe_path, argv, envp,
116 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
117 atomic_dec(&kmod_concurrent); 121 atomic_dec(&kmod_concurrent);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0540948e29ab..ef177d653b2c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) 103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
104 104
105struct kprobe_insn_page { 105struct kprobe_insn_page {
106 struct hlist_node hlist; 106 struct list_head list;
107 kprobe_opcode_t *insns; /* Page of instruction slots */ 107 kprobe_opcode_t *insns; /* Page of instruction slots */
108 char slot_used[INSNS_PER_PAGE]; 108 char slot_used[INSNS_PER_PAGE];
109 int nused; 109 int nused;
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
117}; 117};
118 118
119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
120static struct hlist_head kprobe_insn_pages; 120static LIST_HEAD(kprobe_insn_pages);
121static int kprobe_garbage_slots; 121static int kprobe_garbage_slots;
122static int collect_garbage_slots(void); 122static int collect_garbage_slots(void);
123 123
@@ -152,10 +152,9 @@ loop_end:
152static kprobe_opcode_t __kprobes *__get_insn_slot(void) 152static kprobe_opcode_t __kprobes *__get_insn_slot(void)
153{ 153{
154 struct kprobe_insn_page *kip; 154 struct kprobe_insn_page *kip;
155 struct hlist_node *pos;
156 155
157 retry: 156 retry:
158 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 157 list_for_each_entry(kip, &kprobe_insn_pages, list) {
159 if (kip->nused < INSNS_PER_PAGE) { 158 if (kip->nused < INSNS_PER_PAGE) {
160 int i; 159 int i;
161 for (i = 0; i < INSNS_PER_PAGE; i++) { 160 for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
189 kfree(kip); 188 kfree(kip);
190 return NULL; 189 return NULL;
191 } 190 }
192 INIT_HLIST_NODE(&kip->hlist); 191 INIT_LIST_HEAD(&kip->list);
193 hlist_add_head(&kip->hlist, &kprobe_insn_pages); 192 list_add(&kip->list, &kprobe_insn_pages);
194 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); 193 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
195 kip->slot_used[0] = SLOT_USED; 194 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 195 kip->nused = 1;
@@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
219 * so as not to have to set it up again the 218 * so as not to have to set it up again the
220 * next time somebody inserts a probe. 219 * next time somebody inserts a probe.
221 */ 220 */
222 hlist_del(&kip->hlist); 221 if (!list_is_singular(&kprobe_insn_pages)) {
223 if (hlist_empty(&kprobe_insn_pages)) { 222 list_del(&kip->list);
224 INIT_HLIST_NODE(&kip->hlist);
225 hlist_add_head(&kip->hlist,
226 &kprobe_insn_pages);
227 } else {
228 module_free(NULL, kip->insns); 223 module_free(NULL, kip->insns);
229 kfree(kip); 224 kfree(kip);
230 } 225 }
@@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
235 230
236static int __kprobes collect_garbage_slots(void) 231static int __kprobes collect_garbage_slots(void)
237{ 232{
238 struct kprobe_insn_page *kip; 233 struct kprobe_insn_page *kip, *next;
239 struct hlist_node *pos, *next;
240 234
241 /* Ensure no one is preempted while executing from a garbage slot */ 235 /* Ensure no one is preempted while executing from a garbage slot */
242 if (check_safety()) 236 if (check_safety())
243 return -EAGAIN; 237 return -EAGAIN;
244 238
245 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 239 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
246 int i; 240 int i;
247 if (kip->ngarbage == 0) 241 if (kip->ngarbage == 0)
248 continue; 242 continue;
@@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
260void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 254void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
261{ 255{
262 struct kprobe_insn_page *kip; 256 struct kprobe_insn_page *kip;
263 struct hlist_node *pos;
264 257
265 mutex_lock(&kprobe_insn_mutex); 258 mutex_lock(&kprobe_insn_mutex);
266 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 259 list_for_each_entry(kip, &kprobe_insn_pages, list) {
267 if (kip->insns <= slot && 260 if (kip->insns <= slot &&
268 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 261 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
269 int i = (slot - kip->insns) / MAX_INSN_SIZE; 262 int i = (slot - kip->insns) / MAX_INSN_SIZE;
270 if (dirty) { 263 if (dirty) {
271 kip->slot_used[i] = SLOT_DIRTY; 264 kip->slot_used[i] = SLOT_DIRTY;
272 kip->ngarbage++; 265 kip->ngarbage++;
273 } else { 266 } else
274 collect_one_slot(kip, i); 267 collect_one_slot(kip, i);
275 }
276 break; 268 break;
277 } 269 }
278 } 270 }
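The kprobes hunk replaces the hlist of instruction pages with a regular list_head, so garbage collection becomes a plain list_for_each_entry_safe() walk and list_is_singular() decides whether the last page is kept. The essential pattern is "cache the next node before the current one may be freed"; a standalone user-space illustration of that idiom with a minimal hand-rolled list (not the kernel list API):

#include <stdio.h>
#include <stdlib.h>

struct page_node {
	int ngarbage;
	struct page_node *next;
};

/* Free every node that has garbage slots, keeping the rest linked. */
static void collect(struct page_node **head)
{
	struct page_node **link = head;
	struct page_node *cur, *next;

	for (cur = *head; cur; cur = next) {
		next = cur->next;		/* cache before freeing */
		if (cur->ngarbage) {
			*link = next;		/* unlink */
			free(cur);
		} else {
			link = &cur->next;
		}
	}
}

int main(void)
{
	struct page_node *c = malloc(sizeof(*c));
	struct page_node *b = malloc(sizeof(*b));
	struct page_node *a = malloc(sizeof(*a));

	a->ngarbage = 0; a->next = b;
	b->ngarbage = 3; b->next = c;	/* the middle page is full of garbage */
	c->ngarbage = 0; c->next = NULL;

	collect(&a);				/* frees b, keeps a -> c */
	printf("%p -> %p\n", (void *)a, (void *)a->next);
	return 0;
}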
diff --git a/kernel/kthread.c b/kernel/kthread.c
index eb8751aa0418..5fe709982caa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,8 +16,6 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <trace/events/sched.h> 17#include <trace/events/sched.h>
18 18
19#define KTHREAD_NICE_LEVEL (-5)
20
21static DEFINE_SPINLOCK(kthread_create_lock); 19static DEFINE_SPINLOCK(kthread_create_lock);
22static LIST_HEAD(kthread_create_list); 20static LIST_HEAD(kthread_create_list);
23struct task_struct *kthreadd_task; 21struct task_struct *kthreadd_task;
@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
145 * The kernel thread should not inherit these properties. 143 * The kernel thread should not inherit these properties.
146 */ 144 */
147 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); 145 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
148 set_user_nice(create.result, KTHREAD_NICE_LEVEL);
149 set_cpus_allowed_ptr(create.result, cpu_all_mask); 146 set_cpus_allowed_ptr(create.result, cpu_all_mask);
150 } 147 }
151 return create.result; 148 return create.result;
@@ -221,7 +218,6 @@ int kthreadd(void *unused)
221 /* Setup a clean context for our children to inherit. */ 218 /* Setup a clean context for our children to inherit. */
222 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
223 ignore_signals(tsk); 220 ignore_signals(tsk);
224 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
225 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
226 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_possible_map);
227 223
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8bbeef996c76..f74d2d7aa605 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h>
45 46
46#include <asm/sections.h> 47#include <asm/sections.h>
47 48
@@ -366,11 +367,21 @@ static int save_trace(struct stack_trace *trace)
366 367
367 save_stack_trace(trace); 368 save_stack_trace(trace);
368 369
370 /*
371 * Some daft arches put -1 at the end to indicate its a full trace.
372 *
373 * <rant> this is buggy anyway, since it takes a whole extra entry so a
374 * complete trace that maxes out the entries provided will be reported
375 * as incomplete, friggin useless </rant>
376 */
377 if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
378 trace->nr_entries--;
379
369 trace->max_entries = trace->nr_entries; 380 trace->max_entries = trace->nr_entries;
370 381
371 nr_stack_trace_entries += trace->nr_entries; 382 nr_stack_trace_entries += trace->nr_entries;
372 383
373 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 384 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
374 if (!debug_locks_off_graph_unlock()) 385 if (!debug_locks_off_graph_unlock())
375 return 0; 386 return 0;
376 387
@@ -388,20 +399,6 @@ unsigned int nr_hardirq_chains;
388unsigned int nr_softirq_chains; 399unsigned int nr_softirq_chains;
389unsigned int nr_process_chains; 400unsigned int nr_process_chains;
390unsigned int max_lockdep_depth; 401unsigned int max_lockdep_depth;
391unsigned int max_recursion_depth;
392
393static unsigned int lockdep_dependency_gen_id;
394
395static bool lockdep_dependency_visit(struct lock_class *source,
396 unsigned int depth)
397{
398 if (!depth)
399 lockdep_dependency_gen_id++;
400 if (source->dep_gen_id == lockdep_dependency_gen_id)
401 return true;
402 source->dep_gen_id = lockdep_dependency_gen_id;
403 return false;
404}
405 402
406#ifdef CONFIG_DEBUG_LOCKDEP 403#ifdef CONFIG_DEBUG_LOCKDEP
407/* 404/*
@@ -431,11 +428,8 @@ atomic_t redundant_softirqs_on;
431atomic_t redundant_softirqs_off; 428atomic_t redundant_softirqs_off;
432atomic_t nr_unused_locks; 429atomic_t nr_unused_locks;
433atomic_t nr_cyclic_checks; 430atomic_t nr_cyclic_checks;
434atomic_t nr_cyclic_check_recursions;
435atomic_t nr_find_usage_forwards_checks; 431atomic_t nr_find_usage_forwards_checks;
436atomic_t nr_find_usage_forwards_recursions;
437atomic_t nr_find_usage_backwards_checks; 432atomic_t nr_find_usage_backwards_checks;
438atomic_t nr_find_usage_backwards_recursions;
439#endif 433#endif
440 434
441/* 435/*
@@ -551,58 +545,6 @@ static void lockdep_print_held_locks(struct task_struct *curr)
551 } 545 }
552} 546}
553 547
554static void print_lock_class_header(struct lock_class *class, int depth)
555{
556 int bit;
557
558 printk("%*s->", depth, "");
559 print_lock_name(class);
560 printk(" ops: %lu", class->ops);
561 printk(" {\n");
562
563 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
564 if (class->usage_mask & (1 << bit)) {
565 int len = depth;
566
567 len += printk("%*s %s", depth, "", usage_str[bit]);
568 len += printk(" at:\n");
569 print_stack_trace(class->usage_traces + bit, len);
570 }
571 }
572 printk("%*s }\n", depth, "");
573
574 printk("%*s ... key at: ",depth,"");
575 print_ip_sym((unsigned long)class->key);
576}
577
578/*
579 * printk all lock dependencies starting at <entry>:
580 */
581static void __used
582print_lock_dependencies(struct lock_class *class, int depth)
583{
584 struct lock_list *entry;
585
586 if (lockdep_dependency_visit(class, depth))
587 return;
588
589 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
590 return;
591
592 print_lock_class_header(class, depth);
593
594 list_for_each_entry(entry, &class->locks_after, entry) {
595 if (DEBUG_LOCKS_WARN_ON(!entry->class))
596 return;
597
598 print_lock_dependencies(entry->class, depth + 1);
599
600 printk("%*s ... acquired at:\n",depth,"");
601 print_stack_trace(&entry->trace, 2);
602 printk("\n");
603 }
604}
605
606static void print_kernel_version(void) 548static void print_kernel_version(void)
607{ 549{
608 printk("%s %.*s\n", init_utsname()->release, 550 printk("%s %.*s\n", init_utsname()->release,
@@ -898,22 +840,203 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
898} 840}
899 841
900/* 842/*
843 * To keep the modulo operation cheap, the queue size is a power of 2
844 */
845#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
846#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
847
848/*
849 * The circular_queue and its helpers are used to implement the
850 * breadth-first search (BFS) algorithm, by which we can build
851 * the shortest path from the next lock to be acquired to a
852 * previously held lock if there is a circular dependency between them.
853 */
854struct circular_queue {
855 unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
856 unsigned int front, rear;
857};
858
859static struct circular_queue lock_cq;
860
861unsigned int max_bfs_queue_depth;
862
863static unsigned int lockdep_dependency_gen_id;
864
865static inline void __cq_init(struct circular_queue *cq)
866{
867 cq->front = cq->rear = 0;
868 lockdep_dependency_gen_id++;
869}
870
871static inline int __cq_empty(struct circular_queue *cq)
872{
873 return (cq->front == cq->rear);
874}
875
876static inline int __cq_full(struct circular_queue *cq)
877{
878 return ((cq->rear + 1) & CQ_MASK) == cq->front;
879}
880
881static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
882{
883 if (__cq_full(cq))
884 return -1;
885
886 cq->element[cq->rear] = elem;
887 cq->rear = (cq->rear + 1) & CQ_MASK;
888 return 0;
889}
890
891static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
892{
893 if (__cq_empty(cq))
894 return -1;
895
896 *elem = cq->element[cq->front];
897 cq->front = (cq->front + 1) & CQ_MASK;
898 return 0;
899}
900
901static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
902{
903 return (cq->rear - cq->front) & CQ_MASK;
904}
905
906static inline void mark_lock_accessed(struct lock_list *lock,
907 struct lock_list *parent)
908{
909 unsigned long nr;
910
911 nr = lock - list_entries;
912 WARN_ON(nr >= nr_list_entries);
913 lock->parent = parent;
914 lock->class->dep_gen_id = lockdep_dependency_gen_id;
915}
916
917static inline unsigned long lock_accessed(struct lock_list *lock)
918{
919 unsigned long nr;
920
921 nr = lock - list_entries;
922 WARN_ON(nr >= nr_list_entries);
923 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
924}
925
926static inline struct lock_list *get_lock_parent(struct lock_list *child)
927{
928 return child->parent;
929}
930
931static inline int get_lock_depth(struct lock_list *child)
932{
933 int depth = 0;
934 struct lock_list *parent;
935
936 while ((parent = get_lock_parent(child))) {
937 child = parent;
938 depth++;
939 }
940 return depth;
941}
942
943static int __bfs(struct lock_list *source_entry,
944 void *data,
945 int (*match)(struct lock_list *entry, void *data),
946 struct lock_list **target_entry,
947 int forward)
948{
949 struct lock_list *entry;
950 struct list_head *head;
951 struct circular_queue *cq = &lock_cq;
952 int ret = 1;
953
954 if (match(source_entry, data)) {
955 *target_entry = source_entry;
956 ret = 0;
957 goto exit;
958 }
959
960 if (forward)
961 head = &source_entry->class->locks_after;
962 else
963 head = &source_entry->class->locks_before;
964
965 if (list_empty(head))
966 goto exit;
967
968 __cq_init(cq);
969 __cq_enqueue(cq, (unsigned long)source_entry);
970
971 while (!__cq_empty(cq)) {
972 struct lock_list *lock;
973
974 __cq_dequeue(cq, (unsigned long *)&lock);
975
976 if (!lock->class) {
977 ret = -2;
978 goto exit;
979 }
980
981 if (forward)
982 head = &lock->class->locks_after;
983 else
984 head = &lock->class->locks_before;
985
986 list_for_each_entry(entry, head, entry) {
987 if (!lock_accessed(entry)) {
988 unsigned int cq_depth;
989 mark_lock_accessed(entry, lock);
990 if (match(entry, data)) {
991 *target_entry = entry;
992 ret = 0;
993 goto exit;
994 }
995
996 if (__cq_enqueue(cq, (unsigned long)entry)) {
997 ret = -1;
998 goto exit;
999 }
1000 cq_depth = __cq_get_elem_count(cq);
1001 if (max_bfs_queue_depth < cq_depth)
1002 max_bfs_queue_depth = cq_depth;
1003 }
1004 }
1005 }
1006exit:
1007 return ret;
1008}
1009
1010static inline int __bfs_forwards(struct lock_list *src_entry,
1011 void *data,
1012 int (*match)(struct lock_list *entry, void *data),
1013 struct lock_list **target_entry)
1014{
1015 return __bfs(src_entry, data, match, target_entry, 1);
1016
1017}
1018
1019static inline int __bfs_backwards(struct lock_list *src_entry,
1020 void *data,
1021 int (*match)(struct lock_list *entry, void *data),
1022 struct lock_list **target_entry)
1023{
1024 return __bfs(src_entry, data, match, target_entry, 0);
1025
1026}
1027
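Two details of __bfs() are easy to miss: "visited" marking is done with a global generation counter (lockdep_dependency_gen_id is bumped in __cq_init(), and lock_accessed() compares against it, so old marks expire without being cleared), and mark_lock_accessed() records a parent pointer so the caller can later walk the shortest path back, as print_circular_bug() does. A compact user-space analogue of that pattern, on a toy graph with hypothetical names rather than the lockdep data structures:

	#include <stdio.h>

	#define NNODES 6

	static const int adj[NNODES][NNODES] = {      /* tiny directed graph */
		[0] = { [1] = 1, [2] = 1 },
		[1] = { [3] = 1 },
		[2] = { [3] = 1, [4] = 1 },
		[4] = { [5] = 1 },
	};

	static unsigned int gen_id;                   /* like lockdep_dependency_gen_id */
	static unsigned int node_gen[NNODES];         /* "visited" marker per node      */
	static int parent[NNODES];                    /* shortest-path tree             */

	static int bfs(int src, int dst)
	{
		int queue[NNODES], front = 0, rear = 0, u, v;

		gen_id++;                             /* invalidates all old marks */
		node_gen[src] = gen_id;
		parent[src] = -1;
		queue[rear++] = src;

		while (front != rear) {
			u = queue[front++];
			for (v = 0; v < NNODES; v++) {
				if (!adj[u][v] || node_gen[v] == gen_id)
					continue;
				node_gen[v] = gen_id; /* mark_lock_accessed()      */
				parent[v] = u;
				if (v == dst)
					return 0;     /* found: 0, like __bfs()    */
				queue[rear++] = v;
			}
		}
		return 1;                             /* no path, like __bfs()     */
	}

	int main(void)
	{
		if (bfs(0, 5) == 0) {
			int n;
			/* walk parent pointers, like print_circular_bug() */
			for (n = 5; n != -1; n = parent[n])
				printf("%d%s", n, parent[n] == -1 ? "\n" : " <- ");
		}
		return 0;
	}
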
1028/*
901 * Recursive, forwards-direction lock-dependency checking, used for 1029 * Recursive, forwards-direction lock-dependency checking, used for
902 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe 1030 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
903 * checking. 1031 * checking.
904 *
905 * (to keep the stackframe of the recursive functions small we
906 * use these global variables, and we also mark various helper
907 * functions as noinline.)
908 */ 1032 */
909static struct held_lock *check_source, *check_target;
910 1033
911/* 1034/*
912 * Print a dependency chain entry (this is only done when a deadlock 1035 * Print a dependency chain entry (this is only done when a deadlock
913 * has been detected): 1036 * has been detected):
914 */ 1037 */
915static noinline int 1038static noinline int
916print_circular_bug_entry(struct lock_list *target, unsigned int depth) 1039print_circular_bug_entry(struct lock_list *target, int depth)
917{ 1040{
918 if (debug_locks_silent) 1041 if (debug_locks_silent)
919 return 0; 1042 return 0;
@@ -930,11 +1053,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
930 * header first: 1053 * header first:
931 */ 1054 */
932static noinline int 1055static noinline int
933print_circular_bug_header(struct lock_list *entry, unsigned int depth) 1056print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1057 struct held_lock *check_src,
1058 struct held_lock *check_tgt)
934{ 1059{
935 struct task_struct *curr = current; 1060 struct task_struct *curr = current;
936 1061
937 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1062 if (debug_locks_silent)
938 return 0; 1063 return 0;
939 1064
940 printk("\n=======================================================\n"); 1065 printk("\n=======================================================\n");
@@ -943,9 +1068,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
943 printk( "-------------------------------------------------------\n"); 1068 printk( "-------------------------------------------------------\n");
944 printk("%s/%d is trying to acquire lock:\n", 1069 printk("%s/%d is trying to acquire lock:\n",
945 curr->comm, task_pid_nr(curr)); 1070 curr->comm, task_pid_nr(curr));
946 print_lock(check_source); 1071 print_lock(check_src);
947 printk("\nbut task is already holding lock:\n"); 1072 printk("\nbut task is already holding lock:\n");
948 print_lock(check_target); 1073 print_lock(check_tgt);
949 printk("\nwhich lock already depends on the new lock.\n\n"); 1074 printk("\nwhich lock already depends on the new lock.\n\n");
950 printk("\nthe existing dependency chain (in reverse order) is:\n"); 1075 printk("\nthe existing dependency chain (in reverse order) is:\n");
951 1076
@@ -954,19 +1079,36 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
954 return 0; 1079 return 0;
955} 1080}
956 1081
957static noinline int print_circular_bug_tail(void) 1082static inline int class_equal(struct lock_list *entry, void *data)
1083{
1084 return entry->class == data;
1085}
1086
1087static noinline int print_circular_bug(struct lock_list *this,
1088 struct lock_list *target,
1089 struct held_lock *check_src,
1090 struct held_lock *check_tgt)
958{ 1091{
959 struct task_struct *curr = current; 1092 struct task_struct *curr = current;
960 struct lock_list this; 1093 struct lock_list *parent;
1094 int depth;
961 1095
962 if (debug_locks_silent) 1096 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
963 return 0; 1097 return 0;
964 1098
965 this.class = hlock_class(check_source); 1099 if (!save_trace(&this->trace))
966 if (!save_trace(&this.trace))
967 return 0; 1100 return 0;
968 1101
969 print_circular_bug_entry(&this, 0); 1102 depth = get_lock_depth(target);
1103
1104 print_circular_bug_header(target, depth, check_src, check_tgt);
1105
1106 parent = get_lock_parent(target);
1107
1108 while (parent) {
1109 print_circular_bug_entry(parent, --depth);
1110 parent = get_lock_parent(parent);
1111 }
970 1112
971 printk("\nother info that might help us debug this:\n\n"); 1113 printk("\nother info that might help us debug this:\n\n");
972 lockdep_print_held_locks(curr); 1114 lockdep_print_held_locks(curr);
@@ -977,73 +1119,69 @@ static noinline int print_circular_bug_tail(void)
977 return 0; 1119 return 0;
978} 1120}
979 1121
980#define RECURSION_LIMIT 40 1122static noinline int print_bfs_bug(int ret)
981
982static int noinline print_infinite_recursion_bug(void)
983{ 1123{
984 if (!debug_locks_off_graph_unlock()) 1124 if (!debug_locks_off_graph_unlock())
985 return 0; 1125 return 0;
986 1126
987 WARN_ON(1); 1127 WARN(1, "lockdep bfs error:%d\n", ret);
988 1128
989 return 0; 1129 return 0;
990} 1130}
991 1131
992unsigned long __lockdep_count_forward_deps(struct lock_class *class, 1132static int noop_count(struct lock_list *entry, void *data)
993 unsigned int depth)
994{ 1133{
995 struct lock_list *entry; 1134 (*(unsigned long *)data)++;
996 unsigned long ret = 1; 1135 return 0;
1136}
997 1137
998 if (lockdep_dependency_visit(class, depth)) 1138unsigned long __lockdep_count_forward_deps(struct lock_list *this)
999 return 0; 1139{
1140 unsigned long count = 0;
1141 struct lock_list *uninitialized_var(target_entry);
1000 1142
1001 /* 1143 __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
1002 * Recurse this class's dependency list:
1003 */
1004 list_for_each_entry(entry, &class->locks_after, entry)
1005 ret += __lockdep_count_forward_deps(entry->class, depth + 1);
1006 1144
1007 return ret; 1145 return count;
1008} 1146}
1009
1010unsigned long lockdep_count_forward_deps(struct lock_class *class) 1147unsigned long lockdep_count_forward_deps(struct lock_class *class)
1011{ 1148{
1012 unsigned long ret, flags; 1149 unsigned long ret, flags;
1150 struct lock_list this;
1151
1152 this.parent = NULL;
1153 this.class = class;
1013 1154
1014 local_irq_save(flags); 1155 local_irq_save(flags);
1015 __raw_spin_lock(&lockdep_lock); 1156 __raw_spin_lock(&lockdep_lock);
1016 ret = __lockdep_count_forward_deps(class, 0); 1157 ret = __lockdep_count_forward_deps(&this);
1017 __raw_spin_unlock(&lockdep_lock); 1158 __raw_spin_unlock(&lockdep_lock);
1018 local_irq_restore(flags); 1159 local_irq_restore(flags);
1019 1160
1020 return ret; 1161 return ret;
1021} 1162}
1022 1163
1023unsigned long __lockdep_count_backward_deps(struct lock_class *class, 1164unsigned long __lockdep_count_backward_deps(struct lock_list *this)
1024 unsigned int depth)
1025{ 1165{
1026 struct lock_list *entry; 1166 unsigned long count = 0;
1027 unsigned long ret = 1; 1167 struct lock_list *uninitialized_var(target_entry);
1028 1168
1029 if (lockdep_dependency_visit(class, depth)) 1169 __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
1030 return 0;
1031 /*
1032 * Recurse this class's dependency list:
1033 */
1034 list_for_each_entry(entry, &class->locks_before, entry)
1035 ret += __lockdep_count_backward_deps(entry->class, depth + 1);
1036 1170
1037 return ret; 1171 return count;
1038} 1172}
1039 1173
1040unsigned long lockdep_count_backward_deps(struct lock_class *class) 1174unsigned long lockdep_count_backward_deps(struct lock_class *class)
1041{ 1175{
1042 unsigned long ret, flags; 1176 unsigned long ret, flags;
1177 struct lock_list this;
1178
1179 this.parent = NULL;
1180 this.class = class;
1043 1181
1044 local_irq_save(flags); 1182 local_irq_save(flags);
1045 __raw_spin_lock(&lockdep_lock); 1183 __raw_spin_lock(&lockdep_lock);
1046 ret = __lockdep_count_backward_deps(class, 0); 1184 ret = __lockdep_count_backward_deps(&this);
1047 __raw_spin_unlock(&lockdep_lock); 1185 __raw_spin_unlock(&lockdep_lock);
1048 local_irq_restore(flags); 1186 local_irq_restore(flags);
1049 1187
@@ -1055,29 +1193,16 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1055 * lead to <target>. Print an error and return 0 if it does. 1193 * lead to <target>. Print an error and return 0 if it does.
1056 */ 1194 */
1057static noinline int 1195static noinline int
1058check_noncircular(struct lock_class *source, unsigned int depth) 1196check_noncircular(struct lock_list *root, struct lock_class *target,
1197 struct lock_list **target_entry)
1059{ 1198{
1060 struct lock_list *entry; 1199 int result;
1061 1200
1062 if (lockdep_dependency_visit(source, depth)) 1201 debug_atomic_inc(&nr_cyclic_checks);
1063 return 1;
1064 1202
1065 debug_atomic_inc(&nr_cyclic_check_recursions); 1203 result = __bfs_forwards(root, target, class_equal, target_entry);
1066 if (depth > max_recursion_depth) 1204
1067 max_recursion_depth = depth; 1205 return result;
1068 if (depth >= RECURSION_LIMIT)
1069 return print_infinite_recursion_bug();
1070 /*
1071 * Check this lock's dependency list:
1072 */
1073 list_for_each_entry(entry, &source->locks_after, entry) {
1074 if (entry->class == hlock_class(check_target))
1075 return print_circular_bug_header(entry, depth+1);
1076 debug_atomic_inc(&nr_cyclic_checks);
1077 if (!check_noncircular(entry->class, depth+1))
1078 return print_circular_bug_entry(entry, depth+1);
1079 }
1080 return 1;
1081} 1206}
1082 1207
1083#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1208#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
@@ -1086,103 +1211,121 @@ check_noncircular(struct lock_class *source, unsigned int depth)
1086 * proving that two subgraphs can be connected by a new dependency 1211 * proving that two subgraphs can be connected by a new dependency
1087 * without creating any illegal irq-safe -> irq-unsafe lock dependency. 1212 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
1088 */ 1213 */
1089static enum lock_usage_bit find_usage_bit; 1214
1090static struct lock_class *forwards_match, *backwards_match; 1215static inline int usage_match(struct lock_list *entry, void *bit)
1216{
1217 return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
1218}
1219
1220
1091 1221
1092/* 1222/*
1093 * Find a node in the forwards-direction dependency sub-graph starting 1223 * Find a node in the forwards-direction dependency sub-graph starting
1094 * at <source> that matches <find_usage_bit>. 1224 * at @root->class that matches @bit.
1095 * 1225 *
1096 * Return 2 if such a node exists in the subgraph, and put that node 1226 * Return 0 if such a node exists in the subgraph, and put that node
1097 * into <forwards_match>. 1227 * into *@target_entry.
1098 * 1228 *
1099 * Return 1 otherwise and keep <forwards_match> unchanged. 1229 * Return 1 otherwise and keep *@target_entry unchanged.
1100 * Return 0 on error. 1230 * Return <0 on error.
1101 */ 1231 */
1102static noinline int 1232static int
1103find_usage_forwards(struct lock_class *source, unsigned int depth) 1233find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1234 struct lock_list **target_entry)
1104{ 1235{
1105 struct lock_list *entry; 1236 int result;
1106 int ret;
1107
1108 if (lockdep_dependency_visit(source, depth))
1109 return 1;
1110
1111 if (depth > max_recursion_depth)
1112 max_recursion_depth = depth;
1113 if (depth >= RECURSION_LIMIT)
1114 return print_infinite_recursion_bug();
1115 1237
1116 debug_atomic_inc(&nr_find_usage_forwards_checks); 1238 debug_atomic_inc(&nr_find_usage_forwards_checks);
1117 if (source->usage_mask & (1 << find_usage_bit)) {
1118 forwards_match = source;
1119 return 2;
1120 }
1121 1239
1122 /* 1240 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1123 * Check this lock's dependency list: 1241
1124 */ 1242 return result;
1125 list_for_each_entry(entry, &source->locks_after, entry) {
1126 debug_atomic_inc(&nr_find_usage_forwards_recursions);
1127 ret = find_usage_forwards(entry->class, depth+1);
1128 if (ret == 2 || ret == 0)
1129 return ret;
1130 }
1131 return 1;
1132} 1243}
1133 1244
1134/* 1245/*
1135 * Find a node in the backwards-direction dependency sub-graph starting 1246 * Find a node in the backwards-direction dependency sub-graph starting
1136 * at <source> that matches <find_usage_bit>. 1247 * at @root->class that matches @bit.
1137 * 1248 *
1138 * Return 2 if such a node exists in the subgraph, and put that node 1249 * Return 0 if such a node exists in the subgraph, and put that node
1139 * into <backwards_match>. 1250 * into *@target_entry.
1140 * 1251 *
1141 * Return 1 otherwise and keep <backwards_match> unchanged. 1252 * Return 1 otherwise and keep *@target_entry unchanged.
1142 * Return 0 on error. 1253 * Return <0 on error.
1143 */ 1254 */
1144static noinline int 1255static int
1145find_usage_backwards(struct lock_class *source, unsigned int depth) 1256find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1257 struct lock_list **target_entry)
1146{ 1258{
1147 struct lock_list *entry; 1259 int result;
1148 int ret;
1149 1260
1150 if (lockdep_dependency_visit(source, depth)) 1261 debug_atomic_inc(&nr_find_usage_backwards_checks);
1151 return 1;
1152 1262
1153 if (!__raw_spin_is_locked(&lockdep_lock)) 1263 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1154 return DEBUG_LOCKS_WARN_ON(1);
1155 1264
1156 if (depth > max_recursion_depth) 1265 return result;
1157 max_recursion_depth = depth; 1266}
1158 if (depth >= RECURSION_LIMIT)
1159 return print_infinite_recursion_bug();
1160 1267
1161 debug_atomic_inc(&nr_find_usage_backwards_checks); 1268static void print_lock_class_header(struct lock_class *class, int depth)
1162 if (source->usage_mask & (1 << find_usage_bit)) { 1269{
1163 backwards_match = source; 1270 int bit;
1164 return 2;
1165 }
1166 1271
1167 if (!source && debug_locks_off_graph_unlock()) { 1272 printk("%*s->", depth, "");
1168 WARN_ON(1); 1273 print_lock_name(class);
1169 return 0; 1274 printk(" ops: %lu", class->ops);
1170 } 1275 printk(" {\n");
1171 1276
1172 /* 1277 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
1173 * Check this lock's dependency list: 1278 if (class->usage_mask & (1 << bit)) {
1174 */ 1279 int len = depth;
1175 list_for_each_entry(entry, &source->locks_before, entry) { 1280
1176 debug_atomic_inc(&nr_find_usage_backwards_recursions); 1281 len += printk("%*s %s", depth, "", usage_str[bit]);
1177 ret = find_usage_backwards(entry->class, depth+1); 1282 len += printk(" at:\n");
1178 if (ret == 2 || ret == 0) 1283 print_stack_trace(class->usage_traces + bit, len);
1179 return ret; 1284 }
1180 } 1285 }
1181 return 1; 1286 printk("%*s }\n", depth, "");
1287
1288 printk("%*s ... key at: ",depth,"");
1289 print_ip_sym((unsigned long)class->key);
1290}
1291
1292/*
 1293 * printk the shortest lock dependencies from @leaf to @root in reverse order:
1294 */
1295static void __used
1296print_shortest_lock_dependencies(struct lock_list *leaf,
1297 struct lock_list *root)
1298{
1299 struct lock_list *entry = leaf;
1300 int depth;
1301
 1302 /* compute the depth from the tree generated by BFS */
1303 depth = get_lock_depth(leaf);
1304
1305 do {
1306 print_lock_class_header(entry->class, depth);
1307 printk("%*s ... acquired at:\n", depth, "");
1308 print_stack_trace(&entry->trace, 2);
1309 printk("\n");
1310
1311 if (depth == 0 && (entry != root)) {
1312 printk("lockdep:%s bad BFS generated tree\n", __func__);
1313 break;
1314 }
1315
1316 entry = get_lock_parent(entry);
1317 depth--;
1318 } while (entry && (depth >= 0));
1319
1320 return;
1182} 1321}
1183 1322
1184static int 1323static int
1185print_bad_irq_dependency(struct task_struct *curr, 1324print_bad_irq_dependency(struct task_struct *curr,
1325 struct lock_list *prev_root,
1326 struct lock_list *next_root,
1327 struct lock_list *backwards_entry,
1328 struct lock_list *forwards_entry,
1186 struct held_lock *prev, 1329 struct held_lock *prev,
1187 struct held_lock *next, 1330 struct held_lock *next,
1188 enum lock_usage_bit bit1, 1331 enum lock_usage_bit bit1,
@@ -1215,26 +1358,32 @@ print_bad_irq_dependency(struct task_struct *curr,
1215 1358
1216 printk("\nbut this new dependency connects a %s-irq-safe lock:\n", 1359 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
1217 irqclass); 1360 irqclass);
1218 print_lock_name(backwards_match); 1361 print_lock_name(backwards_entry->class);
1219 printk("\n... which became %s-irq-safe at:\n", irqclass); 1362 printk("\n... which became %s-irq-safe at:\n", irqclass);
1220 1363
1221 print_stack_trace(backwards_match->usage_traces + bit1, 1); 1364 print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
1222 1365
1223 printk("\nto a %s-irq-unsafe lock:\n", irqclass); 1366 printk("\nto a %s-irq-unsafe lock:\n", irqclass);
1224 print_lock_name(forwards_match); 1367 print_lock_name(forwards_entry->class);
1225 printk("\n... which became %s-irq-unsafe at:\n", irqclass); 1368 printk("\n... which became %s-irq-unsafe at:\n", irqclass);
1226 printk("..."); 1369 printk("...");
1227 1370
1228 print_stack_trace(forwards_match->usage_traces + bit2, 1); 1371 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1229 1372
1230 printk("\nother info that might help us debug this:\n\n"); 1373 printk("\nother info that might help us debug this:\n\n");
1231 lockdep_print_held_locks(curr); 1374 lockdep_print_held_locks(curr);
1232 1375
1233 printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); 1376 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
1234 print_lock_dependencies(backwards_match, 0); 1377 printk(" and the holding lock:\n");
1378 if (!save_trace(&prev_root->trace))
1379 return 0;
1380 print_shortest_lock_dependencies(backwards_entry, prev_root);
1235 1381
1236 printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); 1382 printk("\nthe dependencies between the lock to be acquired");
1237 print_lock_dependencies(forwards_match, 0); 1383 printk(" and %s-irq-unsafe lock:\n", irqclass);
1384 if (!save_trace(&next_root->trace))
1385 return 0;
1386 print_shortest_lock_dependencies(forwards_entry, next_root);
1238 1387
1239 printk("\nstack backtrace:\n"); 1388 printk("\nstack backtrace:\n");
1240 dump_stack(); 1389 dump_stack();
@@ -1248,19 +1397,30 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1248 enum lock_usage_bit bit_forwards, const char *irqclass) 1397 enum lock_usage_bit bit_forwards, const char *irqclass)
1249{ 1398{
1250 int ret; 1399 int ret;
1400 struct lock_list this, that;
1401 struct lock_list *uninitialized_var(target_entry);
1402 struct lock_list *uninitialized_var(target_entry1);
1251 1403
1252 find_usage_bit = bit_backwards; 1404 this.parent = NULL;
1253 /* fills in <backwards_match> */ 1405
1254 ret = find_usage_backwards(hlock_class(prev), 0); 1406 this.class = hlock_class(prev);
1255 if (!ret || ret == 1) 1407 ret = find_usage_backwards(&this, bit_backwards, &target_entry);
1408 if (ret < 0)
1409 return print_bfs_bug(ret);
1410 if (ret == 1)
1256 return ret; 1411 return ret;
1257 1412
1258 find_usage_bit = bit_forwards; 1413 that.parent = NULL;
1259 ret = find_usage_forwards(hlock_class(next), 0); 1414 that.class = hlock_class(next);
1260 if (!ret || ret == 1) 1415 ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
1416 if (ret < 0)
1417 return print_bfs_bug(ret);
1418 if (ret == 1)
1261 return ret; 1419 return ret;
1262 /* ret == 2 */ 1420
1263 return print_bad_irq_dependency(curr, prev, next, 1421 return print_bad_irq_dependency(curr, &this, &that,
1422 target_entry, target_entry1,
1423 prev, next,
1264 bit_backwards, bit_forwards, irqclass); 1424 bit_backwards, bit_forwards, irqclass);
1265} 1425}
1266 1426
@@ -1472,6 +1632,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1472{ 1632{
1473 struct lock_list *entry; 1633 struct lock_list *entry;
1474 int ret; 1634 int ret;
1635 struct lock_list this;
1636 struct lock_list *uninitialized_var(target_entry);
1475 1637
1476 /* 1638 /*
1477 * Prove that the new <prev> -> <next> dependency would not 1639 * Prove that the new <prev> -> <next> dependency would not
@@ -1482,10 +1644,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1482 * We are using global variables to control the recursion, to 1644 * We are using global variables to control the recursion, to
1483 * keep the stackframe size of the recursive functions low: 1645 * keep the stackframe size of the recursive functions low:
1484 */ 1646 */
1485 check_source = next; 1647 this.class = hlock_class(next);
1486 check_target = prev; 1648 this.parent = NULL;
1487 if (!(check_noncircular(hlock_class(next), 0))) 1649 ret = check_noncircular(&this, hlock_class(prev), &target_entry);
1488 return print_circular_bug_tail(); 1650 if (unlikely(!ret))
1651 return print_circular_bug(&this, target_entry, next, prev);
1652 else if (unlikely(ret < 0))
1653 return print_bfs_bug(ret);
1489 1654
1490 if (!check_prev_add_irq(curr, prev, next)) 1655 if (!check_prev_add_irq(curr, prev, next))
1491 return 0; 1656 return 0;
@@ -1884,7 +2049,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1884 * print irq inversion bug: 2049 * print irq inversion bug:
1885 */ 2050 */
1886static int 2051static int
1887print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, 2052print_irq_inversion_bug(struct task_struct *curr,
2053 struct lock_list *root, struct lock_list *other,
1888 struct held_lock *this, int forwards, 2054 struct held_lock *this, int forwards,
1889 const char *irqclass) 2055 const char *irqclass)
1890{ 2056{
@@ -1902,17 +2068,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1902 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); 2068 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
1903 else 2069 else
1904 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); 2070 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
1905 print_lock_name(other); 2071 print_lock_name(other->class);
1906 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2072 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1907 2073
1908 printk("\nother info that might help us debug this:\n"); 2074 printk("\nother info that might help us debug this:\n");
1909 lockdep_print_held_locks(curr); 2075 lockdep_print_held_locks(curr);
1910 2076
1911 printk("\nthe first lock's dependencies:\n"); 2077 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
1912 print_lock_dependencies(hlock_class(this), 0); 2078 if (!save_trace(&root->trace))
1913 2079 return 0;
1914 printk("\nthe second lock's dependencies:\n"); 2080 print_shortest_lock_dependencies(other, root);
1915 print_lock_dependencies(other, 0);
1916 2081
1917 printk("\nstack backtrace:\n"); 2082 printk("\nstack backtrace:\n");
1918 dump_stack(); 2083 dump_stack();
@@ -1929,14 +2094,19 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1929 enum lock_usage_bit bit, const char *irqclass) 2094 enum lock_usage_bit bit, const char *irqclass)
1930{ 2095{
1931 int ret; 2096 int ret;
1932 2097 struct lock_list root;
1933 find_usage_bit = bit; 2098 struct lock_list *uninitialized_var(target_entry);
1934 /* fills in <forwards_match> */ 2099
1935 ret = find_usage_forwards(hlock_class(this), 0); 2100 root.parent = NULL;
1936 if (!ret || ret == 1) 2101 root.class = hlock_class(this);
2102 ret = find_usage_forwards(&root, bit, &target_entry);
2103 if (ret < 0)
2104 return print_bfs_bug(ret);
2105 if (ret == 1)
1937 return ret; 2106 return ret;
1938 2107
1939 return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); 2108 return print_irq_inversion_bug(curr, &root, target_entry,
2109 this, 1, irqclass);
1940} 2110}
1941 2111
1942/* 2112/*
@@ -1948,14 +2118,19 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1948 enum lock_usage_bit bit, const char *irqclass) 2118 enum lock_usage_bit bit, const char *irqclass)
1949{ 2119{
1950 int ret; 2120 int ret;
1951 2121 struct lock_list root;
1952 find_usage_bit = bit; 2122 struct lock_list *uninitialized_var(target_entry);
1953 /* fills in <backwards_match> */ 2123
1954 ret = find_usage_backwards(hlock_class(this), 0); 2124 root.parent = NULL;
1955 if (!ret || ret == 1) 2125 root.class = hlock_class(this);
2126 ret = find_usage_backwards(&root, bit, &target_entry);
2127 if (ret < 0)
2128 return print_bfs_bug(ret);
2129 if (ret == 1)
1956 return ret; 2130 return ret;
1957 2131
1958 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); 2132 return print_irq_inversion_bug(curr, &root, target_entry,
 2133 this, 0, irqclass);
1959} 2134}
1960 2135
1961void print_irqtrace_events(struct task_struct *curr) 2136void print_irqtrace_events(struct task_struct *curr)
@@ -2530,13 +2705,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2530 */ 2705 */
2531static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2706static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2532 int trylock, int read, int check, int hardirqs_off, 2707 int trylock, int read, int check, int hardirqs_off,
2533 struct lockdep_map *nest_lock, unsigned long ip) 2708 struct lockdep_map *nest_lock, unsigned long ip,
2709 int references)
2534{ 2710{
2535 struct task_struct *curr = current; 2711 struct task_struct *curr = current;
2536 struct lock_class *class = NULL; 2712 struct lock_class *class = NULL;
2537 struct held_lock *hlock; 2713 struct held_lock *hlock;
2538 unsigned int depth, id; 2714 unsigned int depth, id;
2539 int chain_head = 0; 2715 int chain_head = 0;
2716 int class_idx;
2540 u64 chain_key; 2717 u64 chain_key;
2541 2718
2542 if (!prove_locking) 2719 if (!prove_locking)
@@ -2584,10 +2761,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2584 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 2761 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2585 return 0; 2762 return 0;
2586 2763
2764 class_idx = class - lock_classes + 1;
2765
2766 if (depth) {
2767 hlock = curr->held_locks + depth - 1;
2768 if (hlock->class_idx == class_idx && nest_lock) {
2769 if (hlock->references)
2770 hlock->references++;
2771 else
2772 hlock->references = 2;
2773
2774 return 1;
2775 }
2776 }
2777
2587 hlock = curr->held_locks + depth; 2778 hlock = curr->held_locks + depth;
2588 if (DEBUG_LOCKS_WARN_ON(!class)) 2779 if (DEBUG_LOCKS_WARN_ON(!class))
2589 return 0; 2780 return 0;
2590 hlock->class_idx = class - lock_classes + 1; 2781 hlock->class_idx = class_idx;
2591 hlock->acquire_ip = ip; 2782 hlock->acquire_ip = ip;
2592 hlock->instance = lock; 2783 hlock->instance = lock;
2593 hlock->nest_lock = nest_lock; 2784 hlock->nest_lock = nest_lock;
@@ -2595,6 +2786,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2595 hlock->read = read; 2786 hlock->read = read;
2596 hlock->check = check; 2787 hlock->check = check;
2597 hlock->hardirqs_off = !!hardirqs_off; 2788 hlock->hardirqs_off = !!hardirqs_off;
2789 hlock->references = references;
2598#ifdef CONFIG_LOCK_STAT 2790#ifdef CONFIG_LOCK_STAT
2599 hlock->waittime_stamp = 0; 2791 hlock->waittime_stamp = 0;
2600 hlock->holdtime_stamp = sched_clock(); 2792 hlock->holdtime_stamp = sched_clock();
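The hunk above folds repeat acquisitions: if the class of the new lock matches the entry on top of the held-lock stack and a nest_lock was supplied, no new held_lock slot is consumed; instead hlock->references goes from 0 (meaning a single plain reference) to 2 on the first fold and is incremented from there. A toy model of just that counting rule, assuming the surrounding lockdep validation has already passed (names are illustrative only):

	#include <stdio.h>

	struct toy_hlock {
		int class_idx;
		int references;    /* 0 == single reference, >= 2 == folded count */
	};

	static struct toy_hlock held[64];
	static int depth;

	/* mimics the new fast path added to __lock_acquire() */
	static void toy_acquire(int class_idx, int have_nest_lock)
	{
		if (depth && held[depth - 1].class_idx == class_idx && have_nest_lock) {
			struct toy_hlock *top = &held[depth - 1];

			top->references = top->references ? top->references + 1 : 2;
			return;
		}
		held[depth].class_idx = class_idx;
		held[depth].references = 0;
		depth++;
	}

	int main(void)
	{
		toy_acquire(7, 0);     /* first acquisition: new stack entry       */
		toy_acquire(7, 1);     /* same class + nest lock: references -> 2  */
		toy_acquire(7, 1);     /* references -> 3                          */
		printf("depth=%d references=%d\n", depth, held[depth - 1].references);
		return 0;              /* prints: depth=1 references=3             */
	}
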
@@ -2703,6 +2895,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2703 return 1; 2895 return 1;
2704} 2896}
2705 2897
2898static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2899{
2900 if (hlock->instance == lock)
2901 return 1;
2902
2903 if (hlock->references) {
2904 struct lock_class *class = lock->class_cache;
2905
2906 if (!class)
2907 class = look_up_lock_class(lock, 0);
2908
2909 if (DEBUG_LOCKS_WARN_ON(!class))
2910 return 0;
2911
2912 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
2913 return 0;
2914
2915 if (hlock->class_idx == class - lock_classes + 1)
2916 return 1;
2917 }
2918
2919 return 0;
2920}
2921
2706static int 2922static int
2707__lock_set_class(struct lockdep_map *lock, const char *name, 2923__lock_set_class(struct lockdep_map *lock, const char *name,
2708 struct lock_class_key *key, unsigned int subclass, 2924 struct lock_class_key *key, unsigned int subclass,
@@ -2726,7 +2942,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
2726 */ 2942 */
2727 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 2943 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2728 break; 2944 break;
2729 if (hlock->instance == lock) 2945 if (match_held_lock(hlock, lock))
2730 goto found_it; 2946 goto found_it;
2731 prev_hlock = hlock; 2947 prev_hlock = hlock;
2732 } 2948 }
@@ -2745,7 +2961,8 @@ found_it:
2745 if (!__lock_acquire(hlock->instance, 2961 if (!__lock_acquire(hlock->instance,
2746 hlock_class(hlock)->subclass, hlock->trylock, 2962 hlock_class(hlock)->subclass, hlock->trylock,
2747 hlock->read, hlock->check, hlock->hardirqs_off, 2963 hlock->read, hlock->check, hlock->hardirqs_off,
2748 hlock->nest_lock, hlock->acquire_ip)) 2964 hlock->nest_lock, hlock->acquire_ip,
2965 hlock->references))
2749 return 0; 2966 return 0;
2750 } 2967 }
2751 2968
@@ -2784,20 +3001,34 @@ lock_release_non_nested(struct task_struct *curr,
2784 */ 3001 */
2785 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3002 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2786 break; 3003 break;
2787 if (hlock->instance == lock) 3004 if (match_held_lock(hlock, lock))
2788 goto found_it; 3005 goto found_it;
2789 prev_hlock = hlock; 3006 prev_hlock = hlock;
2790 } 3007 }
2791 return print_unlock_inbalance_bug(curr, lock, ip); 3008 return print_unlock_inbalance_bug(curr, lock, ip);
2792 3009
2793found_it: 3010found_it:
2794 lock_release_holdtime(hlock); 3011 if (hlock->instance == lock)
3012 lock_release_holdtime(hlock);
3013
3014 if (hlock->references) {
3015 hlock->references--;
3016 if (hlock->references) {
3017 /*
 3018 * We had multiple references and, after removing one, we
 3019 * still have some left; the current lock stack is still
 3020 * valid. We're done!
3021 */
3022 return 1;
3023 }
3024 }
2795 3025
2796 /* 3026 /*
2797 * We have the right lock to unlock, 'hlock' points to it. 3027 * We have the right lock to unlock, 'hlock' points to it.
2798 * Now we remove it from the stack, and add back the other 3028 * Now we remove it from the stack, and add back the other
2799 * entries (if any), recalculating the hash along the way: 3029 * entries (if any), recalculating the hash along the way:
2800 */ 3030 */
3031
2801 curr->lockdep_depth = i; 3032 curr->lockdep_depth = i;
2802 curr->curr_chain_key = hlock->prev_chain_key; 3033 curr->curr_chain_key = hlock->prev_chain_key;
2803 3034
@@ -2806,7 +3037,8 @@ found_it:
2806 if (!__lock_acquire(hlock->instance, 3037 if (!__lock_acquire(hlock->instance,
2807 hlock_class(hlock)->subclass, hlock->trylock, 3038 hlock_class(hlock)->subclass, hlock->trylock,
2808 hlock->read, hlock->check, hlock->hardirqs_off, 3039 hlock->read, hlock->check, hlock->hardirqs_off,
2809 hlock->nest_lock, hlock->acquire_ip)) 3040 hlock->nest_lock, hlock->acquire_ip,
3041 hlock->references))
2810 return 0; 3042 return 0;
2811 } 3043 }
2812 3044
@@ -2836,7 +3068,7 @@ static int lock_release_nested(struct task_struct *curr,
2836 /* 3068 /*
2837 * Is the unlock non-nested: 3069 * Is the unlock non-nested:
2838 */ 3070 */
2839 if (hlock->instance != lock) 3071 if (hlock->instance != lock || hlock->references)
2840 return lock_release_non_nested(curr, lock, ip); 3072 return lock_release_non_nested(curr, lock, ip);
2841 curr->lockdep_depth--; 3073 curr->lockdep_depth--;
2842 3074
@@ -2881,6 +3113,21 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2881 check_chain_key(curr); 3113 check_chain_key(curr);
2882} 3114}
2883 3115
3116static int __lock_is_held(struct lockdep_map *lock)
3117{
3118 struct task_struct *curr = current;
3119 int i;
3120
3121 for (i = 0; i < curr->lockdep_depth; i++) {
3122 struct held_lock *hlock = curr->held_locks + i;
3123
3124 if (match_held_lock(hlock, lock))
3125 return 1;
3126 }
3127
3128 return 0;
3129}
3130
2884/* 3131/*
2885 * Check whether we follow the irq-flags state precisely: 3132 * Check whether we follow the irq-flags state precisely:
2886 */ 3133 */
@@ -2957,7 +3204,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2957 3204
2958 current->lockdep_recursion = 1; 3205 current->lockdep_recursion = 1;
2959 __lock_acquire(lock, subclass, trylock, read, check, 3206 __lock_acquire(lock, subclass, trylock, read, check,
2960 irqs_disabled_flags(flags), nest_lock, ip); 3207 irqs_disabled_flags(flags), nest_lock, ip, 0);
2961 current->lockdep_recursion = 0; 3208 current->lockdep_recursion = 0;
2962 raw_local_irq_restore(flags); 3209 raw_local_irq_restore(flags);
2963} 3210}
@@ -2982,6 +3229,26 @@ void lock_release(struct lockdep_map *lock, int nested,
2982} 3229}
2983EXPORT_SYMBOL_GPL(lock_release); 3230EXPORT_SYMBOL_GPL(lock_release);
2984 3231
3232int lock_is_held(struct lockdep_map *lock)
3233{
3234 unsigned long flags;
3235 int ret = 0;
3236
3237 if (unlikely(current->lockdep_recursion))
3238 return ret;
3239
3240 raw_local_irq_save(flags);
3241 check_flags(flags);
3242
3243 current->lockdep_recursion = 1;
3244 ret = __lock_is_held(lock);
3245 current->lockdep_recursion = 0;
3246 raw_local_irq_restore(flags);
3247
3248 return ret;
3249}
3250EXPORT_SYMBOL_GPL(lock_is_held);
3251
2985void lockdep_set_current_reclaim_state(gfp_t gfp_mask) 3252void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
2986{ 3253{
2987 current->lockdep_reclaim_gfp = gfp_mask; 3254 current->lockdep_reclaim_gfp = gfp_mask;
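The newly exported lock_is_held() lets other subsystems ask lockdep whether the current task holds a given lock, which is handy for caller-contract assertions. A hedged kernel-side sketch of how a user might consume it; the struct, the function name and the direct dep_map access (which needs CONFIG_DEBUG_LOCK_ALLOC) are assumptions for illustration, not part of this patch:

	#include <linux/kernel.h>
	#include <linux/spinlock.h>
	#include <linux/lockdep.h>
	#include <linux/debug_locks.h>

	struct foo {
		spinlock_t lock;
		unsigned long nr_updates;
	};

	/* Hypothetical usage: warn if the caller does not hold foo->lock. */
	static void foo_update_stats(struct foo *foo)
	{
		WARN_ON_ONCE(debug_locks && !lock_is_held(&foo->lock.dep_map));
		foo->nr_updates++;   /* state protected by foo->lock */
	}
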
@@ -3041,7 +3308,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3041 */ 3308 */
3042 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3309 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3043 break; 3310 break;
3044 if (hlock->instance == lock) 3311 if (match_held_lock(hlock, lock))
3045 goto found_it; 3312 goto found_it;
3046 prev_hlock = hlock; 3313 prev_hlock = hlock;
3047 } 3314 }
@@ -3049,6 +3316,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3049 return; 3316 return;
3050 3317
3051found_it: 3318found_it:
3319 if (hlock->instance != lock)
3320 return;
3321
3052 hlock->waittime_stamp = sched_clock(); 3322 hlock->waittime_stamp = sched_clock();
3053 3323
3054 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3324 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
@@ -3088,7 +3358,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3088 */ 3358 */
3089 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3359 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3090 break; 3360 break;
3091 if (hlock->instance == lock) 3361 if (match_held_lock(hlock, lock))
3092 goto found_it; 3362 goto found_it;
3093 prev_hlock = hlock; 3363 prev_hlock = hlock;
3094 } 3364 }
@@ -3096,6 +3366,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3096 return; 3366 return;
3097 3367
3098found_it: 3368found_it:
3369 if (hlock->instance != lock)
3370 return;
3371
3099 cpu = smp_processor_id(); 3372 cpu = smp_processor_id();
3100 if (hlock->waittime_stamp) { 3373 if (hlock->waittime_stamp) {
3101 now = sched_clock(); 3374 now = sched_clock();
@@ -3326,7 +3599,12 @@ void __init lockdep_info(void)
3326 sizeof(struct list_head) * CLASSHASH_SIZE + 3599 sizeof(struct list_head) * CLASSHASH_SIZE +
3327 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + 3600 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
3328 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + 3601 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
3329 sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); 3602 sizeof(struct list_head) * CHAINHASH_SIZE
3603#ifdef CONFIG_PROVE_LOCKING
3604 + sizeof(struct circular_queue)
3605#endif
3606 ) / 1024
3607 );
3330 3608
3331 printk(" per task-struct memory footprint: %lu bytes\n", 3609 printk(" per task-struct memory footprint: %lu bytes\n",
3332 sizeof(struct held_lock) * MAX_LOCK_DEPTH); 3610 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 699a2ac3a0d7..a2ee95ad1313 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -91,6 +91,8 @@ extern unsigned int nr_process_chains;
91extern unsigned int max_lockdep_depth; 91extern unsigned int max_lockdep_depth;
92extern unsigned int max_recursion_depth; 92extern unsigned int max_recursion_depth;
93 93
94extern unsigned int max_bfs_queue_depth;
95
94#ifdef CONFIG_PROVE_LOCKING 96#ifdef CONFIG_PROVE_LOCKING
95extern unsigned long lockdep_count_forward_deps(struct lock_class *); 97extern unsigned long lockdep_count_forward_deps(struct lock_class *);
96extern unsigned long lockdep_count_backward_deps(struct lock_class *); 98extern unsigned long lockdep_count_backward_deps(struct lock_class *);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index e94caa666dba..d4b3dbc79fdb 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,38 +25,12 @@
25 25
26static void *l_next(struct seq_file *m, void *v, loff_t *pos) 26static void *l_next(struct seq_file *m, void *v, loff_t *pos)
27{ 27{
28 struct lock_class *class; 28 return seq_list_next(v, &all_lock_classes, pos);
29
30 (*pos)++;
31
32 if (v == SEQ_START_TOKEN)
33 class = m->private;
34 else {
35 class = v;
36
37 if (class->lock_entry.next != &all_lock_classes)
38 class = list_entry(class->lock_entry.next,
39 struct lock_class, lock_entry);
40 else
41 class = NULL;
42 }
43
44 return class;
45} 29}
46 30
47static void *l_start(struct seq_file *m, loff_t *pos) 31static void *l_start(struct seq_file *m, loff_t *pos)
48{ 32{
49 struct lock_class *class; 33 return seq_list_start_head(&all_lock_classes, *pos);
50 loff_t i = 0;
51
52 if (*pos == 0)
53 return SEQ_START_TOKEN;
54
55 list_for_each_entry(class, &all_lock_classes, lock_entry) {
56 if (++i == *pos)
57 return class;
58 }
59 return NULL;
60} 34}
61 35
62static void l_stop(struct seq_file *m, void *v) 36static void l_stop(struct seq_file *m, void *v)
@@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class)
82 56
83static int l_show(struct seq_file *m, void *v) 57static int l_show(struct seq_file *m, void *v)
84{ 58{
85 struct lock_class *class = v; 59 struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
86 struct lock_list *entry; 60 struct lock_list *entry;
87 char usage[LOCK_USAGE_CHARS]; 61 char usage[LOCK_USAGE_CHARS];
88 62
89 if (v == SEQ_START_TOKEN) { 63 if (v == &all_lock_classes) {
90 seq_printf(m, "all lock classes:\n"); 64 seq_printf(m, "all lock classes:\n");
91 return 0; 65 return 0;
92 } 66 }
@@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = {
128 102
129static int lockdep_open(struct inode *inode, struct file *file) 103static int lockdep_open(struct inode *inode, struct file *file)
130{ 104{
131 int res = seq_open(file, &lockdep_ops); 105 return seq_open(file, &lockdep_ops);
132 if (!res) {
133 struct seq_file *m = file->private_data;
134
135 if (!list_empty(&all_lock_classes))
136 m->private = list_entry(all_lock_classes.next,
137 struct lock_class, lock_entry);
138 else
139 m->private = NULL;
140 }
141 return res;
142} 106}
143 107
144static const struct file_operations proc_lockdep_operations = { 108static const struct file_operations proc_lockdep_operations = {
@@ -149,37 +113,23 @@ static const struct file_operations proc_lockdep_operations = {
149}; 113};
150 114
151#ifdef CONFIG_PROVE_LOCKING 115#ifdef CONFIG_PROVE_LOCKING
152static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
153{
154 struct lock_chain *chain;
155
156 (*pos)++;
157
158 if (v == SEQ_START_TOKEN)
159 chain = m->private;
160 else {
161 chain = v;
162
163 if (*pos < nr_lock_chains)
164 chain = lock_chains + *pos;
165 else
166 chain = NULL;
167 }
168
169 return chain;
170}
171
172static void *lc_start(struct seq_file *m, loff_t *pos) 116static void *lc_start(struct seq_file *m, loff_t *pos)
173{ 117{
174 if (*pos == 0) 118 if (*pos == 0)
175 return SEQ_START_TOKEN; 119 return SEQ_START_TOKEN;
176 120
177 if (*pos < nr_lock_chains) 121 if (*pos - 1 < nr_lock_chains)
178 return lock_chains + *pos; 122 return lock_chains + (*pos - 1);
179 123
180 return NULL; 124 return NULL;
181} 125}
182 126
127static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
128{
129 (*pos)++;
130 return lc_start(m, pos);
131}
132
183static void lc_stop(struct seq_file *m, void *v) 133static void lc_stop(struct seq_file *m, void *v)
184{ 134{
185} 135}
@@ -220,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = {
220 170
221static int lockdep_chains_open(struct inode *inode, struct file *file) 171static int lockdep_chains_open(struct inode *inode, struct file *file)
222{ 172{
223 int res = seq_open(file, &lockdep_chains_ops); 173 return seq_open(file, &lockdep_chains_ops);
224 if (!res) {
225 struct seq_file *m = file->private_data;
226
227 if (nr_lock_chains)
228 m->private = lock_chains;
229 else
230 m->private = NULL;
231 }
232 return res;
233} 174}
234 175
235static const struct file_operations proc_lockdep_chains_operations = { 176static const struct file_operations proc_lockdep_chains_operations = {
@@ -258,16 +199,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
258 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(&chain_lookup_hits));
259 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11u\n",
260 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(&nr_cyclic_checks));
261 seq_printf(m, " cyclic-check recursions: %11u\n",
262 debug_atomic_read(&nr_cyclic_check_recursions));
263 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11u\n",
264 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(&nr_find_usage_forwards_checks));
265 seq_printf(m, " find-mask forwards recursions: %11u\n",
266 debug_atomic_read(&nr_find_usage_forwards_recursions));
267 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11u\n",
268 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(&nr_find_usage_backwards_checks));
269 seq_printf(m, " find-mask backwards recursions:%11u\n",
270 debug_atomic_read(&nr_find_usage_backwards_recursions));
271 206
272 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11u\n", hi1);
273 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11u\n", hi2);
@@ -409,8 +344,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
409 nr_unused); 344 nr_unused);
410 seq_printf(m, " max locking depth: %11u\n", 345 seq_printf(m, " max locking depth: %11u\n",
411 max_lockdep_depth); 346 max_lockdep_depth);
412 seq_printf(m, " max recursion depth: %11u\n", 347#ifdef CONFIG_PROVE_LOCKING
413 max_recursion_depth); 348 seq_printf(m, " max bfs queue depth: %11u\n",
349 max_bfs_queue_depth);
350#endif
414 lockdep_stats_debug_show(m); 351 lockdep_stats_debug_show(m);
415 seq_printf(m, " debug_locks: %11u\n", 352 seq_printf(m, " debug_locks: %11u\n",
416 debug_locks); 353 debug_locks);
@@ -438,7 +375,6 @@ struct lock_stat_data {
438}; 375};
439 376
440struct lock_stat_seq { 377struct lock_stat_seq {
441 struct lock_stat_data *iter;
442 struct lock_stat_data *iter_end; 378 struct lock_stat_data *iter_end;
443 struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; 379 struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
444}; 380};
@@ -626,34 +562,22 @@ static void seq_header(struct seq_file *m)
626static void *ls_start(struct seq_file *m, loff_t *pos) 562static void *ls_start(struct seq_file *m, loff_t *pos)
627{ 563{
628 struct lock_stat_seq *data = m->private; 564 struct lock_stat_seq *data = m->private;
565 struct lock_stat_data *iter;
629 566
630 if (*pos == 0) 567 if (*pos == 0)
631 return SEQ_START_TOKEN; 568 return SEQ_START_TOKEN;
632 569
633 data->iter = data->stats + *pos; 570 iter = data->stats + (*pos - 1);
634 if (data->iter >= data->iter_end) 571 if (iter >= data->iter_end)
635 data->iter = NULL; 572 iter = NULL;
636 573
637 return data->iter; 574 return iter;
638} 575}
639 576
640static void *ls_next(struct seq_file *m, void *v, loff_t *pos) 577static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
641{ 578{
642 struct lock_stat_seq *data = m->private;
643
644 (*pos)++; 579 (*pos)++;
645 580 return ls_start(m, pos);
646 if (v == SEQ_START_TOKEN)
647 data->iter = data->stats;
648 else {
649 data->iter = v;
650 data->iter++;
651 }
652
653 if (data->iter == data->iter_end)
654 data->iter = NULL;
655
656 return data->iter;
657} 581}
658 582
659static void ls_stop(struct seq_file *m, void *v) 583static void ls_stop(struct seq_file *m, void *v)
@@ -691,7 +615,6 @@ static int lock_stat_open(struct inode *inode, struct file *file)
691 struct lock_stat_data *iter = data->stats; 615 struct lock_stat_data *iter = data->stats;
692 struct seq_file *m = file->private_data; 616 struct seq_file *m = file->private_data;
693 617
694 data->iter = iter;
695 list_for_each_entry(class, &all_lock_classes, lock_entry) { 618 list_for_each_entry(class, &all_lock_classes, lock_entry) {
696 iter->class = class; 619 iter->class = class;
697 iter->stats = lock_stats(class); 620 iter->stats = lock_stats(class);
@@ -699,7 +622,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
699 } 622 }
700 data->iter_end = iter; 623 data->iter_end = iter;
701 624
702 sort(data->stats, data->iter_end - data->iter, 625 sort(data->stats, data->iter_end - data->stats,
703 sizeof(struct lock_stat_data), 626 sizeof(struct lock_stat_data),
704 lock_stat_cmp, NULL); 627 lock_stat_cmp, NULL);
705 628
@@ -734,7 +657,6 @@ static int lock_stat_release(struct inode *inode, struct file *file)
734 struct seq_file *seq = file->private_data; 657 struct seq_file *seq = file->private_data;
735 658
736 vfree(seq->private); 659 vfree(seq->private);
737 seq->private = NULL;
738 return seq_release(inode, file); 660 return seq_release(inode, file);
739} 661}
740 662
diff --git a/kernel/module.c b/kernel/module.c
index 2d537186191f..46580edff0cb 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,11 @@
55#include <linux/percpu.h> 55#include <linux/percpu.h>
56#include <linux/kmemleak.h> 56#include <linux/kmemleak.h>
57 57
58#define CREATE_TRACE_POINTS
59#include <trace/events/module.h>
60
61EXPORT_TRACEPOINT_SYMBOL(module_get);
62
58#if 0 63#if 0
59#define DEBUGP printk 64#define DEBUGP printk
60#else 65#else
@@ -942,6 +947,8 @@ void module_put(struct module *module)
942 if (module) { 947 if (module) {
943 unsigned int cpu = get_cpu(); 948 unsigned int cpu = get_cpu();
944 local_dec(__module_ref_addr(module, cpu)); 949 local_dec(__module_ref_addr(module, cpu));
950 trace_module_put(module, _RET_IP_,
951 local_read(__module_ref_addr(module, cpu)));
945 /* Maybe they're waiting for us to drop reference? */ 952 /* Maybe they're waiting for us to drop reference? */
946 if (unlikely(!module_is_live(module))) 953 if (unlikely(!module_is_live(module)))
947 wake_up_process(module->waiter); 954 wake_up_process(module->waiter);
@@ -1497,6 +1504,8 @@ static int __unlink_module(void *_mod)
1497/* Free a module, remove from lists, etc (must hold module_mutex). */ 1504/* Free a module, remove from lists, etc (must hold module_mutex). */
1498static void free_module(struct module *mod) 1505static void free_module(struct module *mod)
1499{ 1506{
1507 trace_module_free(mod);
1508
1500 /* Delete from various lists */ 1509 /* Delete from various lists */
1501 stop_machine(__unlink_module, mod, NULL); 1510 stop_machine(__unlink_module, mod, NULL);
1502 remove_notes_attrs(mod); 1511 remove_notes_attrs(mod);
@@ -2364,6 +2373,8 @@ static noinline struct module *load_module(void __user *umod,
2364 /* Get rid of temporary copy */ 2373 /* Get rid of temporary copy */
2365 vfree(hdr); 2374 vfree(hdr);
2366 2375
2376 trace_module_load(mod);
2377
2367 /* Done! */ 2378 /* Done! */
2368 return mod; 2379 return mod;
2369 2380
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d7cbc579fc80..e0d91fdf0c3c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,12 +46,18 @@ static atomic_t nr_task_counters __read_mostly;
46 46
47/* 47/*
48 * perf counter paranoia level: 48 * perf counter paranoia level:
49 * 0 - not paranoid 49 * -1 - not paranoid at all
50 * 1 - disallow cpu counters to unpriv 50 * 0 - disallow raw tracepoint access for unpriv
51 * 2 - disallow kernel profiling to unpriv 51 * 1 - disallow cpu counters for unpriv
52 * 2 - disallow kernel profiling for unpriv
52 */ 53 */
53int sysctl_perf_counter_paranoid __read_mostly = 1; 54int sysctl_perf_counter_paranoid __read_mostly = 1;
54 55
56static inline bool perf_paranoid_tracepoint_raw(void)
57{
58 return sysctl_perf_counter_paranoid > -1;
59}
60
55static inline bool perf_paranoid_cpu(void) 61static inline bool perf_paranoid_cpu(void)
56{ 62{
57 return sysctl_perf_counter_paranoid > 0; 63 return sysctl_perf_counter_paranoid > 0;
@@ -469,7 +475,8 @@ static void update_counter_times(struct perf_counter *counter)
469 struct perf_counter_context *ctx = counter->ctx; 475 struct perf_counter_context *ctx = counter->ctx;
470 u64 run_end; 476 u64 run_end;
471 477
472 if (counter->state < PERF_COUNTER_STATE_INACTIVE) 478 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
479 counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
473 return; 480 return;
474 481
475 counter->total_time_enabled = ctx->time - counter->tstamp_enabled; 482 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@ -518,7 +525,7 @@ static void __perf_counter_disable(void *info)
518 */ 525 */
519 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { 526 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
520 update_context_time(ctx); 527 update_context_time(ctx);
521 update_counter_times(counter); 528 update_group_times(counter);
522 if (counter == counter->group_leader) 529 if (counter == counter->group_leader)
523 group_sched_out(counter, cpuctx, ctx); 530 group_sched_out(counter, cpuctx, ctx);
524 else 531 else
@@ -573,7 +580,7 @@ static void perf_counter_disable(struct perf_counter *counter)
573 * in, so we can change the state safely. 580 * in, so we can change the state safely.
574 */ 581 */
575 if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 582 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
576 update_counter_times(counter); 583 update_group_times(counter);
577 counter->state = PERF_COUNTER_STATE_OFF; 584 counter->state = PERF_COUNTER_STATE_OFF;
578 } 585 }
579 586
@@ -851,6 +858,27 @@ retry:
851} 858}
852 859
853/* 860/*
861 * Put a counter into inactive state and update time fields.
862 * Enabling the leader of a group effectively enables all
863 * the group members that aren't explicitly disabled, so we
864 * have to update their ->tstamp_enabled also.
865 * Note: this works for group members as well as group leaders
866 * since the non-leader members' sibling_lists will be empty.
867 */
868static void __perf_counter_mark_enabled(struct perf_counter *counter,
869 struct perf_counter_context *ctx)
870{
871 struct perf_counter *sub;
872
873 counter->state = PERF_COUNTER_STATE_INACTIVE;
874 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
875 list_for_each_entry(sub, &counter->sibling_list, list_entry)
876 if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
877 sub->tstamp_enabled =
878 ctx->time - sub->total_time_enabled;
879}
880
881/*
854 * Cross CPU call to enable a performance counter 882 * Cross CPU call to enable a performance counter
855 */ 883 */
856static void __perf_counter_enable(void *info) 884static void __perf_counter_enable(void *info)
@@ -877,8 +905,7 @@ static void __perf_counter_enable(void *info)
877 905
878 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) 906 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
879 goto unlock; 907 goto unlock;
880 counter->state = PERF_COUNTER_STATE_INACTIVE; 908 __perf_counter_mark_enabled(counter, ctx);
881 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
882 909
883 /* 910 /*
884 * If the counter is in a group and isn't the group leader, 911 * If the counter is in a group and isn't the group leader,
@@ -971,11 +998,9 @@ static void perf_counter_enable(struct perf_counter *counter)
971 * Since we have the lock this context can't be scheduled 998 * Since we have the lock this context can't be scheduled
972 * in, so we can change the state safely. 999 * in, so we can change the state safely.
973 */ 1000 */
974 if (counter->state == PERF_COUNTER_STATE_OFF) { 1001 if (counter->state == PERF_COUNTER_STATE_OFF)
975 counter->state = PERF_COUNTER_STATE_INACTIVE; 1002 __perf_counter_mark_enabled(counter, ctx);
976 counter->tstamp_enabled = 1003
977 ctx->time - counter->total_time_enabled;
978 }
979 out: 1004 out:
980 spin_unlock_irq(&ctx->lock); 1005 spin_unlock_irq(&ctx->lock);
981} 1006}
@@ -1479,9 +1504,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
1479 counter->attr.enable_on_exec = 0; 1504 counter->attr.enable_on_exec = 0;
1480 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) 1505 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1481 continue; 1506 continue;
1482 counter->state = PERF_COUNTER_STATE_INACTIVE; 1507 __perf_counter_mark_enabled(counter, ctx);
1483 counter->tstamp_enabled =
1484 ctx->time - counter->total_time_enabled;
1485 enabled = 1; 1508 enabled = 1;
1486 } 1509 }
1487 1510
@@ -1675,6 +1698,11 @@ static void free_counter(struct perf_counter *counter)
1675 atomic_dec(&nr_task_counters); 1698 atomic_dec(&nr_task_counters);
1676 } 1699 }
1677 1700
1701 if (counter->output) {
1702 fput(counter->output->filp);
1703 counter->output = NULL;
1704 }
1705
1678 if (counter->destroy) 1706 if (counter->destroy)
1679 counter->destroy(counter); 1707 counter->destroy(counter);
1680 1708
@@ -1960,6 +1988,8 @@ unlock:
1960 return ret; 1988 return ret;
1961} 1989}
1962 1990
1991int perf_counter_set_output(struct perf_counter *counter, int output_fd);
1992
1963static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1993static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1964{ 1994{
1965 struct perf_counter *counter = file->private_data; 1995 struct perf_counter *counter = file->private_data;
@@ -1983,6 +2013,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1983 case PERF_COUNTER_IOC_PERIOD: 2013 case PERF_COUNTER_IOC_PERIOD:
1984 return perf_counter_period(counter, (u64 __user *)arg); 2014 return perf_counter_period(counter, (u64 __user *)arg);
1985 2015
2016 case PERF_COUNTER_IOC_SET_OUTPUT:
2017 return perf_counter_set_output(counter, arg);
2018
1986 default: 2019 default:
1987 return -ENOTTY; 2020 return -ENOTTY;
1988 } 2021 }
@@ -2253,6 +2286,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2253 2286
2254 WARN_ON_ONCE(counter->ctx->parent_ctx); 2287 WARN_ON_ONCE(counter->ctx->parent_ctx);
2255 mutex_lock(&counter->mmap_mutex); 2288 mutex_lock(&counter->mmap_mutex);
2289 if (counter->output) {
2290 ret = -EINVAL;
2291 goto unlock;
2292 }
2293
2256 if (atomic_inc_not_zero(&counter->mmap_count)) { 2294 if (atomic_inc_not_zero(&counter->mmap_count)) {
2257 if (nr_pages != counter->data->nr_pages) 2295 if (nr_pages != counter->data->nr_pages)
2258 ret = -EINVAL; 2296 ret = -EINVAL;
@@ -2638,6 +2676,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
2638 struct perf_counter *counter, unsigned int size, 2676 struct perf_counter *counter, unsigned int size,
2639 int nmi, int sample) 2677 int nmi, int sample)
2640{ 2678{
2679 struct perf_counter *output_counter;
2641 struct perf_mmap_data *data; 2680 struct perf_mmap_data *data;
2642 unsigned int offset, head; 2681 unsigned int offset, head;
2643 int have_lost; 2682 int have_lost;
@@ -2647,13 +2686,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
2647 u64 lost; 2686 u64 lost;
2648 } lost_event; 2687 } lost_event;
2649 2688
2689 rcu_read_lock();
2650 /* 2690 /*
2651 * For inherited counters we send all the output towards the parent. 2691 * For inherited counters we send all the output towards the parent.
2652 */ 2692 */
2653 if (counter->parent) 2693 if (counter->parent)
2654 counter = counter->parent; 2694 counter = counter->parent;
2655 2695
2656 rcu_read_lock(); 2696 output_counter = rcu_dereference(counter->output);
2697 if (output_counter)
2698 counter = output_counter;
2699
2657 data = rcu_dereference(counter->data); 2700 data = rcu_dereference(counter->data);
2658 if (!data) 2701 if (!data)
2659 goto out; 2702 goto out;
@@ -3934,6 +3977,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3934 * have these. 3977 * have these.
3935 */ 3978 */
3936 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && 3979 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3980 perf_paranoid_tracepoint_raw() &&
3937 !capable(CAP_SYS_ADMIN)) 3981 !capable(CAP_SYS_ADMIN))
3938 return ERR_PTR(-EPERM); 3982 return ERR_PTR(-EPERM);
3939 3983
@@ -4202,6 +4246,57 @@ err_size:
4202 goto out; 4246 goto out;
4203} 4247}
4204 4248
4249int perf_counter_set_output(struct perf_counter *counter, int output_fd)
4250{
4251 struct perf_counter *output_counter = NULL;
4252 struct file *output_file = NULL;
4253 struct perf_counter *old_output;
4254 int fput_needed = 0;
4255 int ret = -EINVAL;
4256
4257 if (!output_fd)
4258 goto set;
4259
4260 output_file = fget_light(output_fd, &fput_needed);
4261 if (!output_file)
4262 return -EBADF;
4263
4264 if (output_file->f_op != &perf_fops)
4265 goto out;
4266
4267 output_counter = output_file->private_data;
4268
4269 /* Don't chain output fds */
4270 if (output_counter->output)
4271 goto out;
4272
4273 /* Don't set an output fd when we already have an output channel */
4274 if (counter->data)
4275 goto out;
4276
4277 atomic_long_inc(&output_file->f_count);
4278
4279set:
4280 mutex_lock(&counter->mmap_mutex);
4281 old_output = counter->output;
4282 rcu_assign_pointer(counter->output, output_counter);
4283 mutex_unlock(&counter->mmap_mutex);
4284
4285 if (old_output) {
4286 /*
4287 * we need to make sure no existing perf_output_*()
4288 * is still referencing this counter.
4289 */
4290 synchronize_rcu();
4291 fput(old_output->filp);
4292 }
4293
4294 ret = 0;
4295out:
4296 fput_light(output_file, fput_needed);
4297 return ret;
4298}
4299
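For illustration, here is a hypothetical userspace sketch (not part of this patch) of the two ways the new output redirection can be requested: the PERF_COUNTER_IOC_SET_OUTPUT ioctl after the fact, or PERF_FLAG_FD_OUTPUT at open time, where group_fd doubles as the output fd. The syscall wrapper, __NR_perf_counter_open and the attr field names are assumptions based on this tree's <linux/perf_counter.h>, not a tested program.

#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

static int perf_counter_open(struct perf_counter_attr *attr, pid_t pid,
			     int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_counter_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_counter_attr attr;
	int leader, second;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP;

	/* The first counter owns the mmap()ed ring buffer. */
	leader = perf_counter_open(&attr, 0, -1, -1, 0);

	/* Variant 1: open normally, then redirect with the new ioctl. */
	second = perf_counter_open(&attr, 0, -1, -1, 0);
	ioctl(second, PERF_COUNTER_IOC_SET_OUTPUT, leader);

	/*
	 * Variant 2: redirect at open time; the counter is NOT grouped with
	 * "leader" because PERF_FLAG_FD_NO_GROUP is also passed:
	 *
	 * second = perf_counter_open(&attr, 0, -1, leader,
	 *			      PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT);
	 */

	close(second);
	close(leader);
	return 0;
}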
4205/** 4300/**
4206 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu 4301 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
4207 * 4302 *
@@ -4221,15 +4316,15 @@ SYSCALL_DEFINE5(perf_counter_open,
4221 struct file *group_file = NULL; 4316 struct file *group_file = NULL;
4222 int fput_needed = 0; 4317 int fput_needed = 0;
4223 int fput_needed2 = 0; 4318 int fput_needed2 = 0;
4224 int ret; 4319 int err;
4225 4320
4226 /* for future expandability... */ 4321 /* for future expandability... */
4227 if (flags) 4322 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4228 return -EINVAL; 4323 return -EINVAL;
4229 4324
4230 ret = perf_copy_attr(attr_uptr, &attr); 4325 err = perf_copy_attr(attr_uptr, &attr);
4231 if (ret) 4326 if (err)
4232 return ret; 4327 return err;
4233 4328
4234 if (!attr.exclude_kernel) { 4329 if (!attr.exclude_kernel) {
4235 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 4330 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
@@ -4252,8 +4347,8 @@ SYSCALL_DEFINE5(perf_counter_open,
4252 * Look up the group leader (we will attach this counter to it): 4347 * Look up the group leader (we will attach this counter to it):
4253 */ 4348 */
4254 group_leader = NULL; 4349 group_leader = NULL;
4255 if (group_fd != -1) { 4350 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4256 ret = -EINVAL; 4351 err = -EINVAL;
4257 group_file = fget_light(group_fd, &fput_needed); 4352 group_file = fget_light(group_fd, &fput_needed);
4258 if (!group_file) 4353 if (!group_file)
4259 goto err_put_context; 4354 goto err_put_context;
@@ -4282,18 +4377,24 @@ SYSCALL_DEFINE5(perf_counter_open,
4282 4377
4283 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4378 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4284 NULL, GFP_KERNEL); 4379 NULL, GFP_KERNEL);
4285 ret = PTR_ERR(counter); 4380 err = PTR_ERR(counter);
4286 if (IS_ERR(counter)) 4381 if (IS_ERR(counter))
4287 goto err_put_context; 4382 goto err_put_context;
4288 4383
4289 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); 4384 err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4290 if (ret < 0) 4385 if (err < 0)
4291 goto err_free_put_context; 4386 goto err_free_put_context;
4292 4387
4293 counter_file = fget_light(ret, &fput_needed2); 4388 counter_file = fget_light(err, &fput_needed2);
4294 if (!counter_file) 4389 if (!counter_file)
4295 goto err_free_put_context; 4390 goto err_free_put_context;
4296 4391
4392 if (flags & PERF_FLAG_FD_OUTPUT) {
4393 err = perf_counter_set_output(counter, group_fd);
4394 if (err)
4395 goto err_fput_free_put_context;
4396 }
4397
4297 counter->filp = counter_file; 4398 counter->filp = counter_file;
4298 WARN_ON_ONCE(ctx->parent_ctx); 4399 WARN_ON_ONCE(ctx->parent_ctx);
4299 mutex_lock(&ctx->mutex); 4400 mutex_lock(&ctx->mutex);
@@ -4307,20 +4408,20 @@ SYSCALL_DEFINE5(perf_counter_open,
4307 list_add_tail(&counter->owner_entry, &current->perf_counter_list); 4408 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4308 mutex_unlock(&current->perf_counter_mutex); 4409 mutex_unlock(&current->perf_counter_mutex);
4309 4410
4411err_fput_free_put_context:
4310 fput_light(counter_file, fput_needed2); 4412 fput_light(counter_file, fput_needed2);
4311 4413
4312out_fput:
4313 fput_light(group_file, fput_needed);
4314
4315 return ret;
4316
4317err_free_put_context: 4414err_free_put_context:
4318 kfree(counter); 4415 if (err < 0)
4416 kfree(counter);
4319 4417
4320err_put_context: 4418err_put_context:
4321 put_ctx(ctx); 4419 if (err < 0)
4420 put_ctx(ctx);
4421
4422 fput_light(group_file, fput_needed);
4322 4423
4323 goto out_fput; 4424 return err;
4324} 4425}
4325 4426
4326/* 4427/*
diff --git a/kernel/printk.c b/kernel/printk.c
index b4d97b54c1ec..e10d193a833a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,12 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39/* 39/*
 40 * for_each_console() allows you to iterate over each console
41 */
42#define for_each_console(con) \
43 for (con = console_drivers; con != NULL; con = con->next)
44
45/*
40 * Architectures can override it: 46 * Architectures can override it:
41 */ 47 */
42void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 48void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -61,6 +67,8 @@ int console_printk[4] = {
61 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 67 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
62}; 68};
63 69
70static int saved_console_loglevel = -1;
71
64/* 72/*
65 * Low level drivers may need that to know if they can schedule in 73 * Low level drivers may need that to know if they can schedule in
66 * their unblank() callback or not. So let's export it. 74 * their unblank() callback or not. So let's export it.
@@ -372,10 +380,15 @@ int do_syslog(int type, char __user *buf, int len)
372 logged_chars = 0; 380 logged_chars = 0;
373 break; 381 break;
374 case 6: /* Disable logging to console */ 382 case 6: /* Disable logging to console */
383 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel;
375 console_loglevel = minimum_console_loglevel; 385 console_loglevel = minimum_console_loglevel;
376 break; 386 break;
377 case 7: /* Enable logging to console */ 387 case 7: /* Enable logging to console */
378 console_loglevel = default_console_loglevel; 388 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1;
391 }
379 break; 392 break;
380 case 8: /* Set level of messages printed to console */ 393 case 8: /* Set level of messages printed to console */
381 error = -EINVAL; 394 error = -EINVAL;
@@ -384,6 +397,8 @@ int do_syslog(int type, char __user *buf, int len)
384 if (len < minimum_console_loglevel) 397 if (len < minimum_console_loglevel)
385 len = minimum_console_loglevel; 398 len = minimum_console_loglevel;
386 console_loglevel = len; 399 console_loglevel = len;
400 /* Implicitly re-enable logging to console */
401 saved_console_loglevel = -1;
387 error = 0; 402 error = 0;
388 break; 403 break;
389 case 9: /* Number of chars in the log buffer */ 404 case 9: /* Number of chars in the log buffer */
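The effect of the saved_console_loglevel change is easiest to see from userspace via klogctl(), glibc's wrapper for syslog(2). A minimal sketch, assuming the command numbers shown in the switch above:

#include <sys/klog.h>

int main(void)
{
	klogctl(6, NULL, 0);	/* disable console logging; the kernel now saves the old level */

	/* ... run something noisy ... */

	klogctl(7, NULL, 0);	/* re-enable: restores the saved level rather than the default */

	klogctl(8, NULL, 4);	/* setting an explicit level also discards any saved level */
	return 0;
}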
@@ -412,7 +427,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
412{ 427{
413 struct console *con; 428 struct console *con;
414 429
415 for (con = console_drivers; con; con = con->next) { 430 for_each_console(con) {
416 if ((con->flags & CON_ENABLED) && con->write && 431 if ((con->flags & CON_ENABLED) && con->write &&
417 (cpu_online(smp_processor_id()) || 432 (cpu_online(smp_processor_id()) ||
418 (con->flags & CON_ANYTIME))) 433 (con->flags & CON_ANYTIME)))
@@ -544,7 +559,7 @@ static int have_callable_console(void)
544{ 559{
545 struct console *con; 560 struct console *con;
546 561
547 for (con = console_drivers; con; con = con->next) 562 for_each_console(con)
548 if (con->flags & CON_ANYTIME) 563 if (con->flags & CON_ANYTIME)
549 return 1; 564 return 1;
550 565
@@ -1082,7 +1097,7 @@ void console_unblank(void)
1082 1097
1083 console_locked = 1; 1098 console_locked = 1;
1084 console_may_schedule = 0; 1099 console_may_schedule = 0;
1085 for (c = console_drivers; c != NULL; c = c->next) 1100 for_each_console(c)
1086 if ((c->flags & CON_ENABLED) && c->unblank) 1101 if ((c->flags & CON_ENABLED) && c->unblank)
1087 c->unblank(); 1102 c->unblank();
1088 release_console_sem(); 1103 release_console_sem();
@@ -1097,7 +1112,7 @@ struct tty_driver *console_device(int *index)
1097 struct tty_driver *driver = NULL; 1112 struct tty_driver *driver = NULL;
1098 1113
1099 acquire_console_sem(); 1114 acquire_console_sem();
1100 for (c = console_drivers; c != NULL; c = c->next) { 1115 for_each_console(c) {
1101 if (!c->device) 1116 if (!c->device)
1102 continue; 1117 continue;
1103 driver = c->device(c, index); 1118 driver = c->device(c, index);
@@ -1134,25 +1149,49 @@ EXPORT_SYMBOL(console_start);
1134 * to register the console printing procedure with printk() and to 1149 * to register the console printing procedure with printk() and to
1135 * print any messages that were printed by the kernel before the 1150 * print any messages that were printed by the kernel before the
1136 * console driver was initialized. 1151 * console driver was initialized.
1152 *
1153 * This can happen pretty early during the boot process (because of
1154 * early_printk) - sometimes before setup_arch() completes - be careful
1155 * of what kernel features are used - they may not be initialised yet.
1156 *
1157 * There are two types of consoles - bootconsoles (early_printk) and
1158 * "real" consoles (everything which is not a bootconsole) which are
1159 * handled differently.
1160 * - Any number of bootconsoles can be registered at any time.
1161 * - As soon as a "real" console is registered, all bootconsoles
1162 * will be unregistered automatically.
1163 * - Once a "real" console is registered, any attempt to register a
 1164 * bootconsole will be rejected.
1137 */ 1165 */
1138void register_console(struct console *console) 1166void register_console(struct console *newcon)
1139{ 1167{
1140 int i; 1168 int i;
1141 unsigned long flags; 1169 unsigned long flags;
1142 struct console *bootconsole = NULL; 1170 struct console *bcon = NULL;
1143 1171
1144 if (console_drivers) { 1172 /*
1145 if (console->flags & CON_BOOT) 1173 * before we register a new CON_BOOT console, make sure we don't
1146 return; 1174 * already have a valid console
1147 if (console_drivers->flags & CON_BOOT) 1175 */
1148 bootconsole = console_drivers; 1176 if (console_drivers && newcon->flags & CON_BOOT) {
1177 /* find the last or real console */
1178 for_each_console(bcon) {
1179 if (!(bcon->flags & CON_BOOT)) {
1180 printk(KERN_INFO "Too late to register bootconsole %s%d\n",
1181 newcon->name, newcon->index);
1182 return;
1183 }
1184 }
1149 } 1185 }
1150 1186
1151 if (preferred_console < 0 || bootconsole || !console_drivers) 1187 if (console_drivers && console_drivers->flags & CON_BOOT)
1188 bcon = console_drivers;
1189
1190 if (preferred_console < 0 || bcon || !console_drivers)
1152 preferred_console = selected_console; 1191 preferred_console = selected_console;
1153 1192
1154 if (console->early_setup) 1193 if (newcon->early_setup)
1155 console->early_setup(); 1194 newcon->early_setup();
1156 1195
1157 /* 1196 /*
1158 * See if we want to use this console driver. If we 1197 * See if we want to use this console driver. If we
@@ -1160,13 +1199,13 @@ void register_console(struct console *console)
1160 * that registers here. 1199 * that registers here.
1161 */ 1200 */
1162 if (preferred_console < 0) { 1201 if (preferred_console < 0) {
1163 if (console->index < 0) 1202 if (newcon->index < 0)
1164 console->index = 0; 1203 newcon->index = 0;
1165 if (console->setup == NULL || 1204 if (newcon->setup == NULL ||
1166 console->setup(console, NULL) == 0) { 1205 newcon->setup(newcon, NULL) == 0) {
1167 console->flags |= CON_ENABLED; 1206 newcon->flags |= CON_ENABLED;
1168 if (console->device) { 1207 if (newcon->device) {
1169 console->flags |= CON_CONSDEV; 1208 newcon->flags |= CON_CONSDEV;
1170 preferred_console = 0; 1209 preferred_console = 0;
1171 } 1210 }
1172 } 1211 }
@@ -1178,64 +1217,62 @@ void register_console(struct console *console)
1178 */ 1217 */
1179 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; 1218 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
1180 i++) { 1219 i++) {
1181 if (strcmp(console_cmdline[i].name, console->name) != 0) 1220 if (strcmp(console_cmdline[i].name, newcon->name) != 0)
1182 continue; 1221 continue;
1183 if (console->index >= 0 && 1222 if (newcon->index >= 0 &&
1184 console->index != console_cmdline[i].index) 1223 newcon->index != console_cmdline[i].index)
1185 continue; 1224 continue;
1186 if (console->index < 0) 1225 if (newcon->index < 0)
1187 console->index = console_cmdline[i].index; 1226 newcon->index = console_cmdline[i].index;
1188#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1227#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1189 if (console_cmdline[i].brl_options) { 1228 if (console_cmdline[i].brl_options) {
1190 console->flags |= CON_BRL; 1229 newcon->flags |= CON_BRL;
1191 braille_register_console(console, 1230 braille_register_console(newcon,
1192 console_cmdline[i].index, 1231 console_cmdline[i].index,
1193 console_cmdline[i].options, 1232 console_cmdline[i].options,
1194 console_cmdline[i].brl_options); 1233 console_cmdline[i].brl_options);
1195 return; 1234 return;
1196 } 1235 }
1197#endif 1236#endif
1198 if (console->setup && 1237 if (newcon->setup &&
1199 console->setup(console, console_cmdline[i].options) != 0) 1238 newcon->setup(newcon, console_cmdline[i].options) != 0)
1200 break; 1239 break;
1201 console->flags |= CON_ENABLED; 1240 newcon->flags |= CON_ENABLED;
1202 console->index = console_cmdline[i].index; 1241 newcon->index = console_cmdline[i].index;
1203 if (i == selected_console) { 1242 if (i == selected_console) {
1204 console->flags |= CON_CONSDEV; 1243 newcon->flags |= CON_CONSDEV;
1205 preferred_console = selected_console; 1244 preferred_console = selected_console;
1206 } 1245 }
1207 break; 1246 break;
1208 } 1247 }
1209 1248
1210 if (!(console->flags & CON_ENABLED)) 1249 if (!(newcon->flags & CON_ENABLED))
1211 return; 1250 return;
1212 1251
1213 if (bootconsole && (console->flags & CON_CONSDEV)) { 1252 /*
1214 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", 1253 * If we have a bootconsole, and are switching to a real console,
 1215 bootconsole->name, bootconsole->index, 1254 * don't print everything out again, since when the boot console and
1216 console->name, console->index); 1255 * the real console are the same physical device, it's annoying to
1217 unregister_console(bootconsole); 1256 * see the beginning boot messages twice
1218 console->flags &= ~CON_PRINTBUFFER; 1257 */
1219 } else { 1258 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
1220 printk(KERN_INFO "console [%s%d] enabled\n", 1259 newcon->flags &= ~CON_PRINTBUFFER;
1221 console->name, console->index);
1222 }
1223 1260
1224 /* 1261 /*
1225 * Put this console in the list - keep the 1262 * Put this console in the list - keep the
1226 * preferred driver at the head of the list. 1263 * preferred driver at the head of the list.
1227 */ 1264 */
1228 acquire_console_sem(); 1265 acquire_console_sem();
1229 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { 1266 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1230 console->next = console_drivers; 1267 newcon->next = console_drivers;
1231 console_drivers = console; 1268 console_drivers = newcon;
1232 if (console->next) 1269 if (newcon->next)
1233 console->next->flags &= ~CON_CONSDEV; 1270 newcon->next->flags &= ~CON_CONSDEV;
1234 } else { 1271 } else {
1235 console->next = console_drivers->next; 1272 newcon->next = console_drivers->next;
1236 console_drivers->next = console; 1273 console_drivers->next = newcon;
1237 } 1274 }
1238 if (console->flags & CON_PRINTBUFFER) { 1275 if (newcon->flags & CON_PRINTBUFFER) {
1239 /* 1276 /*
1240 * release_console_sem() will print out the buffered messages 1277 * release_console_sem() will print out the buffered messages
1241 * for us. 1278 * for us.
@@ -1245,6 +1282,28 @@ void register_console(struct console *console)
1245 spin_unlock_irqrestore(&logbuf_lock, flags); 1282 spin_unlock_irqrestore(&logbuf_lock, flags);
1246 } 1283 }
1247 release_console_sem(); 1284 release_console_sem();
1285
1286 /*
1287 * By unregistering the bootconsoles after we enable the real console
1288 * we get the "console xxx enabled" message on all the consoles -
1289 * boot consoles, real consoles, etc - this is to ensure that end
1290 * users know there might be something in the kernel's log buffer that
1291 * went to the bootconsole (that they do not see on the real console)
1292 */
1293 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
1294 /* we need to iterate through twice, to make sure we print
1295 * everything out, before we unregister the console(s)
1296 */
1297 printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
1298 newcon->name, newcon->index);
1299 for_each_console(bcon)
1300 if (bcon->flags & CON_BOOT)
1301 unregister_console(bcon);
1302 } else {
1303 printk(KERN_INFO "%sconsole [%s%d] enabled\n",
1304 (newcon->flags & CON_BOOT) ? "boot" : "" ,
1305 newcon->name, newcon->index);
1306 }
1248} 1307}
1249EXPORT_SYMBOL(register_console); 1308EXPORT_SYMBOL(register_console);
1250 1309
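To make the new bootconsole handling concrete, here is a minimal in-kernel sketch of a hypothetical "earlyfoo" bootconsole; it is not from this patch and only shows the flags and messages involved in the reworked register_console().

/*
 * Hypothetical "earlyfoo" bootconsole, sketched to show how the reworked
 * register_console() treats CON_BOOT consoles.
 */
#include <linux/console.h>
#include <linux/init.h>

static void early_foo_write(struct console *con, const char *s, unsigned int n)
{
	/* poll-write the buffer to some early, always-available device */
}

static struct console early_foo_console = {
	.name	= "earlyfoo",
	.write	= early_foo_write,
	.flags	= CON_PRINTBUFFER | CON_BOOT,	/* auto-unregistered later */
	.index	= -1,
};

static int __init setup_early_foo(char *buf)
{
	/*
	 * Prints "bootconsole [earlyfoo0] enabled". If a real console has
	 * already registered, registration is now refused with
	 * "Too late to register bootconsole earlyfoo0". Once a real
	 * CON_CONSDEV console registers, all bootconsoles are unregistered
	 * and "console [...] enabled, bootconsole disabled" is printed.
	 */
	register_console(&early_foo_console);
	return 0;
}
early_param("earlyfoo", setup_early_foo);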
@@ -1287,11 +1346,13 @@ EXPORT_SYMBOL(unregister_console);
1287 1346
1288static int __init disable_boot_consoles(void) 1347static int __init disable_boot_consoles(void)
1289{ 1348{
1290 if (console_drivers != NULL) { 1349 struct console *con;
1291 if (console_drivers->flags & CON_BOOT) { 1350
1351 for_each_console(con) {
1352 if (con->flags & CON_BOOT) {
1292 printk(KERN_INFO "turn off boot console %s%d\n", 1353 printk(KERN_INFO "turn off boot console %s%d\n",
1293 console_drivers->name, console_drivers->index); 1354 con->name, con->index);
1294 return unregister_console(console_drivers); 1355 unregister_console(con);
1295 } 1356 }
1296 } 1357 }
1297 return 0; 1358 return 0;
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
deleted file mode 100644
index 0f2b0b311304..000000000000
--- a/kernel/rcuclassic.c
+++ /dev/null
@@ -1,807 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50#include <linux/time.h>
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59
60/* Definition for rcupdate control block. */
61static struct rcu_ctrlblk rcu_ctrlblk = {
62 .cur = -300,
63 .completed = -300,
64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE,
67};
68
69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
70 .cur = -300,
71 .completed = -300,
72 .pending = -300,
73 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
74 .cpumask = CPU_BITS_NONE,
75};
76
77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
 82 * The counter is a bit degenerate: we do not need to know
83 * how many quiescent states passed, just if there was at least
84 * one since the start of the grace period. Thus just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
97
98static int blimit = 10;
99static int qhimark = 10000;
100static int qlowmark = 100;
101
102#ifdef CONFIG_SMP
103static void force_quiescent_state(struct rcu_data *rdp,
104 struct rcu_ctrlblk *rcp)
105{
106 int cpu;
107 unsigned long flags;
108
109 set_need_resched();
110 spin_lock_irqsave(&rcp->lock, flags);
111 if (unlikely(!rcp->signaled)) {
112 rcp->signaled = 1;
113 /*
114 * Don't send IPI to itself. With irqs disabled,
115 * rdp->cpu is the current cpu.
116 *
117 * cpu_online_mask is updated by the _cpu_down()
118 * using __stop_machine(). Since we're in irqs disabled
 119 * section, __stop_machine() is not executing, hence
120 * the cpu_online_mask is stable.
121 *
122 * However, a cpu might have been offlined _just_ before
123 * we disabled irqs while entering here.
124 * And rcu subsystem might not yet have handled the CPU_DEAD
125 * notification, leading to the offlined cpu's bit
126 * being set in the rcp->cpumask.
127 *
128 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
129 * sending smp_reschedule() to an offlined CPU.
130 */
131 for_each_cpu_and(cpu,
132 to_cpumask(rcp->cpumask), cpu_online_mask) {
133 if (cpu != rdp->cpu)
134 smp_send_reschedule(cpu);
135 }
136 }
137 spin_unlock_irqrestore(&rcp->lock, flags);
138}
139#else
140static inline void force_quiescent_state(struct rcu_data *rdp,
141 struct rcu_ctrlblk *rcp)
142{
143 set_need_resched();
144}
145#endif
146
147static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
148 struct rcu_data *rdp)
149{
150 long batch;
151
152 head->next = NULL;
153 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
154
155 /*
156 * Determine the batch number of this callback.
157 *
158 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
 159 * local variable "batch" and emits code like this:
160 * 1) rdp->batch = rcp->cur + 1 # gets old value
161 * ......
162 * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
163 * then [*nxttail[0], *nxttail[1]) may contain callbacks
 164 * whose batch# = rdp->batch; see the comment of struct rcu_data.
165 */
166 batch = ACCESS_ONCE(rcp->cur) + 1;
167
168 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
169 /* process callbacks */
170 rdp->nxttail[0] = rdp->nxttail[1];
171 rdp->nxttail[1] = rdp->nxttail[2];
172 if (rcu_batch_after(batch - 1, rdp->batch))
173 rdp->nxttail[0] = rdp->nxttail[2];
174 }
175
176 rdp->batch = batch;
177 *rdp->nxttail[2] = head;
178 rdp->nxttail[2] = &head->next;
179
180 if (unlikely(++rdp->qlen > qhimark)) {
181 rdp->blimit = INT_MAX;
182 force_quiescent_state(rdp, &rcu_ctrlblk);
183 }
184}
185
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187
188static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
189{
190 rcp->gp_start = jiffies;
191 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
192}
193
194static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
195{
196 int cpu;
197 long delta;
198 unsigned long flags;
199
200 /* Only let one CPU complain about others per time interval. */
201
202 spin_lock_irqsave(&rcp->lock, flags);
203 delta = jiffies - rcp->jiffies_stall;
204 if (delta < 2 || rcp->cur != rcp->completed) {
205 spin_unlock_irqrestore(&rcp->lock, flags);
206 return;
207 }
208 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
209 spin_unlock_irqrestore(&rcp->lock, flags);
210
211 /* OK, time to rat on our buddy... */
212
213 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
214 for_each_possible_cpu(cpu) {
215 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
216 printk(" %d", cpu);
217 }
218 printk(" (detected by %d, t=%ld jiffies)\n",
219 smp_processor_id(), (long)(jiffies - rcp->gp_start));
220}
221
222static void print_cpu_stall(struct rcu_ctrlblk *rcp)
223{
224 unsigned long flags;
225
226 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
227 smp_processor_id(), jiffies,
228 jiffies - rcp->gp_start);
229 dump_stack();
230 spin_lock_irqsave(&rcp->lock, flags);
231 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
232 rcp->jiffies_stall =
233 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
234 spin_unlock_irqrestore(&rcp->lock, flags);
235 set_need_resched(); /* kick ourselves to get things going. */
236}
237
238static void check_cpu_stall(struct rcu_ctrlblk *rcp)
239{
240 long delta;
241
242 delta = jiffies - rcp->jiffies_stall;
243 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
244 delta >= 0) {
245
246 /* We haven't checked in, so go dump stack. */
247 print_cpu_stall(rcp);
248
249 } else if (rcp->cur != rcp->completed && delta >= 2) {
250
251 /* They had two seconds to dump stack, so complain. */
252 print_other_cpu_stall(rcp);
253 }
254}
255
256#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
257
258static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
259{
260}
261
262static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
263{
264}
265
266#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
267
268/**
269 * call_rcu - Queue an RCU callback for invocation after a grace period.
270 * @head: structure to be used for queueing the RCU updates.
271 * @func: actual update function to be invoked after the grace period
272 *
273 * The update function will be invoked some time after a full grace
274 * period elapses, in other words after all currently executing RCU
275 * read-side critical sections have completed. RCU read-side critical
276 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
277 * and may be nested.
278 */
279void call_rcu(struct rcu_head *head,
280 void (*func)(struct rcu_head *rcu))
281{
282 unsigned long flags;
283
284 head->func = func;
285 local_irq_save(flags);
286 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
287 local_irq_restore(flags);
288}
289EXPORT_SYMBOL_GPL(call_rcu);
290
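The kerneldoc above spells out the call_rcu() contract; the following minimal sketch shows the usual deferred-free pattern. The "struct foo" list is hypothetical and not part of this file, and the API itself is unaffected by this file's removal.

#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/list.h>

struct foo {
	struct list_head list;
	int data;
	struct rcu_head rcu;
};

static LIST_HEAD(foo_list);
static DEFINE_SPINLOCK(foo_lock);

static void foo_reclaim(struct rcu_head *head)
{
	/* Runs only after all pre-existing RCU readers have finished. */
	kfree(container_of(head, struct foo, rcu));
}

static void foo_del(struct foo *fp)
{
	spin_lock(&foo_lock);
	list_del_rcu(&fp->list);	 /* readers may still hold a reference */
	spin_unlock(&foo_lock);
	call_rcu(&fp->rcu, foo_reclaim); /* free once a grace period elapses */
}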
291/**
292 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
293 * @head: structure to be used for queueing the RCU updates.
294 * @func: actual update function to be invoked after the grace period
295 *
296 * The update function will be invoked some time after a full grace
297 * period elapses, in other words after all currently executing RCU
298 * read-side critical sections have completed. call_rcu_bh() assumes
299 * that the read-side critical sections end on completion of a softirq
300 * handler. This means that read-side critical sections in process
301 * context must not be interrupted by softirqs. This interface is to be
302 * used when most of the read-side critical sections are in softirq context.
303 * RCU read-side critical sections are delimited by rcu_read_lock() and
 304 * rcu_read_unlock() if in interrupt context, or rcu_read_lock_bh()
 305 * and rcu_read_unlock_bh() if in process context. These may be nested.
306 */
307void call_rcu_bh(struct rcu_head *head,
308 void (*func)(struct rcu_head *rcu))
309{
310 unsigned long flags;
311
312 head->func = func;
313 local_irq_save(flags);
314 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
315 local_irq_restore(flags);
316}
317EXPORT_SYMBOL_GPL(call_rcu_bh);
318
319/*
320 * Return the number of RCU batches processed thus far. Useful
321 * for debug and statistics.
322 */
323long rcu_batches_completed(void)
324{
325 return rcu_ctrlblk.completed;
326}
327EXPORT_SYMBOL_GPL(rcu_batches_completed);
328
329/*
330 * Return the number of RCU batches processed thus far. Useful
331 * for debug and statistics.
332 */
333long rcu_batches_completed_bh(void)
334{
335 return rcu_bh_ctrlblk.completed;
336}
337EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
338
339/* Raises the softirq for processing rcu_callbacks. */
340static inline void raise_rcu_softirq(void)
341{
342 raise_softirq(RCU_SOFTIRQ);
343}
344
345/*
346 * Invoke the completed RCU callbacks. They are expected to be in
347 * a per-cpu list.
348 */
349static void rcu_do_batch(struct rcu_data *rdp)
350{
351 unsigned long flags;
352 struct rcu_head *next, *list;
353 int count = 0;
354
355 list = rdp->donelist;
356 while (list) {
357 next = list->next;
358 prefetch(next);
359 list->func(list);
360 list = next;
361 if (++count >= rdp->blimit)
362 break;
363 }
364 rdp->donelist = list;
365
366 local_irq_save(flags);
367 rdp->qlen -= count;
368 local_irq_restore(flags);
369 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
370 rdp->blimit = blimit;
371
372 if (!rdp->donelist)
373 rdp->donetail = &rdp->donelist;
374 else
375 raise_rcu_softirq();
376}
377
378/*
379 * Grace period handling:
380 * The grace period handling consists out of two steps:
381 * - A new grace period is started.
382 * This is done by rcu_start_batch. The start is not broadcasted to
383 * all cpus, they must pick this up by comparing rcp->cur with
384 * rdp->quiescbatch. All cpus are recorded in the
385 * rcu_ctrlblk.cpumask bitmap.
386 * - All cpus must go through a quiescent state.
387 * Since the start of the grace period is not broadcasted, at least two
388 * calls to rcu_check_quiescent_state are required:
389 * The first call just notices that a new grace period is running. The
390 * following calls check if there was a quiescent state since the beginning
391 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
392 * the bitmap is empty, then the grace period is completed.
393 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
394 * period (if necessary).
395 */
396
397/*
398 * Register a new batch of callbacks, and start it up if there is currently no
399 * active batch and the batch to be registered has not already occurred.
400 * Caller must hold rcu_ctrlblk.lock.
401 */
402static void rcu_start_batch(struct rcu_ctrlblk *rcp)
403{
404 if (rcp->cur != rcp->pending &&
405 rcp->completed == rcp->cur) {
406 rcp->cur++;
407 record_gp_stall_check_time(rcp);
408
409 /*
410 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
 411 * barrier. Otherwise it can cause tickless idle CPUs to be
 412 * included in rcp->cpumask, which will extend grace periods
413 * unnecessarily.
414 */
415 smp_mb();
416 cpumask_andnot(to_cpumask(rcp->cpumask),
417 cpu_online_mask, nohz_cpu_mask);
418
419 rcp->signaled = 0;
420 }
421}
422
423/*
424 * cpu went through a quiescent state since the beginning of the grace period.
425 * Clear it from the cpu mask and complete the grace period if it was the last
426 * cpu. Start another grace period if someone has further entries pending
427 */
428static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
429{
430 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
431 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
432 /* batch completed ! */
433 rcp->completed = rcp->cur;
434 rcu_start_batch(rcp);
435 }
436}
437
438/*
439 * Check if the cpu has gone through a quiescent state (say context
 440 * switch). If so, and if it hasn't already done so in this RCU
441 * quiescent cycle, then indicate that it has done so.
442 */
443static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
444 struct rcu_data *rdp)
445{
446 unsigned long flags;
447
448 if (rdp->quiescbatch != rcp->cur) {
449 /* start new grace period: */
450 rdp->qs_pending = 1;
451 rdp->passed_quiesc = 0;
452 rdp->quiescbatch = rcp->cur;
453 return;
454 }
455
456 /* Grace period already completed for this cpu?
457 * qs_pending is checked instead of the actual bitmap to avoid
 458 * cacheline thrashing.
459 */
460 if (!rdp->qs_pending)
461 return;
462
463 /*
464 * Was there a quiescent state since the beginning of the grace
465 * period? If no, then exit and wait for the next call.
466 */
467 if (!rdp->passed_quiesc)
468 return;
469 rdp->qs_pending = 0;
470
471 spin_lock_irqsave(&rcp->lock, flags);
472 /*
473 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
474 * during cpu startup. Ignore the quiescent state.
475 */
476 if (likely(rdp->quiescbatch == rcp->cur))
477 cpu_quiet(rdp->cpu, rcp);
478
479 spin_unlock_irqrestore(&rcp->lock, flags);
480}
481
482
483#ifdef CONFIG_HOTPLUG_CPU
484
485/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
 486 * the locking requirements; the list it's pulling from has to belong to a cpu
487 * which is dead and hence not processing interrupts.
488 */
489static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
490 struct rcu_head **tail, long batch)
491{
492 unsigned long flags;
493
494 if (list) {
495 local_irq_save(flags);
496 this_rdp->batch = batch;
497 *this_rdp->nxttail[2] = list;
498 this_rdp->nxttail[2] = tail;
499 local_irq_restore(flags);
500 }
501}
502
503static void __rcu_offline_cpu(struct rcu_data *this_rdp,
504 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
505{
506 unsigned long flags;
507
508 /*
509 * if the cpu going offline owns the grace period
510 * we can block indefinitely waiting for it, so flush
511 * it here
512 */
513 spin_lock_irqsave(&rcp->lock, flags);
514 if (rcp->cur != rcp->completed)
515 cpu_quiet(rdp->cpu, rcp);
516 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
517 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
518 spin_unlock(&rcp->lock);
519
520 this_rdp->qlen += rdp->qlen;
521 local_irq_restore(flags);
522}
523
524static void rcu_offline_cpu(int cpu)
525{
526 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
527 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
528
529 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
530 &per_cpu(rcu_data, cpu));
531 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
532 &per_cpu(rcu_bh_data, cpu));
533 put_cpu_var(rcu_data);
534 put_cpu_var(rcu_bh_data);
535}
536
537#else
538
539static void rcu_offline_cpu(int cpu)
540{
541}
542
543#endif
544
545/*
546 * This does the RCU processing work from softirq context.
547 */
548static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
549 struct rcu_data *rdp)
550{
551 unsigned long flags;
552 long completed_snap;
553
554 if (rdp->nxtlist) {
555 local_irq_save(flags);
556 completed_snap = ACCESS_ONCE(rcp->completed);
557
558 /*
559 * move the other grace-period-completed entries to
560 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
561 */
562 if (!rcu_batch_before(completed_snap, rdp->batch))
563 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
564 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
565 rdp->nxttail[0] = rdp->nxttail[1];
566
567 /*
568 * the grace period for entries in
 569 * [rdp->nxtlist, *rdp->nxttail[0]) has completed, so
 570 * move these entries to the donelist
571 */
572 if (rdp->nxttail[0] != &rdp->nxtlist) {
573 *rdp->donetail = rdp->nxtlist;
574 rdp->donetail = rdp->nxttail[0];
575 rdp->nxtlist = *rdp->nxttail[0];
576 *rdp->donetail = NULL;
577
578 if (rdp->nxttail[1] == rdp->nxttail[0])
579 rdp->nxttail[1] = &rdp->nxtlist;
580 if (rdp->nxttail[2] == rdp->nxttail[0])
581 rdp->nxttail[2] = &rdp->nxtlist;
582 rdp->nxttail[0] = &rdp->nxtlist;
583 }
584
585 local_irq_restore(flags);
586
587 if (rcu_batch_after(rdp->batch, rcp->pending)) {
588 unsigned long flags2;
589
590 /* and start it/schedule start if it's a new batch */
591 spin_lock_irqsave(&rcp->lock, flags2);
592 if (rcu_batch_after(rdp->batch, rcp->pending)) {
593 rcp->pending = rdp->batch;
594 rcu_start_batch(rcp);
595 }
596 spin_unlock_irqrestore(&rcp->lock, flags2);
597 }
598 }
599
600 rcu_check_quiescent_state(rcp, rdp);
601 if (rdp->donelist)
602 rcu_do_batch(rdp);
603}
604
605static void rcu_process_callbacks(struct softirq_action *unused)
606{
607 /*
608 * Memory references from any prior RCU read-side critical sections
 609 * executed by the interrupted code must be seen before any RCU
 610 * grace-period manipulations below.
611 */
612
613 smp_mb(); /* See above block comment. */
614
615 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
616 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
617
618 /*
619 * Memory references from any later RCU read-side critical sections
 620 * executed by the interrupted code must be seen after any RCU
 621 * grace-period manipulations above.
622 */
623
624 smp_mb(); /* See above block comment. */
625}
626
627static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
628{
629 /* Check for CPU stalls, if enabled. */
630 check_cpu_stall(rcp);
631
632 if (rdp->nxtlist) {
633 long completed_snap = ACCESS_ONCE(rcp->completed);
634
635 /*
636 * This cpu has pending rcu entries and the grace period
637 * for them has completed.
638 */
639 if (!rcu_batch_before(completed_snap, rdp->batch))
640 return 1;
641 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
642 rdp->nxttail[0] != rdp->nxttail[1])
643 return 1;
644 if (rdp->nxttail[0] != &rdp->nxtlist)
645 return 1;
646
647 /*
648 * This cpu has pending rcu entries and the new batch
649 * for then hasn't been started nor scheduled start
650 */
651 if (rcu_batch_after(rdp->batch, rcp->pending))
652 return 1;
653 }
654
655 /* This cpu has finished callbacks to invoke */
656 if (rdp->donelist)
657 return 1;
658
659 /* The rcu core waits for a quiescent state from the cpu */
660 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
661 return 1;
662
663 /* nothing to do */
664 return 0;
665}
666
667/*
668 * Check to see if there is any immediate RCU-related work to be done
669 * by the current CPU, returning 1 if so. This function is part of the
670 * RCU implementation; it is -not- an exported member of the RCU API.
671 */
672int rcu_pending(int cpu)
673{
674 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
675 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
676}
677
678/*
679 * Check to see if any future RCU-related work will need to be done
680 * by the current CPU, even if none need be done immediately, returning
681 * 1 if so. This function is part of the RCU implementation; it is -not-
682 * an exported member of the RCU API.
683 */
684int rcu_needs_cpu(int cpu)
685{
686 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
687 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
688
689 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
690}
691
692/*
693 * Top-level function driving RCU grace-period detection, normally
694 * invoked from the scheduler-clock interrupt. This function simply
695 * increments counters that are read only from softirq by this same
696 * CPU, so there are no memory barriers required.
697 */
698void rcu_check_callbacks(int cpu, int user)
699{
700 if (user ||
701 (idle_cpu(cpu) && rcu_scheduler_active &&
702 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
703
704 /*
705 * Get here if this CPU took its interrupt from user
706 * mode or from the idle loop, and if this is not a
707 * nested interrupt. In this case, the CPU is in
708 * a quiescent state, so count it.
709 *
710 * Also do a memory barrier. This is needed to handle
711 * the case where writes from a preempt-disable section
712 * of code get reordered into schedule() by this CPU's
713 * write buffer. The memory barrier makes sure that
 714 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
715 * by other CPUs to happen after any such write.
716 */
717
718 smp_mb(); /* See above block comment. */
719 rcu_qsctr_inc(cpu);
720 rcu_bh_qsctr_inc(cpu);
721
722 } else if (!in_softirq()) {
723
724 /*
725 * Get here if this CPU did not take its interrupt from
726 * softirq, in other words, if it is not interrupting
 727 * an rcu_bh read-side critical section. This is an _bh
728 * critical section, so count it. The memory barrier
729 * is needed for the same reason as is the above one.
730 */
731
732 smp_mb(); /* See above block comment. */
733 rcu_bh_qsctr_inc(cpu);
734 }
735 raise_rcu_softirq();
736}
737
738static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
739 struct rcu_data *rdp)
740{
741 unsigned long flags;
742
743 spin_lock_irqsave(&rcp->lock, flags);
744 memset(rdp, 0, sizeof(*rdp));
745 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
746 rdp->donetail = &rdp->donelist;
747 rdp->quiescbatch = rcp->completed;
748 rdp->qs_pending = 0;
749 rdp->cpu = cpu;
750 rdp->blimit = blimit;
751 spin_unlock_irqrestore(&rcp->lock, flags);
752}
753
754static void __cpuinit rcu_online_cpu(int cpu)
755{
756 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
757 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
758
759 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
760 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
761 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
762}
763
764static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
765 unsigned long action, void *hcpu)
766{
767 long cpu = (long)hcpu;
768
769 switch (action) {
770 case CPU_UP_PREPARE:
771 case CPU_UP_PREPARE_FROZEN:
772 rcu_online_cpu(cpu);
773 break;
774 case CPU_DEAD:
775 case CPU_DEAD_FROZEN:
776 rcu_offline_cpu(cpu);
777 break;
778 default:
779 break;
780 }
781 return NOTIFY_OK;
782}
783
784static struct notifier_block __cpuinitdata rcu_nb = {
785 .notifier_call = rcu_cpu_notify,
786};
787
788/*
 789 * Initializes the RCU mechanism. Assumed to be called early,
 790 * that is, before the local timer (SMP) or jiffy timer (uniprocessor) is set up.
791 * Note that rcu_qsctr and friends are implicitly
792 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
793 */
794void __init __rcu_init(void)
795{
796#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
797 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
798#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
799 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
800 (void *)(long)smp_processor_id());
801 /* Register notifier for non-boot CPUs */
802 register_cpu_notifier(&rcu_nb);
803}
804
805module_param(blimit, int, 0);
806module_param(qhimark, int, 0);
807module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a967c9feb90a..bd5d5c8e5140 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -98,6 +98,30 @@ void synchronize_rcu(void)
98} 98}
99EXPORT_SYMBOL_GPL(synchronize_rcu); 99EXPORT_SYMBOL_GPL(synchronize_rcu);
100 100
101/**
102 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
103 *
104 * Control will return to the caller some time after a full rcu_bh grace
105 * period has elapsed, in other words after all currently executing rcu_bh
106 * read-side critical sections have completed. RCU read-side critical
107 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
108 * and may be nested.
109 */
110void synchronize_rcu_bh(void)
111{
112 struct rcu_synchronize rcu;
113
114 if (rcu_blocking_is_gp())
115 return;
116
117 init_completion(&rcu.completion);
118 /* Will wake me after RCU finished. */
119 call_rcu_bh(&rcu.head, wakeme_after_rcu);
120 /* Wait for it. */
121 wait_for_completion(&rcu.completion);
122}
123EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
124
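A minimal sketch of an updater that pairs with rcu_read_lock_bh() readers (for example in a softirq path) may clarify where the newly exported synchronize_rcu_bh() fits. The "struct bh_cfg" data and its pointer are hypothetical, and updaters are assumed to be serialized externally (e.g. by a mutex).

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct bh_cfg {
	int threshold;
};

static struct bh_cfg *cur_cfg;

/* Reader, called from softirq context (e.g. a NAPI poll handler). */
static int read_threshold(void)
{
	struct bh_cfg *c;
	int t = 0;

	rcu_read_lock_bh();
	c = rcu_dereference(cur_cfg);
	if (c)
		t = c->threshold;
	rcu_read_unlock_bh();
	return t;
}

/* Updater, serialized by the caller. */
static void set_cfg(struct bh_cfg *newcfg)
{
	struct bh_cfg *old = cur_cfg;

	rcu_assign_pointer(cur_cfg, newcfg);
	synchronize_rcu_bh();	/* wait out all rcu_read_lock_bh() readers */
	kfree(old);
}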
101static void rcu_barrier_callback(struct rcu_head *notused) 125static void rcu_barrier_callback(struct rcu_head *notused)
102{ 126{
103 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 127 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -129,6 +153,7 @@ static void rcu_barrier_func(void *type)
129static inline void wait_migrated_callbacks(void) 153static inline void wait_migrated_callbacks(void)
130{ 154{
131 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); 155 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
156 smp_mb(); /* In case we didn't sleep. */
132} 157}
133 158
134/* 159/*
@@ -192,9 +217,13 @@ static void rcu_migrate_callback(struct rcu_head *notused)
192 wake_up(&rcu_migrate_wq); 217 wake_up(&rcu_migrate_wq);
193} 218}
194 219
220extern int rcu_cpu_notify(struct notifier_block *self,
221 unsigned long action, void *hcpu);
222
195static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, 223static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
196 unsigned long action, void *hcpu) 224 unsigned long action, void *hcpu)
197{ 225{
226 rcu_cpu_notify(self, action, hcpu);
198 if (action == CPU_DYING) { 227 if (action == CPU_DYING) {
199 /* 228 /*
200 * preempt_disable() in on_each_cpu() prevents stop_machine(), 229 * preempt_disable() in on_each_cpu() prevents stop_machine(),
@@ -209,7 +238,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
209 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); 238 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
210 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); 239 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
211 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); 240 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
212 } else if (action == CPU_POST_DEAD) { 241 } else if (action == CPU_DOWN_PREPARE) {
242 /* Don't need to wait until next removal operation. */
213 /* rcu_migrate_head is protected by cpu_add_remove_lock */ 243 /* rcu_migrate_head is protected by cpu_add_remove_lock */
214 wait_migrated_callbacks(); 244 wait_migrated_callbacks();
215 } 245 }
@@ -219,8 +249,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
219 249
220void __init rcu_init(void) 250void __init rcu_init(void)
221{ 251{
252 int i;
253
222 __rcu_init(); 254 __rcu_init();
223 hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); 255 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
256
257 /*
258 * We don't need protection against CPU-hotplug here because
259 * this is called early in boot, before either interrupts
260 * or the scheduler are operational.
261 */
262 for_each_online_cpu(i)
263 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
224} 264}
225 265
226void rcu_scheduler_starting(void) 266void rcu_scheduler_starting(void)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
deleted file mode 100644
index beb0e659adcc..000000000000
--- a/kernel/rcupreempt.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * - Added handling of Dynamic Ticks
27 * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
28 * - Steven Rostedt <srostedt@redhat.com>
29 *
30 * Papers: http://www.rdrop.com/users/paulmck/RCU
31 *
32 * Design Document: http://lwn.net/Articles/253651/
33 *
34 * For detailed explanation of Read-Copy Update mechanism see -
35 * Documentation/RCU/ *.txt
36 *
37 */
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/init.h>
41#include <linux/spinlock.h>
42#include <linux/smp.h>
43#include <linux/rcupdate.h>
44#include <linux/interrupt.h>
45#include <linux/sched.h>
46#include <asm/atomic.h>
47#include <linux/bitops.h>
48#include <linux/module.h>
49#include <linux/kthread.h>
50#include <linux/completion.h>
51#include <linux/moduleparam.h>
52#include <linux/percpu.h>
53#include <linux/notifier.h>
54#include <linux/cpu.h>
55#include <linux/random.h>
56#include <linux/delay.h>
57#include <linux/cpumask.h>
58#include <linux/rcupreempt_trace.h>
59#include <asm/byteorder.h>
60
61/*
62 * PREEMPT_RCU data structures.
63 */
64
65/*
66 * GP_STAGES specifies the number of times the state machine has
 67 * to go through all the rcu_try_flip_states (see below)
68 * in a single Grace Period.
69 *
70 * GP in GP_STAGES stands for Grace Period ;)
71 */
72#define GP_STAGES 2
73struct rcu_data {
74 spinlock_t lock; /* Protect rcu_data fields. */
75 long completed; /* Number of last completed batch. */
76 int waitlistcount;
77 struct rcu_head *nextlist;
78 struct rcu_head **nexttail;
79 struct rcu_head *waitlist[GP_STAGES];
80 struct rcu_head **waittail[GP_STAGES];
81 struct rcu_head *donelist; /* from waitlist & waitschedlist */
82 struct rcu_head **donetail;
83 long rcu_flipctr[2];
84 struct rcu_head *nextschedlist;
85 struct rcu_head **nextschedtail;
86 struct rcu_head *waitschedlist;
87 struct rcu_head **waitschedtail;
88 int rcu_sched_sleeping;
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
 101 * Stay here if nothing is happening. Flip the counter if something
102 * starts happening. Denoted by "I"
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
129
130/*
131 * States for rcu_ctrlblk.rcu_sched_sleep.
132 */
133
134enum rcu_sched_sleep_states {
135 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
136 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
137 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
138};
139
140struct rcu_ctrlblk {
141 spinlock_t fliplock; /* Protect state-machine transitions. */
142 long completed; /* Number of last completed batch. */
143 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
144 the rcu state machine */
145 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
146 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148};
149
150struct rcu_dyntick_sched {
151 int dynticks;
152 int dynticks_snap;
153 int sched_qs;
154 int sched_qs_snap;
155 int sched_dynticks_snap;
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
195static struct rcu_ctrlblk rcu_ctrlblk = {
196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
197 .completed = 0,
198 .rcu_try_flip_state = rcu_try_flip_idle_state,
199 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
200 .sched_sleep = rcu_sched_not_sleeping,
201 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
202};
203
204static struct task_struct *rcu_sched_grace_period_task;
205
206#ifdef CONFIG_RCU_TRACE
207static char *rcu_try_flip_state_names[] =
208 { "idle", "waitack", "waitzero", "waitmb" };
209#endif /* #ifdef CONFIG_RCU_TRACE */
210
211static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
212 = CPU_BITS_NONE;
213
214/*
215 * Enum and per-CPU flag to determine when each CPU has seen
216 * the most recent counter flip.
217 */
218
219enum rcu_flip_flag_values {
220 rcu_flip_seen, /* Steady/initial state, last flip seen. */
221 /* Only GP detector can update. */
222 rcu_flipped /* Flip just completed, need confirmation. */
223 /* Only corresponding CPU can update. */
224};
225static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
226 = rcu_flip_seen;
227
228/*
229 * Enum and per-CPU flag to determine when each CPU has executed the
230 * needed memory barrier to fence in memory references from its last RCU
231 * read-side critical section in the just-completed grace period.
232 */
233
234enum rcu_mb_flag_values {
235 rcu_mb_done, /* Steady/initial state, no mb()s required. */
236 /* Only GP detector can update. */
237 rcu_mb_needed /* Flip just completed, need an mb(). */
238 /* Only corresponding CPU can update. */
239};
240static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
241 = rcu_mb_done;
242
243/*
244 * RCU_DATA_ME: find the current CPU's rcu_data structure.
245 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
246 */
247#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
248#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
249
250/*
251 * Helper macro for tracing when the appropriate rcu_data is not
252 * cached in a local variable, but where the CPU number is so cached.
253 */
254#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
255
256/*
257 * Helper macro for tracing when the appropriate rcu_data is not
258 * cached in a local variable.
259 */
260#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
261
262/*
263 * Helper macro for tracing when the appropriate rcu_data is pointed
264 * to by a local variable.
265 */
266#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
267
268#define RCU_SCHED_BATCH_TIME (HZ / 50)
269
270/*
271 * Return the number of RCU batches processed thus far. Useful
272 * for debug and statistics.
273 */
274long rcu_batches_completed(void)
275{
276 return rcu_ctrlblk.completed;
277}
278EXPORT_SYMBOL_GPL(rcu_batches_completed);
279
280void __rcu_read_lock(void)
281{
282 int idx;
283 struct task_struct *t = current;
284 int nesting;
285
286 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
287 if (nesting != 0) {
288
289 /* An earlier rcu_read_lock() covers us, just count it. */
290
291 t->rcu_read_lock_nesting = nesting + 1;
292
293 } else {
294 unsigned long flags;
295
296 /*
297 * We disable interrupts for the following reasons:
298 * - If we get a scheduling-clock interrupt here, and we
299 * end up acking the counter flip, that amounts to a promise
300 * that we will never increment the old counter again.
301 * We would break that promise if the
302 * scheduling-clock interrupt happened between the time
303 * we pick up the .completed field and the time that we
304 * increment our counter.
305 *
306 * - We don't want to be preempted out here.
307 *
308 * NMIs can still occur, of course, and might themselves
309 * contain rcu_read_lock().
310 */
311
312 local_irq_save(flags);
313
314 /*
315 * Outermost nesting of rcu_read_lock(), so increment
316 * the current counter for the current CPU. Use volatile
317 * casts to prevent the compiler from reordering.
318 */
319
320 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
321 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
322
323 /*
324 * Now that the per-CPU counter has been incremented, we
325 * are protected from races with rcu_read_lock() invoked
326 * from NMI handlers on this CPU. We can therefore safely
327 * increment the nesting counter, relieving further NMIs
328 * of the need to increment the per-CPU counter.
329 */
330
331 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
332
333 /*
334 * Now that we have prevented any NMIs from storing
335 * to the ->rcu_flipctr_idx, we can safely use it to
336 * remember which counter to decrement in the matching
337 * rcu_read_unlock().
338 */
339
340 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
341 local_irq_restore(flags);
342 }
343}
344EXPORT_SYMBOL_GPL(__rcu_read_lock);
345
346void __rcu_read_unlock(void)
347{
348 int idx;
349 struct task_struct *t = current;
350 int nesting;
351
352 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
353 if (nesting > 1) {
354
355 /*
356 * We are still protected by the enclosing rcu_read_lock(),
357 * so simply decrement the counter.
358 */
359
360 t->rcu_read_lock_nesting = nesting - 1;
361
362 } else {
363 unsigned long flags;
364
365 /*
366 * Disable local interrupts to prevent the grace-period
367 * detection state machine from seeing us half-done.
368 * NMIs can still occur, of course, and might themselves
369 * contain rcu_read_lock() and rcu_read_unlock().
370 */
371
372 local_irq_save(flags);
373
374 /*
375 * Outermost nesting of rcu_read_unlock(), so we must
376 * decrement the current counter for the current CPU.
377 * This must be done carefully, because NMIs can
378 * occur at any point in this code, and any rcu_read_lock()
379 * and rcu_read_unlock() pairs in the NMI handlers
380 * must interact non-destructively with this code.
381 * Lots of volatile casts, and -very- careful ordering.
382 *
383 * Changes to this code, including this one, must be
384 * inspected, validated, and tested extremely carefully!!!
385 */
386
387 /*
388 * First, pick up the index.
389 */
390
391 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
392
393 /*
394 * Now that we have fetched the counter index, it is
395 * safe to decrement the per-task RCU nesting counter.
396 * After this, any interrupts or NMIs will increment and
397 * decrement the per-CPU counters.
398 */
399 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
400
401 /*
402 * Now that this task's nesting count has been decremented,
403 * NMIs that occur after that statement will route their
404 * rcu_read_lock() calls through this "else" clause, and
405 * will thus start incrementing the per-CPU counter on
406 * their own. They will also clobber ->rcu_flipctr_idx,
407 * but that is OK, since we have already fetched it.
408 */
409
410 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
411 local_irq_restore(flags);
412 }
413}
414EXPORT_SYMBOL_GPL(__rcu_read_unlock);
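/*
 * A minimal reader-side usage sketch (the struct and pointer names are
 * illustrative assumptions, not taken from this file).  The nesting
 * handling above is what allows this pattern to be used from process,
 * irq, and NMI context alike:
 *
 *	struct foo {
 *		int a;
 *	};
 *	static struct foo *gbl_foo;
 *
 *	int read_foo_a(void)
 *	{
 *		struct foo *p;
 *		int ret = -1;
 *
 *		rcu_read_lock();
 *		p = rcu_dereference(gbl_foo);
 *		if (p != NULL)
 *			ret = p->a;
 *		rcu_read_unlock();
 *		return ret;
 *	}
 */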
415
416/*
417 * If a global counter flip has occurred since the last time that we
418 * advanced callbacks, advance them. Hardware interrupts must be
419 * disabled when calling this function.
420 */
421static void __rcu_advance_callbacks(struct rcu_data *rdp)
422{
423 int cpu;
424 int i;
425 int wlc = 0;
426
427 if (rdp->completed != rcu_ctrlblk.completed) {
428 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
429 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
430 rdp->donetail = rdp->waittail[GP_STAGES - 1];
431 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
432 }
433 for (i = GP_STAGES - 2; i >= 0; i--) {
434 if (rdp->waitlist[i] != NULL) {
435 rdp->waitlist[i + 1] = rdp->waitlist[i];
436 rdp->waittail[i + 1] = rdp->waittail[i];
437 wlc++;
438 } else {
439 rdp->waitlist[i + 1] = NULL;
440 rdp->waittail[i + 1] =
441 &rdp->waitlist[i + 1];
442 }
443 }
444 if (rdp->nextlist != NULL) {
445 rdp->waitlist[0] = rdp->nextlist;
446 rdp->waittail[0] = rdp->nexttail;
447 wlc++;
448 rdp->nextlist = NULL;
449 rdp->nexttail = &rdp->nextlist;
450 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
451 } else {
452 rdp->waitlist[0] = NULL;
453 rdp->waittail[0] = &rdp->waitlist[0];
454 }
455 rdp->waitlistcount = wlc;
456 rdp->completed = rcu_ctrlblk.completed;
457 }
458
459 /*
460 * Check to see if this CPU needs to report that it has seen
461 * the most recent counter flip, thereby declaring that all
462 * subsequent rcu_read_lock() invocations will respect this flip.
463 */
464
465 cpu = raw_smp_processor_id();
466 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
467 smp_mb(); /* Subsequent counter accesses must see new value */
468 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
469 smp_mb(); /* Subsequent RCU read-side critical sections */
470 /* seen -after- acknowledgement. */
471 }
472}
473
474#ifdef CONFIG_NO_HZ
475static DEFINE_PER_CPU(int, rcu_update_flag);
476
477/**
478 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
479 *
480 * If the CPU was idle with dynamic ticks active, this updates the
481 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
482 * CPU is active.
483 */
484void rcu_irq_enter(void)
485{
486 int cpu = smp_processor_id();
487 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
488
489 if (per_cpu(rcu_update_flag, cpu))
490 per_cpu(rcu_update_flag, cpu)++;
491
492 /*
493 * Only update if we are coming from a stopped ticks mode
494 * (rcu_dyntick_sched.dynticks is even).
495 */
496 if (!in_interrupt() &&
497 (rdssp->dynticks & 0x1) == 0) {
498 /*
499 * The following might seem like we could have a race
500 * with NMI/SMIs. But this really isn't a problem.
501 * Here we do a read/modify/write, and the race happens
502 * when an NMI/SMI comes in after the read and before
503 * the write. But NMI/SMIs will increment this counter
504 * twice before returning, so the zero bit will not
505 * be corrupted by the NMI/SMI which is the most important
506 * part.
507 *
508 * The only thing is that we would bring the counter back
509 * to the position that it was in during the NMI/SMI.
510 * But the zero bit would be set, so the rest of the
511 * counter would again be ignored.
512 *
513 * On return from the IRQ, the counter's zero bit may be
514 * clear and its value the same as it was on return from
515 * the NMI/SMI. If the state machine is unlucky enough to
516 * see that, it still doesn't matter, since all
517 * RCU read-side critical sections on this CPU would
518 * have already completed.
519 */
520 rdssp->dynticks++;
521 /*
522 * The following memory barrier ensures that any
523 * rcu_read_lock() primitives in the irq handler
524 * are seen by other CPUs to follow the above
525 * increment to rcu_dyntick_sched.dynticks. This is
526 * required in order for other CPUs to correctly
527 * determine when it is safe to advance the RCU
528 * grace-period state machine.
529 */
530 smp_mb(); /* see above block comment. */
531 /*
532 * Since we can't determine the dynamic tick mode from
533 * the rcu_dyntick_sched.dynticks after this routine,
534 * we use a second flag to acknowledge that we came
535 * from an idle state with ticks stopped.
536 */
537 per_cpu(rcu_update_flag, cpu)++;
538 /*
539 * If we take an NMI/SMI now, they will also increment
540 * the rcu_update_flag, and will not update the
541 * rcu_dyntick_sched.dynticks on exit. That is for
542 * this IRQ to do.
543 */
544 }
545}
546
547/**
548 * rcu_irq_exit - Called when exiting hard irq context.
549 *
550 * If the CPU was idle with dynamic ticks active, update the
551 * rcu_dyntick_sched.dynticks to let the RCU handling be
552 * aware that the CPU is going back to idle with no ticks.
553 */
554void rcu_irq_exit(void)
555{
556 int cpu = smp_processor_id();
557 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
558
559 /*
560 * rcu_update_flag is set if we interrupted the CPU
561 * when it was idle with ticks stopped.
562 * Once this occurs, we keep track of interrupt nesting
563 * because an NMI/SMI could also come in, and we still
564 * only want the IRQ that started the increment of the
565 * rcu_dyntick_sched.dynticks to be the one that modifies
566 * it on exit.
567 */
568 if (per_cpu(rcu_update_flag, cpu)) {
569 if (--per_cpu(rcu_update_flag, cpu))
570 return;
571
572 /* This must match the interrupt nesting */
573 WARN_ON(in_interrupt());
574
575 /*
576 * If an NMI/SMI happens now we are still
577 * protected by the rcu_dyntick_sched.dynticks being odd.
578 */
579
580 /*
581 * The following memory barrier ensures that any
582 * rcu_read_unlock() primitives in the irq handler
583 * are seen by other CPUs to precede the following
584 * increment to rcu_dyntick_sched.dynticks. This
585 * is required in order for other CPUs to determine
586 * when it is safe to advance the RCU grace-period
587 * state machine.
588 */
589 smp_mb(); /* see above block comment. */
590 rdssp->dynticks++;
591 WARN_ON(rdssp->dynticks & 0x1);
592 }
593}
594
595void rcu_nmi_enter(void)
596{
597 rcu_irq_enter();
598}
599
600void rcu_nmi_exit(void)
601{
602 rcu_irq_exit();
603}
604
605static void dyntick_save_progress_counter(int cpu)
606{
607 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
608
609 rdssp->dynticks_snap = rdssp->dynticks;
610}
611
612static inline int
613rcu_try_flip_waitack_needed(int cpu)
614{
615 long curr;
616 long snap;
617 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
618
619 curr = rdssp->dynticks;
620 snap = rdssp->dynticks_snap;
621 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
622
623 /*
624 * If the CPU remained in dynticks mode for the entire time
625 * and didn't take any interrupts, NMIs, SMIs, or whatever,
626 * then it cannot be in the middle of an rcu_read_lock(), so
627 * the next rcu_read_lock() it executes must use the new value
628 * of the counter. So we can safely pretend that this CPU
629 * already acknowledged the counter.
630 */
631
632 if ((curr == snap) && ((curr & 0x1) == 0))
633 return 0;
634
635 /*
636 * If the CPU passed through or entered a dynticks idle phase with
637 * no active irq handlers, then, as above, we can safely pretend
638 * that this CPU already acknowledged the counter.
639 */
640
641 if ((curr - snap) > 2 || (curr & 0x1) == 0)
642 return 0;
643
644 /* We need this CPU to explicitly acknowledge the counter flip. */
645
646 return 1;
647}
648
649static inline int
650rcu_try_flip_waitmb_needed(int cpu)
651{
652 long curr;
653 long snap;
654 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
655
656 curr = rdssp->dynticks;
657 snap = rdssp->dynticks_snap;
658 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
659
660 /*
661 * If the CPU remained in dynticks mode for the entire time
662 * and didn't take any interrupts, NMIs, SMIs, or whatever,
663 * then it cannot have executed an RCU read-side critical section
664 * during that time, so there is no need for it to execute a
665 * memory barrier.
666 */
667
668 if ((curr == snap) && ((curr & 0x1) == 0))
669 return 0;
670
671 /*
672 * If the CPU either entered or exited an outermost interrupt,
673 * SMI, NMI, or whatever handler, then we know that it executed
674 * a memory barrier when doing so. So we don't need another one.
675 */
676 if (curr != snap)
677 return 0;
678
679 /* We need the CPU to execute a memory barrier. */
680
681 return 1;
682}
683
684static void dyntick_save_progress_counter_sched(int cpu)
685{
686 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
687
688 rdssp->sched_dynticks_snap = rdssp->dynticks;
689}
690
691static int rcu_qsctr_inc_needed_dyntick(int cpu)
692{
693 long curr;
694 long snap;
695 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
696
697 curr = rdssp->dynticks;
698 snap = rdssp->sched_dynticks_snap;
699 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
700
701 /*
702 * If the CPU remained in dynticks mode for the entire time
703 * and didn't take any interrupts, NMIs, SMIs, or whatever,
704 * then it cannot be in the middle of an rcu_read_lock(), so
705 * the next rcu_read_lock() it executes must use the new value
706 * of the counter. Therefore, this CPU has been in a quiescent
707 * state the entire time, and we don't need to wait for it.
708 */
709
710 if ((curr == snap) && ((curr & 0x1) == 0))
711 return 0;
712
713 /*
714 * If the CPU passed through or entered a dynticks idle phase with
715 * no active irq handlers, then, as above, this CPU has already
716 * passed through a quiescent state.
717 */
718
719 if ((curr - snap) > 2 || (snap & 0x1) == 0)
720 return 0;
721
722 /* We need this CPU to go through a quiescent state. */
723
724 return 1;
725}
726
727#else /* !CONFIG_NO_HZ */
728
729# define dyntick_save_progress_counter(cpu) do { } while (0)
730# define rcu_try_flip_waitack_needed(cpu) (1)
731# define rcu_try_flip_waitmb_needed(cpu) (1)
732
733# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
734# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
735
736#endif /* CONFIG_NO_HZ */
737
738static void save_qsctr_sched(int cpu)
739{
740 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
741
742 rdssp->sched_qs_snap = rdssp->sched_qs;
743}
744
745static inline int rcu_qsctr_inc_needed(int cpu)
746{
747 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
748
749 /*
750 * If there has been a quiescent state, no more need to wait
751 * on this CPU.
752 */
753
754 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
755 smp_mb(); /* force ordering with cpu entering schedule(). */
756 return 0;
757 }
758
759 /* We need this CPU to go through a quiescent state. */
760
761 return 1;
762}
763
764/*
765 * Get here when RCU is idle. Decide whether we need to
766 * move out of idle state, and return non-zero if so.
767 * "Straightforward" approach for the moment, might later
768 * use callback-list lengths, grace-period duration, or
769 * some such to determine when to exit idle state.
770 * Might also need a pre-idle test that does not acquire
771 * the lock, but let's get the simple case working first...
772 */
773
774static int
775rcu_try_flip_idle(void)
776{
777 int cpu;
778
779 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
780 if (!rcu_pending(smp_processor_id())) {
781 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
782 return 0;
783 }
784
785 /*
786 * Do the flip.
787 */
788
789 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
790 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
791
792 /*
793 * Need a memory barrier so that other CPUs see the new
794 * counter value before they see the subsequent change of all
795 * the rcu_flip_flag instances to rcu_flipped.
796 */
797
798 smp_mb(); /* see above block comment. */
799
800 /* Now ask each CPU for acknowledgement of the flip. */
801
802 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
803 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
804 dyntick_save_progress_counter(cpu);
805 }
806
807 return 1;
808}
809
810/*
811 * Wait for CPUs to acknowledge the flip.
812 */
813
814static int
815rcu_try_flip_waitack(void)
816{
817 int cpu;
818
819 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
820 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
821 if (rcu_try_flip_waitack_needed(cpu) &&
822 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
823 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
824 return 0;
825 }
826
827 /*
828 * Make sure our checks above don't bleed into subsequent
829 * waiting for the sum of the counters to reach zero.
830 */
831
832 smp_mb(); /* see above block comment. */
833 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
834 return 1;
835}
836
837/*
838 * Wait for the sum of the "last" counters to reach zero,
839 * then tell all CPUs to do an end-of-grace-period memory barrier.
840 */
841
842static int
843rcu_try_flip_waitzero(void)
844{
845 int cpu;
846 int lastidx = !(rcu_ctrlblk.completed & 0x1);
847 int sum = 0;
848
849 /* Check to see if the sum of the "last" counters is zero. */
850
851 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
852 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
853 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
854 if (sum != 0) {
855 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
856 return 0;
857 }
858
859 /*
860 * This ensures that the other CPUs see the call for
861 * memory barriers -after- the counter sum has been seen
862 * to reach zero here.
863 */
864 smp_mb(); /* ^^^^^^^^^^^^ */
865
866 /* Call for a memory barrier from each CPU. */
867 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
868 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
869 dyntick_save_progress_counter(cpu);
870 }
871
872 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
873 return 1;
874}
875
876/*
877 * Wait for all CPUs to do their end-of-grace-period memory barrier.
878 * Return 1 once all CPUs have done so.
879 */
880
881static int
882rcu_try_flip_waitmb(void)
883{
884 int cpu;
885
886 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
887 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
888 if (rcu_try_flip_waitmb_needed(cpu) &&
889 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
890 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
891 return 0;
892 }
893
894 smp_mb(); /* Ensure that the above checks precede any following flip. */
895 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
896 return 1;
897}
898
899/*
900 * Attempt a single flip of the counters. Remember, a single flip does
901 * -not- constitute a grace period. Instead, the interval between
902 * at least GP_STAGES consecutive flips is a grace period.
903 *
904 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
905 * on a large SMP, they might want to use a hierarchical organization of
906 * the per-CPU-counter pairs.
907 */
908static void rcu_try_flip(void)
909{
910 unsigned long flags;
911
912 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
913 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
914 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
915 return;
916 }
917
918 /*
919 * Take the next transition(s) through the RCU grace-period
920 * flip-counter state machine.
921 */
922
923 switch (rcu_ctrlblk.rcu_try_flip_state) {
924 case rcu_try_flip_idle_state:
925 if (rcu_try_flip_idle())
926 rcu_ctrlblk.rcu_try_flip_state =
927 rcu_try_flip_waitack_state;
928 break;
929 case rcu_try_flip_waitack_state:
930 if (rcu_try_flip_waitack())
931 rcu_ctrlblk.rcu_try_flip_state =
932 rcu_try_flip_waitzero_state;
933 break;
934 case rcu_try_flip_waitzero_state:
935 if (rcu_try_flip_waitzero())
936 rcu_ctrlblk.rcu_try_flip_state =
937 rcu_try_flip_waitmb_state;
938 break;
939 case rcu_try_flip_waitmb_state:
940 if (rcu_try_flip_waitmb())
941 rcu_ctrlblk.rcu_try_flip_state =
942 rcu_try_flip_idle_state;
943 }
944 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
945}
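/*
 * To summarize the state machine above, a full cycle proceeds as:
 *
 *	idle     -> waitack:  ->completed is incremented and each online
 *	                      CPU is asked to acknowledge the new value.
 *	waitack  -> waitzero: every CPU (or its dynticks-idle proxy) has
 *	                      acknowledged the flip.
 *	waitzero -> waitmb:   the "last" counters have summed to zero,
 *	                      and each CPU is asked for a memory barrier.
 *	waitmb   -> idle:     every CPU has executed its memory barrier.
 *
 * Each invocation advances at most one state, so a grace period spans
 * several calls to rcu_try_flip().
 */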
946
947/*
948 * Check to see if this CPU needs to do a memory barrier in order to
949 * ensure that any prior RCU read-side critical sections have committed
950 * their counter manipulations and critical-section memory references
951 * before declaring the grace period to be completed.
952 */
953static void rcu_check_mb(int cpu)
954{
955 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
956 smp_mb(); /* Ensure RCU read-side accesses are visible. */
957 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
958 }
959}
960
961void rcu_check_callbacks(int cpu, int user)
962{
963 unsigned long flags;
964 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
965
966 /*
967 * If this CPU took its interrupt from user mode or from the
968 * idle loop, and this is not a nested interrupt, then
969 * this CPU has to have exited all prior preempt-disable
970 * sections of code. So increment the counter to note this.
971 *
972 * The memory barrier is needed to handle the case where
973 * writes from a preempt-disable section of code get reordered
974 * into schedule() by this CPU's write buffer. So the memory
975 * barrier makes sure that the rcu_qsctr_inc() is seen by other
976 * CPUs to happen after any such write.
977 */
978
979 if (user ||
980 (idle_cpu(cpu) && !in_softirq() &&
981 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
982 smp_mb(); /* Guard against aggressive schedule(). */
983 rcu_qsctr_inc(cpu);
984 }
985
986 rcu_check_mb(cpu);
987 if (rcu_ctrlblk.completed == rdp->completed)
988 rcu_try_flip();
989 spin_lock_irqsave(&rdp->lock, flags);
990 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
991 __rcu_advance_callbacks(rdp);
992 if (rdp->donelist == NULL) {
993 spin_unlock_irqrestore(&rdp->lock, flags);
994 } else {
995 spin_unlock_irqrestore(&rdp->lock, flags);
996 raise_softirq(RCU_SOFTIRQ);
997 }
998}
999
1000/*
1001 * Needed by dynticks, to make sure all RCU processing has finished
1002 * when we go idle:
1003 */
1004void rcu_advance_callbacks(int cpu, int user)
1005{
1006 unsigned long flags;
1007 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1008
1009 if (rcu_ctrlblk.completed == rdp->completed) {
1010 rcu_try_flip();
1011 if (rcu_ctrlblk.completed == rdp->completed)
1012 return;
1013 }
1014 spin_lock_irqsave(&rdp->lock, flags);
1015 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
1016 __rcu_advance_callbacks(rdp);
1017 spin_unlock_irqrestore(&rdp->lock, flags);
1018}
1019
1020#ifdef CONFIG_HOTPLUG_CPU
1021#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
1022 *dsttail = srclist; \
1023 if (srclist != NULL) { \
1024 dsttail = srctail; \
1025 srclist = NULL; \
1026 srctail = &srclist;\
1027 } \
1028 } while (0)
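/*
 * For example, splicing a two-element donelist onto an initially empty
 * "list" behaves as follows:
 *
 *	before:	rdp->donelist -> A -> B, rdp->donetail == &B->next
 *		list == NULL,            tail == &list
 *	after:	list -> A -> B,          tail == &B->next
 *		rdp->donelist == NULL,   rdp->donetail == &rdp->donelist
 */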
1029
1030void rcu_offline_cpu(int cpu)
1031{
1032 int i;
1033 struct rcu_head *list = NULL;
1034 unsigned long flags;
1035 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1036 struct rcu_head *schedlist = NULL;
1037 struct rcu_head **schedtail = &schedlist;
1038 struct rcu_head **tail = &list;
1039
1040 /*
1041 * Remove all callbacks from the newly dead CPU, retaining order.
1042 * Otherwise rcu_barrier() will fail.
1043 */
1044
1045 spin_lock_irqsave(&rdp->lock, flags);
1046 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
1047 for (i = GP_STAGES - 1; i >= 0; i--)
1048 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
1049 list, tail);
1050 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1051 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1052 schedlist, schedtail);
1053 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1054 schedlist, schedtail);
1055 rdp->rcu_sched_sleeping = 0;
1056 spin_unlock_irqrestore(&rdp->lock, flags);
1057 rdp->waitlistcount = 0;
1058
1059 /* Disengage the newly dead CPU from the grace-period computation. */
1060
1061 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1062 rcu_check_mb(cpu);
1063 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
1064 smp_mb(); /* Subsequent counter accesses must see new value */
1065 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
1066 smp_mb(); /* Subsequent RCU read-side critical sections */
1067 /* seen -after- acknowledgement. */
1068 }
1069
1070 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1071 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
1072
1073 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1074 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1075
1076 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1077
1078 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1079
1080 /*
1081 * Place the removed callbacks on the current CPU's queue.
1082 * Make them all start a new grace period: simple approach,
1083 * in theory could starve a given set of callbacks, but
1084 * you would need to be doing some serious CPU hotplugging
1085 * to make this happen. If this becomes a problem, adding
1086 * a synchronize_rcu() to the hotplug path would be a simple
1087 * fix.
1088 */
1089
1090 local_irq_save(flags); /* disable preemption until we know which lock. */
1091 rdp = RCU_DATA_ME();
1092 spin_lock(&rdp->lock);
1093 *rdp->nexttail = list;
1094 if (list)
1095 rdp->nexttail = tail;
1096 *rdp->nextschedtail = schedlist;
1097 if (schedlist)
1098 rdp->nextschedtail = schedtail;
1099 spin_unlock_irqrestore(&rdp->lock, flags);
1100}
1101
1102#else /* #ifdef CONFIG_HOTPLUG_CPU */
1103
1104void rcu_offline_cpu(int cpu)
1105{
1106}
1107
1108#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1109
1110void __cpuinit rcu_online_cpu(int cpu)
1111{
1112 unsigned long flags;
1113 struct rcu_data *rdp;
1114
1115 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1116 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1117 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1118
1119 /*
1120 * The rcu_sched grace-period processing might have bypassed
1121 * this CPU, given that it was not in the rcu_cpu_online_map
1122 * when the grace-period scan started. This means that the
1123 * grace-period task might sleep. So make sure that if this
1124 * should happen, the first callback posted to this CPU will
1125 * wake up the grace-period task if need be.
1126 */
1127
1128 rdp = RCU_DATA_CPU(cpu);
1129 spin_lock_irqsave(&rdp->lock, flags);
1130 rdp->rcu_sched_sleeping = 1;
1131 spin_unlock_irqrestore(&rdp->lock, flags);
1132}
1133
1134static void rcu_process_callbacks(struct softirq_action *unused)
1135{
1136 unsigned long flags;
1137 struct rcu_head *next, *list;
1138 struct rcu_data *rdp;
1139
1140 local_irq_save(flags);
1141 rdp = RCU_DATA_ME();
1142 spin_lock(&rdp->lock);
1143 list = rdp->donelist;
1144 if (list == NULL) {
1145 spin_unlock_irqrestore(&rdp->lock, flags);
1146 return;
1147 }
1148 rdp->donelist = NULL;
1149 rdp->donetail = &rdp->donelist;
1150 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
1151 spin_unlock_irqrestore(&rdp->lock, flags);
1152 while (list) {
1153 next = list->next;
1154 list->func(list);
1155 list = next;
1156 RCU_TRACE_ME(rcupreempt_trace_invoke);
1157 }
1158}
1159
1160void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1161{
1162 unsigned long flags;
1163 struct rcu_data *rdp;
1164
1165 head->func = func;
1166 head->next = NULL;
1167 local_irq_save(flags);
1168 rdp = RCU_DATA_ME();
1169 spin_lock(&rdp->lock);
1170 __rcu_advance_callbacks(rdp);
1171 *rdp->nexttail = head;
1172 rdp->nexttail = &head->next;
1173 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
1174 spin_unlock_irqrestore(&rdp->lock, flags);
1175}
1176EXPORT_SYMBOL_GPL(call_rcu);
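/*
 * A minimal call_rcu() usage sketch, assuming a caller-defined struct
 * foo that embeds an rcu_head (the names are illustrative only):
 *
 *	struct foo {
 *		struct list_head list;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void foo_reclaim(struct rcu_head *head)
 *	{
 *		kfree(container_of(head, struct foo, rcu));
 *	}
 *
 *	static void foo_remove(struct foo *p)
 *	{
 *		list_del_rcu(&p->list);
 *		call_rcu(&p->rcu, foo_reclaim);
 *	}
 *
 * The callback is invoked from rcu_process_callbacks() above once the
 * element has advanced through the wait lists to the donelist, that is,
 * after a full grace period has elapsed.
 */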
1177
1178void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1179{
1180 unsigned long flags;
1181 struct rcu_data *rdp;
1182 int wake_gp = 0;
1183
1184 head->func = func;
1185 head->next = NULL;
1186 local_irq_save(flags);
1187 rdp = RCU_DATA_ME();
1188 spin_lock(&rdp->lock);
1189 *rdp->nextschedtail = head;
1190 rdp->nextschedtail = &head->next;
1191 if (rdp->rcu_sched_sleeping) {
1192
1193 /* Grace-period processing might be sleeping... */
1194
1195 rdp->rcu_sched_sleeping = 0;
1196 wake_gp = 1;
1197 }
1198 spin_unlock_irqrestore(&rdp->lock, flags);
1199 if (wake_gp) {
1200
1201 /* Wake up grace-period processing, unless someone beat us. */
1202
1203 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1204 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1205 wake_gp = 0;
1206 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1207 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1208 if (wake_gp)
1209 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1210 }
1211}
1212EXPORT_SYMBOL_GPL(call_rcu_sched);
1213
1214/*
1215 * Wait until all currently running preempt_disable() code segments
1216 * (including hardware-irq-disable segments) complete. Note that
1217 * in -rt this does -not- necessarily result in all currently executing
1218 * interrupt -handlers- having completed.
1219 */
1220void __synchronize_sched(void)
1221{
1222 struct rcu_synchronize rcu;
1223
1224 if (num_online_cpus() == 1)
1225 return; /* blocking is a grace period if only one CPU! */
1226
1227 init_completion(&rcu.completion);
1228 /* Will wake me after RCU finished. */
1229 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1230 /* Wait for it. */
1231 wait_for_completion(&rcu.completion);
1232}
1233EXPORT_SYMBOL_GPL(__synchronize_sched);
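/*
 * A sketch of the update-side pattern this supports (names are
 * illustrative): readers need only preempt_disable()/preempt_enable(),
 * and the updater waits for all such readers before freeing:
 *
 *	void update_foo(struct foo *newp)
 *	{
 *		struct foo *oldp = gbl_foo;
 *
 *		rcu_assign_pointer(gbl_foo, newp);
 *		synchronize_sched();
 *		kfree(oldp);
 *	}
 */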
1234
1235/*
1236 * kthread function that manages call_rcu_sched grace periods.
1237 */
1238static int rcu_sched_grace_period(void *arg)
1239{
1240 int couldsleep; /* might sleep after current pass. */
1241 int couldsleepnext = 0; /* might sleep after next pass. */
1242 int cpu;
1243 unsigned long flags;
1244 struct rcu_data *rdp;
1245 int ret;
1246
1247 /*
1248 * Each pass through the following loop handles one
1249 * rcu_sched grace period cycle.
1250 */
1251 do {
1252 /* Save each CPU's current state. */
1253
1254 for_each_online_cpu(cpu) {
1255 dyntick_save_progress_counter_sched(cpu);
1256 save_qsctr_sched(cpu);
1257 }
1258
1259 /*
1260 * Sleep for about an RCU grace-period's worth to
1261 * allow better batching and to consume less CPU.
1262 */
1263 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1264
1265 /*
1266 * If there was nothing to do last time, prepare to
1267 * sleep at the end of the current grace period cycle.
1268 */
1269 couldsleep = couldsleepnext;
1270 couldsleepnext = 1;
1271 if (couldsleep) {
1272 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1273 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1274 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1275 }
1276
1277 /*
1278 * Wait on each CPU in turn to have either visited
1279 * a quiescent state or been in dynticks-idle mode.
1280 */
1281 for_each_online_cpu(cpu) {
1282 while (rcu_qsctr_inc_needed(cpu) &&
1283 rcu_qsctr_inc_needed_dyntick(cpu)) {
1284 /* resched_cpu(cpu); @@@ */
1285 schedule_timeout_interruptible(1);
1286 }
1287 }
1288
1289 /* Advance callbacks for each CPU. */
1290
1291 for_each_online_cpu(cpu) {
1292
1293 rdp = RCU_DATA_CPU(cpu);
1294 spin_lock_irqsave(&rdp->lock, flags);
1295
1296 /*
1297 * We are running on this CPU irq-disabled, so no
1298 * CPU can go offline until we re-enable irqs.
1299 * The CPU being processed might have already gone
1300 * offline (between the for_each_online_cpu and
1301 * the spin_lock_irqsave), but in that case all its
1302 * callback lists will be empty, so no harm done.
1303 *
1304 * Advance the callbacks! We share normal RCU's
1305 * donelist, since callbacks are invoked the
1306 * same way in either case.
1307 */
1308 if (rdp->waitschedlist != NULL) {
1309 *rdp->donetail = rdp->waitschedlist;
1310 rdp->donetail = rdp->waitschedtail;
1311
1312 /*
1313 * Next rcu_check_callbacks() will
1314 * do the required raise_softirq().
1315 */
1316 }
1317 if (rdp->nextschedlist != NULL) {
1318 rdp->waitschedlist = rdp->nextschedlist;
1319 rdp->waitschedtail = rdp->nextschedtail;
1320 couldsleep = 0;
1321 couldsleepnext = 0;
1322 } else {
1323 rdp->waitschedlist = NULL;
1324 rdp->waitschedtail = &rdp->waitschedlist;
1325 }
1326 rdp->nextschedlist = NULL;
1327 rdp->nextschedtail = &rdp->nextschedlist;
1328
1329 /* Mark sleep intention. */
1330
1331 rdp->rcu_sched_sleeping = couldsleep;
1332
1333 spin_unlock_irqrestore(&rdp->lock, flags);
1334 }
1335
1336 /* If we saw callbacks on the last scan, go deal with them. */
1337
1338 if (!couldsleep)
1339 continue;
1340
1341 /* Attempt to block... */
1342
1343 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1344 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1345
1346 /*
1347 * Someone posted a callback after we scanned.
1348 * Go take care of it.
1349 */
1350 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1351 couldsleepnext = 0;
1352 continue;
1353 }
1354
1355 /* Block until the next person posts a callback. */
1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret);
1363
1364 couldsleepnext = 0;
1365
1366 } while (!kthread_should_stop());
1367
1368 return 0;
1369}
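/*
 * Each pass of the loop above therefore does, in order: snapshot every
 * online CPU's dynticks and sched_qs counters, sleep for roughly one
 * batch interval, wait for each CPU to either pass through a quiescent
 * state or sit in dynticks idle, splice each CPU's nextsched/waitsched
 * lists forward (sharing the normal donelist for invocation), and
 * finally block on sched_wq if no callbacks were seen during the scan.
 */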
1370
1371/*
1372 * Check to see if any future RCU-related work will need to be done
1373 * by the current CPU, even if none need be done immediately, returning
1374 * 1 if so. Assumes that notifiers would take care of handling any
1375 * outstanding requests from the RCU core.
1376 *
1377 * This function is part of the RCU implementation; it is -not-
1378 * an exported member of the RCU API.
1379 */
1380int rcu_needs_cpu(int cpu)
1381{
1382 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1383
1384 return (rdp->donelist != NULL ||
1385 !!rdp->waitlistcount ||
1386 rdp->nextlist != NULL ||
1387 rdp->nextschedlist != NULL ||
1388 rdp->waitschedlist != NULL);
1389}
1390
1391int rcu_pending(int cpu)
1392{
1393 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1394
1395 /* The CPU has at least one callback queued somewhere. */
1396
1397 if (rdp->donelist != NULL ||
1398 !!rdp->waitlistcount ||
1399 rdp->nextlist != NULL ||
1400 rdp->nextschedlist != NULL ||
1401 rdp->waitschedlist != NULL)
1402 return 1;
1403
1404 /* The RCU core needs an acknowledgement from this CPU. */
1405
1406 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1407 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1408 return 1;
1409
1410 /* This CPU has fallen behind the global grace-period number. */
1411
1412 if (rdp->completed != rcu_ctrlblk.completed)
1413 return 1;
1414
1415 /* Nothing needed from this CPU. */
1416
1417 return 0;
1418}
1419
1420static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1421 unsigned long action, void *hcpu)
1422{
1423 long cpu = (long)hcpu;
1424
1425 switch (action) {
1426 case CPU_UP_PREPARE:
1427 case CPU_UP_PREPARE_FROZEN:
1428 rcu_online_cpu(cpu);
1429 break;
1430 case CPU_UP_CANCELED:
1431 case CPU_UP_CANCELED_FROZEN:
1432 case CPU_DEAD:
1433 case CPU_DEAD_FROZEN:
1434 rcu_offline_cpu(cpu);
1435 break;
1436 default:
1437 break;
1438 }
1439 return NOTIFY_OK;
1440}
1441
1442static struct notifier_block __cpuinitdata rcu_nb = {
1443 .notifier_call = rcu_cpu_notify,
1444};
1445
1446void __init __rcu_init(void)
1447{
1448 int cpu;
1449 int i;
1450 struct rcu_data *rdp;
1451
1452 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1453 for_each_possible_cpu(cpu) {
1454 rdp = RCU_DATA_CPU(cpu);
1455 spin_lock_init(&rdp->lock);
1456 rdp->completed = 0;
1457 rdp->waitlistcount = 0;
1458 rdp->nextlist = NULL;
1459 rdp->nexttail = &rdp->nextlist;
1460 for (i = 0; i < GP_STAGES; i++) {
1461 rdp->waitlist[i] = NULL;
1462 rdp->waittail[i] = &rdp->waitlist[i];
1463 }
1464 rdp->donelist = NULL;
1465 rdp->donetail = &rdp->donelist;
1466 rdp->rcu_flipctr[0] = 0;
1467 rdp->rcu_flipctr[1] = 0;
1468 rdp->nextschedlist = NULL;
1469 rdp->nextschedtail = &rdp->nextschedlist;
1470 rdp->waitschedlist = NULL;
1471 rdp->waitschedtail = &rdp->waitschedlist;
1472 rdp->rcu_sched_sleeping = 0;
1473 }
1474 register_cpu_notifier(&rcu_nb);
1475
1476 /*
1477 * We don't need protection against CPU-Hotplug here
1478 * since
1479 * a) If a CPU comes online while we are iterating over the
1480 * cpu_online_mask below, we would only end up making a
1481 * duplicate call to rcu_online_cpu(), which sets the corresponding
1482 * CPU's bit in the rcu_cpu_online_map.
1483 *
1484 * b) A CPU cannot go offline at this point in time since the user
1485 * does not have access to the sysfs interface, nor do we
1486 * suspend the system.
1487 */
1488 for_each_online_cpu(cpu)
1489 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1490
1491 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1492}
1493
1494/*
1495 * Late-boot-time RCU initialization that must wait until after the scheduler
1496 * has been initialized.
1497 */
1498void __init rcu_init_sched(void)
1499{
1500 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1501 NULL,
1502 "rcu_sched_grace_period");
1503 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1504}
1505
1506#ifdef CONFIG_RCU_TRACE
1507long *rcupreempt_flipctr(int cpu)
1508{
1509 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1510}
1511EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
1512
1513int rcupreempt_flip_flag(int cpu)
1514{
1515 return per_cpu(rcu_flip_flag, cpu);
1516}
1517EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
1518
1519int rcupreempt_mb_flag(int cpu)
1520{
1521 return per_cpu(rcu_mb_flag, cpu);
1522}
1523EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
1524
1525char *rcupreempt_try_flip_state_name(void)
1526{
1527 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
1528}
1529EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
1530
1531struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
1532{
1533 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1534
1535 return &rdp->trace;
1536}
1537EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
1538
1539#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
deleted file mode 100644
index 7c2665cac172..000000000000
--- a/kernel/rcupreempt_trace.c
+++ /dev/null
@@ -1,334 +0,0 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/rcupreempt_trace.h>
44#include <linux/debugfs.h>
45
46static struct mutex rcupreempt_trace_mutex;
47static char *rcupreempt_trace_buf;
48#define RCUPREEMPT_TRACE_BUF_SIZE 4096
49
50void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
51{
52 trace->done_length += trace->wait_length;
53 trace->done_add += trace->wait_length;
54 trace->wait_length = 0;
55}
56void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
57{
58 trace->wait_length += trace->next_length;
59 trace->wait_add += trace->next_length;
60 trace->next_length = 0;
61}
62void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
63{
64 atomic_inc(&trace->rcu_try_flip_1);
65}
66void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
67{
68 atomic_inc(&trace->rcu_try_flip_e1);
69}
70void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
71{
72 trace->rcu_try_flip_i1++;
73}
74void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
75{
76 trace->rcu_try_flip_ie1++;
77}
78void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
79{
80 trace->rcu_try_flip_g1++;
81}
82void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
83{
84 trace->rcu_try_flip_a1++;
85}
86void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
87{
88 trace->rcu_try_flip_ae1++;
89}
90void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
91{
92 trace->rcu_try_flip_a2++;
93}
94void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
95{
96 trace->rcu_try_flip_z1++;
97}
98void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
99{
100 trace->rcu_try_flip_ze1++;
101}
102void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
103{
104 trace->rcu_try_flip_z2++;
105}
106void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
107{
108 trace->rcu_try_flip_m1++;
109}
110void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
111{
112 trace->rcu_try_flip_me1++;
113}
114void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
115{
116 trace->rcu_try_flip_m2++;
117}
118void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
119{
120 trace->rcu_check_callbacks++;
121}
122void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
123{
124 trace->done_remove += trace->done_length;
125 trace->done_length = 0;
126}
127void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
128{
129 atomic_inc(&trace->done_invoked);
130}
131void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
132{
133 trace->next_add++;
134 trace->next_length++;
135}
136
137static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
138{
139 struct rcupreempt_trace *cp;
140 int cpu;
141
142 memset(sp, 0, sizeof(*sp));
143 for_each_possible_cpu(cpu) {
144 cp = rcupreempt_trace_cpu(cpu);
145 sp->next_length += cp->next_length;
146 sp->next_add += cp->next_add;
147 sp->wait_length += cp->wait_length;
148 sp->wait_add += cp->wait_add;
149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove;
152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 &sp->rcu_try_flip_1);
156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
161 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
162 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
163 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
164 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
165 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
166 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
167 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
168 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
169 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
170 }
171}
172
173static ssize_t rcustats_read(struct file *filp, char __user *buffer,
174 size_t count, loff_t *ppos)
175{
176 struct rcupreempt_trace trace;
177 ssize_t bcount;
178 int cnt = 0;
179
180 rcupreempt_trace_sum(&trace);
181 mutex_lock(&rcupreempt_trace_mutex);
182 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
183 "ggp=%ld rcc=%ld\n",
184 rcu_batches_completed(),
185 trace.rcu_check_callbacks);
186 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
187 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
188 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
189 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
190
191 trace.next_add, trace.next_length,
192 trace.wait_add, trace.wait_length,
193 trace.done_add, trace.done_length,
194 trace.done_remove, atomic_read(&trace.done_invoked),
195 atomic_read(&trace.rcu_try_flip_1),
196 atomic_read(&trace.rcu_try_flip_e1),
197 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
198 trace.rcu_try_flip_g1,
199 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
200 trace.rcu_try_flip_a2,
201 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
202 trace.rcu_try_flip_z2,
203 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
204 trace.rcu_try_flip_m2);
205 bcount = simple_read_from_buffer(buffer, count, ppos,
206 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
207 mutex_unlock(&rcupreempt_trace_mutex);
208 return bcount;
209}
210
211static ssize_t rcugp_read(struct file *filp, char __user *buffer,
212 size_t count, loff_t *ppos)
213{
214 long oldgp = rcu_batches_completed();
215 ssize_t bcount;
216
217 mutex_lock(&rcupreempt_trace_mutex);
218 synchronize_rcu();
219 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
220 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
221 bcount = simple_read_from_buffer(buffer, count, ppos,
222 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
223 mutex_unlock(&rcupreempt_trace_mutex);
224 return bcount;
225}
226
227static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
228 size_t count, loff_t *ppos)
229{
230 int cnt = 0;
231 int cpu;
232 int f = rcu_batches_completed() & 0x1;
233 ssize_t bcount;
234
235 mutex_lock(&rcupreempt_trace_mutex);
236
237 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
238 "CPU last cur F M\n");
239 for_each_online_cpu(cpu) {
240 long *flipctr = rcupreempt_flipctr(cpu);
241 cnt += snprintf(&rcupreempt_trace_buf[cnt],
242 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
243 "%3d %4ld %3ld %d %d\n",
244 cpu,
245 flipctr[!f],
246 flipctr[f],
247 rcupreempt_flip_flag(cpu),
248 rcupreempt_mb_flag(cpu));
249 }
250 cnt += snprintf(&rcupreempt_trace_buf[cnt],
251 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
252 "ggp = %ld, state = %s\n",
253 rcu_batches_completed(),
254 rcupreempt_try_flip_state_name());
255 cnt += snprintf(&rcupreempt_trace_buf[cnt],
256 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
257 "\n");
258 bcount = simple_read_from_buffer(buffer, count, ppos,
259 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
260 mutex_unlock(&rcupreempt_trace_mutex);
261 return bcount;
262}
263
264static struct file_operations rcustats_fops = {
265 .owner = THIS_MODULE,
266 .read = rcustats_read,
267};
268
269static struct file_operations rcugp_fops = {
270 .owner = THIS_MODULE,
271 .read = rcugp_read,
272};
273
274static struct file_operations rcuctrs_fops = {
275 .owner = THIS_MODULE,
276 .read = rcuctrs_read,
277};
278
279static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
280static int rcupreempt_debugfs_init(void)
281{
282 rcudir = debugfs_create_dir("rcu", NULL);
283 if (!rcudir)
284 goto out;
285 statdir = debugfs_create_file("rcustats", 0444, rcudir,
286 NULL, &rcustats_fops);
287 if (!statdir)
288 goto free_out;
289
290 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
291 if (!gpdir)
292 goto free_out;
293
294 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
295 NULL, &rcuctrs_fops);
296 if (!ctrsdir)
297 goto free_out;
298 return 0;
299free_out:
300 if (statdir)
301 debugfs_remove(statdir);
302 if (gpdir)
303 debugfs_remove(gpdir);
304 debugfs_remove(rcudir);
305out:
306 return 1;
307}
308
309static int __init rcupreempt_trace_init(void)
310{
311 int ret;
312
313 mutex_init(&rcupreempt_trace_mutex);
314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
315 if (!rcupreempt_trace_buf)
316 return 1;
317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
321}
322
323static void __exit rcupreempt_trace_cleanup(void)
324{
325 debugfs_remove(statdir);
326 debugfs_remove(gpdir);
327 debugfs_remove(ctrsdir);
328 debugfs_remove(rcudir);
329 kfree(rcupreempt_trace_buf);
330}
331
332
333module_init(rcupreempt_trace_init);
334module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975a4b4a..b33db539a8ad 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -257,14 +257,14 @@ struct rcu_torture_ops {
257 void (*init)(void); 257 void (*init)(void);
258 void (*cleanup)(void); 258 void (*cleanup)(void);
259 int (*readlock)(void); 259 int (*readlock)(void);
260 void (*readdelay)(struct rcu_random_state *rrsp); 260 void (*read_delay)(struct rcu_random_state *rrsp);
261 void (*readunlock)(int idx); 261 void (*readunlock)(int idx);
262 int (*completed)(void); 262 int (*completed)(void);
263 void (*deferredfree)(struct rcu_torture *p); 263 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 264 void (*sync)(void);
265 void (*cb_barrier)(void); 265 void (*cb_barrier)(void);
266 int (*stats)(char *page); 266 int (*stats)(char *page);
267 int irqcapable; 267 int irq_capable;
268 char *name; 268 char *name;
269}; 269};
270static struct rcu_torture_ops *cur_ops = NULL; 270static struct rcu_torture_ops *cur_ops = NULL;
@@ -320,7 +320,7 @@ rcu_torture_cb(struct rcu_head *p)
320 rp->rtort_mbtest = 0; 320 rp->rtort_mbtest = 0;
321 rcu_torture_free(rp); 321 rcu_torture_free(rp);
322 } else 322 } else
323 cur_ops->deferredfree(rp); 323 cur_ops->deferred_free(rp);
324} 324}
325 325
326static void rcu_torture_deferred_free(struct rcu_torture *p) 326static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -329,18 +329,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
329} 329}
330 330
331static struct rcu_torture_ops rcu_ops = { 331static struct rcu_torture_ops rcu_ops = {
332 .init = NULL, 332 .init = NULL,
333 .cleanup = NULL, 333 .cleanup = NULL,
334 .readlock = rcu_torture_read_lock, 334 .readlock = rcu_torture_read_lock,
335 .readdelay = rcu_read_delay, 335 .read_delay = rcu_read_delay,
336 .readunlock = rcu_torture_read_unlock, 336 .readunlock = rcu_torture_read_unlock,
337 .completed = rcu_torture_completed, 337 .completed = rcu_torture_completed,
338 .deferredfree = rcu_torture_deferred_free, 338 .deferred_free = rcu_torture_deferred_free,
339 .sync = synchronize_rcu, 339 .sync = synchronize_rcu,
340 .cb_barrier = rcu_barrier, 340 .cb_barrier = rcu_barrier,
341 .stats = NULL, 341 .stats = NULL,
342 .irqcapable = 1, 342 .irq_capable = 1,
343 .name = "rcu" 343 .name = "rcu"
344}; 344};
345 345
346static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 346static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -370,18 +370,18 @@ static void rcu_sync_torture_init(void)
370} 370}
371 371
372static struct rcu_torture_ops rcu_sync_ops = { 372static struct rcu_torture_ops rcu_sync_ops = {
373 .init = rcu_sync_torture_init, 373 .init = rcu_sync_torture_init,
374 .cleanup = NULL, 374 .cleanup = NULL,
375 .readlock = rcu_torture_read_lock, 375 .readlock = rcu_torture_read_lock,
376 .readdelay = rcu_read_delay, 376 .read_delay = rcu_read_delay,
377 .readunlock = rcu_torture_read_unlock, 377 .readunlock = rcu_torture_read_unlock,
378 .completed = rcu_torture_completed, 378 .completed = rcu_torture_completed,
379 .deferredfree = rcu_sync_torture_deferred_free, 379 .deferred_free = rcu_sync_torture_deferred_free,
380 .sync = synchronize_rcu, 380 .sync = synchronize_rcu,
381 .cb_barrier = NULL, 381 .cb_barrier = NULL,
382 .stats = NULL, 382 .stats = NULL,
383 .irqcapable = 1, 383 .irq_capable = 1,
384 .name = "rcu_sync" 384 .name = "rcu_sync"
385}; 385};
386 386
387/* 387/*
@@ -432,33 +432,33 @@ static void rcu_bh_torture_synchronize(void)
432} 432}
433 433
434static struct rcu_torture_ops rcu_bh_ops = { 434static struct rcu_torture_ops rcu_bh_ops = {
435 .init = NULL, 435 .init = NULL,
436 .cleanup = NULL, 436 .cleanup = NULL,
437 .readlock = rcu_bh_torture_read_lock, 437 .readlock = rcu_bh_torture_read_lock,
438 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 438 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
439 .readunlock = rcu_bh_torture_read_unlock, 439 .readunlock = rcu_bh_torture_read_unlock,
440 .completed = rcu_bh_torture_completed, 440 .completed = rcu_bh_torture_completed,
441 .deferredfree = rcu_bh_torture_deferred_free, 441 .deferred_free = rcu_bh_torture_deferred_free,
442 .sync = rcu_bh_torture_synchronize, 442 .sync = rcu_bh_torture_synchronize,
443 .cb_barrier = rcu_barrier_bh, 443 .cb_barrier = rcu_barrier_bh,
444 .stats = NULL, 444 .stats = NULL,
445 .irqcapable = 1, 445 .irq_capable = 1,
446 .name = "rcu_bh" 446 .name = "rcu_bh"
447}; 447};
448 448
449static struct rcu_torture_ops rcu_bh_sync_ops = { 449static struct rcu_torture_ops rcu_bh_sync_ops = {
450 .init = rcu_sync_torture_init, 450 .init = rcu_sync_torture_init,
451 .cleanup = NULL, 451 .cleanup = NULL,
452 .readlock = rcu_bh_torture_read_lock, 452 .readlock = rcu_bh_torture_read_lock,
453 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 453 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
454 .readunlock = rcu_bh_torture_read_unlock, 454 .readunlock = rcu_bh_torture_read_unlock,
455 .completed = rcu_bh_torture_completed, 455 .completed = rcu_bh_torture_completed,
456 .deferredfree = rcu_sync_torture_deferred_free, 456 .deferred_free = rcu_sync_torture_deferred_free,
457 .sync = rcu_bh_torture_synchronize, 457 .sync = rcu_bh_torture_synchronize,
458 .cb_barrier = NULL, 458 .cb_barrier = NULL,
459 .stats = NULL, 459 .stats = NULL,
460 .irqcapable = 1, 460 .irq_capable = 1,
461 .name = "rcu_bh_sync" 461 .name = "rcu_bh_sync"
462}; 462};
463 463
464/* 464/*
@@ -530,17 +530,17 @@ static int srcu_torture_stats(char *page)
530} 530}
531 531
532static struct rcu_torture_ops srcu_ops = { 532static struct rcu_torture_ops srcu_ops = {
533 .init = srcu_torture_init, 533 .init = srcu_torture_init,
534 .cleanup = srcu_torture_cleanup, 534 .cleanup = srcu_torture_cleanup,
535 .readlock = srcu_torture_read_lock, 535 .readlock = srcu_torture_read_lock,
536 .readdelay = srcu_read_delay, 536 .read_delay = srcu_read_delay,
537 .readunlock = srcu_torture_read_unlock, 537 .readunlock = srcu_torture_read_unlock,
538 .completed = srcu_torture_completed, 538 .completed = srcu_torture_completed,
539 .deferredfree = rcu_sync_torture_deferred_free, 539 .deferred_free = rcu_sync_torture_deferred_free,
540 .sync = srcu_torture_synchronize, 540 .sync = srcu_torture_synchronize,
541 .cb_barrier = NULL, 541 .cb_barrier = NULL,
542 .stats = srcu_torture_stats, 542 .stats = srcu_torture_stats,
543 .name = "srcu" 543 .name = "srcu"
544}; 544};
545 545
546/* 546/*
@@ -574,32 +574,49 @@ static void sched_torture_synchronize(void)
574} 574}
575 575
576static struct rcu_torture_ops sched_ops = { 576static struct rcu_torture_ops sched_ops = {
577 .init = rcu_sync_torture_init, 577 .init = rcu_sync_torture_init,
578 .cleanup = NULL, 578 .cleanup = NULL,
579 .readlock = sched_torture_read_lock, 579 .readlock = sched_torture_read_lock,
580 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 580 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
581 .readunlock = sched_torture_read_unlock, 581 .readunlock = sched_torture_read_unlock,
582 .completed = sched_torture_completed, 582 .completed = sched_torture_completed,
583 .deferredfree = rcu_sched_torture_deferred_free, 583 .deferred_free = rcu_sched_torture_deferred_free,
584 .sync = sched_torture_synchronize, 584 .sync = sched_torture_synchronize,
585 .cb_barrier = rcu_barrier_sched, 585 .cb_barrier = rcu_barrier_sched,
586 .stats = NULL, 586 .stats = NULL,
587 .irqcapable = 1, 587 .irq_capable = 1,
588 .name = "sched" 588 .name = "sched"
589}; 589};
590 590
591static struct rcu_torture_ops sched_ops_sync = { 591static struct rcu_torture_ops sched_ops_sync = {
592 .init = rcu_sync_torture_init, 592 .init = rcu_sync_torture_init,
593 .cleanup = NULL, 593 .cleanup = NULL,
594 .readlock = sched_torture_read_lock, 594 .readlock = sched_torture_read_lock,
595 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 595 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
596 .readunlock = sched_torture_read_unlock, 596 .readunlock = sched_torture_read_unlock,
597 .completed = sched_torture_completed, 597 .completed = sched_torture_completed,
598 .deferredfree = rcu_sync_torture_deferred_free, 598 .deferred_free = rcu_sync_torture_deferred_free,
599 .sync = sched_torture_synchronize, 599 .sync = sched_torture_synchronize,
600 .cb_barrier = NULL, 600 .cb_barrier = NULL,
601 .stats = NULL, 601 .stats = NULL,
602 .name = "sched_sync" 602 .name = "sched_sync"
603};
604
605extern int rcu_expedited_torture_stats(char *page);
606
607static struct rcu_torture_ops sched_expedited_ops = {
608 .init = rcu_sync_torture_init,
609 .cleanup = NULL,
610 .readlock = sched_torture_read_lock,
611 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
612 .readunlock = sched_torture_read_unlock,
613 .completed = sched_torture_completed,
614 .deferred_free = rcu_sync_torture_deferred_free,
615 .sync = synchronize_sched_expedited,
616 .cb_barrier = NULL,
617 .stats = rcu_expedited_torture_stats,
618 .irq_capable = 1,
619 .name = "sched_expedited"
603}; 620};
604 621
605/* 622/*
@@ -635,7 +652,7 @@ rcu_torture_writer(void *arg)
635 i = RCU_TORTURE_PIPE_LEN; 652 i = RCU_TORTURE_PIPE_LEN;
636 atomic_inc(&rcu_torture_wcount[i]); 653 atomic_inc(&rcu_torture_wcount[i]);
637 old_rp->rtort_pipe_count++; 654 old_rp->rtort_pipe_count++;
638 cur_ops->deferredfree(old_rp); 655 cur_ops->deferred_free(old_rp);
639 } 656 }
640 rcu_torture_current_version++; 657 rcu_torture_current_version++;
641 oldbatch = cur_ops->completed(); 658 oldbatch = cur_ops->completed();
@@ -700,7 +717,7 @@ static void rcu_torture_timer(unsigned long unused)
700 if (p->rtort_mbtest == 0) 717 if (p->rtort_mbtest == 0)
701 atomic_inc(&n_rcu_torture_mberror); 718 atomic_inc(&n_rcu_torture_mberror);
702 spin_lock(&rand_lock); 719 spin_lock(&rand_lock);
703 cur_ops->readdelay(&rand); 720 cur_ops->read_delay(&rand);
704 n_rcu_torture_timers++; 721 n_rcu_torture_timers++;
705 spin_unlock(&rand_lock); 722 spin_unlock(&rand_lock);
706 preempt_disable(); 723 preempt_disable();
@@ -738,11 +755,11 @@ rcu_torture_reader(void *arg)
738 755
739 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 756 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
740 set_user_nice(current, 19); 757 set_user_nice(current, 19);
741 if (irqreader && cur_ops->irqcapable) 758 if (irqreader && cur_ops->irq_capable)
742 setup_timer_on_stack(&t, rcu_torture_timer, 0); 759 setup_timer_on_stack(&t, rcu_torture_timer, 0);
743 760
744 do { 761 do {
745 if (irqreader && cur_ops->irqcapable) { 762 if (irqreader && cur_ops->irq_capable) {
746 if (!timer_pending(&t)) 763 if (!timer_pending(&t))
747 mod_timer(&t, 1); 764 mod_timer(&t, 1);
748 } 765 }
@@ -757,7 +774,7 @@ rcu_torture_reader(void *arg)
757 } 774 }
758 if (p->rtort_mbtest == 0) 775 if (p->rtort_mbtest == 0)
759 atomic_inc(&n_rcu_torture_mberror); 776 atomic_inc(&n_rcu_torture_mberror);
760 cur_ops->readdelay(&rand); 777 cur_ops->read_delay(&rand);
761 preempt_disable(); 778 preempt_disable();
762 pipe_count = p->rtort_pipe_count; 779 pipe_count = p->rtort_pipe_count;
763 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 780 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -778,7 +795,7 @@ rcu_torture_reader(void *arg)
778 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 795 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
779 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 796 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
780 rcutorture_shutdown_absorb("rcu_torture_reader"); 797 rcutorture_shutdown_absorb("rcu_torture_reader");
781 if (irqreader && cur_ops->irqcapable) 798 if (irqreader && cur_ops->irq_capable)
782 del_timer_sync(&t); 799 del_timer_sync(&t);
783 while (!kthread_should_stop()) 800 while (!kthread_should_stop())
784 schedule_timeout_uninterruptible(1); 801 schedule_timeout_uninterruptible(1);
@@ -1078,6 +1095,7 @@ rcu_torture_init(void)
1078 int firsterr = 0; 1095 int firsterr = 0;
1079 static struct rcu_torture_ops *torture_ops[] = 1096 static struct rcu_torture_ops *torture_ops[] =
1080 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1097 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1098 &sched_expedited_ops,
1081 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1099 &srcu_ops, &sched_ops, &sched_ops_sync, };
1082 1100
1083 mutex_lock(&fullstop_mutex); 1101 mutex_lock(&fullstop_mutex);
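For reference, a minimal userspace sketch of the ops-table pattern the rcutorture changes above extend (field renames plus the new sched_expedited flavor): each RCU flavor is a structure of function pointers that rcu_torture_init() selects by name. All identifiers below are illustrative, not the kernel's.

/*
 * Illustrative only: a userspace model of the rcu_torture_ops selection
 * pattern, not kernel code.
 */
#include <stdio.h>
#include <string.h>

struct torture_ops {
        void (*readlock)(void);        /* enter read-side critical section */
        void (*readunlock)(void);      /* leave read-side critical section */
        int irq_capable;               /* may the reader run from a timer/irq? */
        const char *name;
};

static void noop_lock(void)   { }
static void noop_unlock(void) { }

static struct torture_ops flavor_a = {
        .readlock = noop_lock, .readunlock = noop_unlock,
        .irq_capable = 1, .name = "flavor_a",
};
static struct torture_ops flavor_b = {
        .readlock = noop_lock, .readunlock = noop_unlock,
        .irq_capable = 0, .name = "flavor_b",
};

static struct torture_ops *select_flavor(const char *name)
{
        struct torture_ops *table[] = { &flavor_a, &flavor_b, NULL };
        int i;

        for (i = 0; table[i] != NULL; i++)
                if (strcmp(table[i]->name, name) == 0)
                        return table[i];
        return NULL;    /* unknown flavor, as rcu_torture_init() reports */
}

int main(void)
{
        struct torture_ops *ops = select_flavor("flavor_a");

        if (ops != NULL) {
                ops->readlock();
                printf("selected %s, irq_capable=%d\n", ops->name, ops->irq_capable);
                ops->readunlock();
        }
        return 0;
}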
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7717b95c2027..6b11b07cfe7f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -35,6 +35,7 @@
35#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <linux/bitops.h> 40#include <linux/bitops.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -46,6 +47,8 @@
46#include <linux/mutex.h> 47#include <linux/mutex.h>
47#include <linux/time.h> 48#include <linux/time.h>
48 49
50#include "rcutree.h"
51
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 54struct lockdep_map rcu_lock_map =
@@ -72,30 +75,59 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
72 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
73} 76}
74 77
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); 78struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77 80
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 83
84extern long rcu_batches_completed_sched(void);
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100
101#include "rcutree_plugin.h"
102
81/* 103/*
82 * Increment the quiescent state counter. 104 * Note a quiescent state. Because we do not need to know
83 * The counter is a bit degenerated: We do not need to know
84 * how many quiescent states passed, just if there was at least 105 * how many quiescent states passed, just if there was at least
85 * one since the start of the grace period. Thus just a flag. 106 * one since the start of the grace period, this just sets a flag.
86 */ 107 */
87void rcu_qsctr_inc(int cpu) 108void rcu_sched_qs(int cpu)
88{ 109{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 110 unsigned long flags;
111 struct rcu_data *rdp;
112
113 local_irq_save(flags);
114 rdp = &per_cpu(rcu_sched_data, cpu);
90 rdp->passed_quiesc = 1; 115 rdp->passed_quiesc = 1;
91 rdp->passed_quiesc_completed = rdp->completed; 116 rdp->passed_quiesc_completed = rdp->completed;
117 rcu_preempt_qs(cpu);
118 local_irq_restore(flags);
92} 119}
93 120
94void rcu_bh_qsctr_inc(int cpu) 121void rcu_bh_qs(int cpu)
95{ 122{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 123 unsigned long flags;
124 struct rcu_data *rdp;
125
126 local_irq_save(flags);
127 rdp = &per_cpu(rcu_bh_data, cpu);
97 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
98 rdp->passed_quiesc_completed = rdp->completed; 129 rdp->passed_quiesc_completed = rdp->completed;
130 local_irq_restore(flags);
99} 131}
100 132
101#ifdef CONFIG_NO_HZ 133#ifdef CONFIG_NO_HZ
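For reference, a toy model of the quiescent-state noting that rcu_sched_qs() and rcu_bh_qs() perform above: only a per-CPU flag and a snapshot of the completed-grace-period count are recorded. The structure and names below are simplified assumptions, not kernel code.

/* Illustrative userspace model of "note a quiescent state". */
#include <stdio.h>

#define NR_CPUS 4

struct toy_rcu_data {
        long completed;                 /* last completed GP this CPU saw */
        long passed_quiesc_completed;   /* value of completed at time of QS */
        int passed_quiesc;              /* flag: QS seen since GP start? */
};

static struct toy_rcu_data toy_data[NR_CPUS];

static void toy_sched_qs(int cpu)
{
        struct toy_rcu_data *rdp = &toy_data[cpu];

        rdp->passed_quiesc = 1;
        rdp->passed_quiesc_completed = rdp->completed;
}

int main(void)
{
        toy_data[1].completed = 41;
        toy_sched_qs(1);
        printf("cpu1: passed_quiesc=%d at completed=%ld\n",
               toy_data[1].passed_quiesc,
               toy_data[1].passed_quiesc_completed);
        return 0;
}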
@@ -110,15 +142,16 @@ static int qhimark = 10000; /* If this many pending, ignore blimit. */
110static int qlowmark = 100; /* Once only this many pending, use blimit. */ 142static int qlowmark = 100; /* Once only this many pending, use blimit. */
111 143
112static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 144static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
145static int rcu_pending(int cpu);
113 146
114/* 147/*
115 * Return the number of RCU batches processed thus far for debug & stats. 148 * Return the number of RCU-sched batches processed thus far for debug & stats.
116 */ 149 */
117long rcu_batches_completed(void) 150long rcu_batches_completed_sched(void)
118{ 151{
119 return rcu_state.completed; 152 return rcu_sched_state.completed;
120} 153}
121EXPORT_SYMBOL_GPL(rcu_batches_completed); 154EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
122 155
123/* 156/*
124 * Return the number of RCU BH batches processed thus far for debug & stats. 157 * Return the number of RCU BH batches processed thus far for debug & stats.
@@ -181,6 +214,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
181 return 1; 214 return 1;
182 } 215 }
183 216
217 /* If preemptable RCU, no point in sending reschedule IPI. */
218 if (rdp->preemptable)
219 return 0;
220
184 /* The CPU is online, so send it a reschedule IPI. */ 221 /* The CPU is online, so send it a reschedule IPI. */
185 if (rdp->cpu != smp_processor_id()) 222 if (rdp->cpu != smp_processor_id())
186 smp_send_reschedule(rdp->cpu); 223 smp_send_reschedule(rdp->cpu);
@@ -193,7 +230,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
193#endif /* #ifdef CONFIG_SMP */ 230#endif /* #ifdef CONFIG_SMP */
194 231
195#ifdef CONFIG_NO_HZ 232#ifdef CONFIG_NO_HZ
196static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
197 233
198/** 234/**
199 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 235 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
@@ -213,7 +249,7 @@ void rcu_enter_nohz(void)
213 rdtp = &__get_cpu_var(rcu_dynticks); 249 rdtp = &__get_cpu_var(rcu_dynticks);
214 rdtp->dynticks++; 250 rdtp->dynticks++;
215 rdtp->dynticks_nesting--; 251 rdtp->dynticks_nesting--;
216 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 252 WARN_ON_ONCE(rdtp->dynticks & 0x1);
217 local_irq_restore(flags); 253 local_irq_restore(flags);
218} 254}
219 255
@@ -232,7 +268,7 @@ void rcu_exit_nohz(void)
232 rdtp = &__get_cpu_var(rcu_dynticks); 268 rdtp = &__get_cpu_var(rcu_dynticks);
233 rdtp->dynticks++; 269 rdtp->dynticks++;
234 rdtp->dynticks_nesting++; 270 rdtp->dynticks_nesting++;
235 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 271 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
236 local_irq_restore(flags); 272 local_irq_restore(flags);
237 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 273 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
238} 274}
@@ -251,7 +287,7 @@ void rcu_nmi_enter(void)
251 if (rdtp->dynticks & 0x1) 287 if (rdtp->dynticks & 0x1)
252 return; 288 return;
253 rdtp->dynticks_nmi++; 289 rdtp->dynticks_nmi++;
254 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); 290 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
255 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 291 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
256} 292}
257 293
@@ -270,7 +306,7 @@ void rcu_nmi_exit(void)
270 return; 306 return;
271 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 307 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
272 rdtp->dynticks_nmi++; 308 rdtp->dynticks_nmi++;
273 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); 309 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
274} 310}
275 311
276/** 312/**
@@ -286,7 +322,7 @@ void rcu_irq_enter(void)
286 if (rdtp->dynticks_nesting++) 322 if (rdtp->dynticks_nesting++)
287 return; 323 return;
288 rdtp->dynticks++; 324 rdtp->dynticks++;
289 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 325 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
290 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 326 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
291} 327}
292 328
@@ -305,10 +341,10 @@ void rcu_irq_exit(void)
305 return; 341 return;
306 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 342 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
307 rdtp->dynticks++; 343 rdtp->dynticks++;
308 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 344 WARN_ON_ONCE(rdtp->dynticks & 0x1);
309 345
310 /* If the interrupt queued a callback, get out of dyntick mode. */ 346 /* If the interrupt queued a callback, get out of dyntick mode. */
311 if (__get_cpu_var(rcu_data).nxtlist || 347 if (__get_cpu_var(rcu_sched_data).nxtlist ||
312 __get_cpu_var(rcu_bh_data).nxtlist) 348 __get_cpu_var(rcu_bh_data).nxtlist)
313 set_need_resched(); 349 set_need_resched();
314} 350}
@@ -461,6 +497,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
461 497
462 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 498 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
463 for (; rnp_cur < rnp_end; rnp_cur++) { 499 for (; rnp_cur < rnp_end; rnp_cur++) {
500 rcu_print_task_stall(rnp);
464 if (rnp_cur->qsmask == 0) 501 if (rnp_cur->qsmask == 0)
465 continue; 502 continue;
466 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 503 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
@@ -469,6 +506,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 } 506 }
470 printk(" (detected by %d, t=%ld jiffies)\n", 507 printk(" (detected by %d, t=%ld jiffies)\n",
471 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 508 smp_processor_id(), (long)(jiffies - rsp->gp_start));
509 trigger_all_cpu_backtrace();
510
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 511 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 512}
474 513
@@ -479,12 +518,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
479 518
480 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 519 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
481 smp_processor_id(), jiffies - rsp->gp_start); 520 smp_processor_id(), jiffies - rsp->gp_start);
482 dump_stack(); 521 trigger_all_cpu_backtrace();
522
483 spin_lock_irqsave(&rnp->lock, flags); 523 spin_lock_irqsave(&rnp->lock, flags);
484 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 524 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
485 rsp->jiffies_stall = 525 rsp->jiffies_stall =
486 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 526 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
487 spin_unlock_irqrestore(&rnp->lock, flags); 527 spin_unlock_irqrestore(&rnp->lock, flags);
528
488 set_need_resched(); /* kick ourselves to get things going. */ 529 set_need_resched(); /* kick ourselves to get things going. */
489} 530}
490 531
@@ -674,6 +715,19 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
674} 715}
675 716
676/* 717/*
718 * Clean up after the prior grace period and let rcu_start_gp() start up
719 * the next grace period if one is needed. Note that the caller must
720 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
721 */
722static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
723 __releases(rnp->lock)
724{
725 rsp->completed = rsp->gpnum;
726 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
727 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
728}
729
730/*
677 * Similar to cpu_quiet(), for which it is a helper function. Allows 731 * Similar to cpu_quiet(), for which it is a helper function. Allows
678 * a group of CPUs to be quieted at one go, though all the CPUs in the 732 * a group of CPUs to be quieted at one go, though all the CPUs in the
679 * group must be represented by the same leaf rcu_node structure. 733 * group must be represented by the same leaf rcu_node structure.
@@ -694,7 +748,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
694 return; 748 return;
695 } 749 }
696 rnp->qsmask &= ~mask; 750 rnp->qsmask &= ~mask;
697 if (rnp->qsmask != 0) { 751 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
698 752
699 /* Other bits still set at this level, so done. */ 753 /* Other bits still set at this level, so done. */
700 spin_unlock_irqrestore(&rnp->lock, flags); 754 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -714,14 +768,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
714 768
715 /* 769 /*
716 * Get here if we are the last CPU to pass through a quiescent 770 * Get here if we are the last CPU to pass through a quiescent
717 * state for this grace period. Clean up and let rcu_start_gp() 771 * state for this grace period. Invoke cpu_quiet_msk_finish()
718 * start up the next grace period if one is needed. Note that 772 * to clean up and start the next grace period if one is needed.
719 * we still hold rnp->lock, as required by rcu_start_gp(), which
720 * will release it.
721 */ 773 */
722 rsp->completed = rsp->gpnum; 774 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
723 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
724 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
725} 775}
726 776
727/* 777/*
@@ -828,11 +878,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
828 spin_lock(&rnp->lock); /* irqs already disabled. */ 878 spin_lock(&rnp->lock); /* irqs already disabled. */
829 rnp->qsmaskinit &= ~mask; 879 rnp->qsmaskinit &= ~mask;
830 if (rnp->qsmaskinit != 0) { 880 if (rnp->qsmaskinit != 0) {
831 spin_unlock(&rnp->lock); /* irqs already disabled. */ 881 spin_unlock(&rnp->lock); /* irqs remain disabled. */
832 break; 882 break;
833 } 883 }
884 rcu_preempt_offline_tasks(rsp, rnp);
834 mask = rnp->grpmask; 885 mask = rnp->grpmask;
835 spin_unlock(&rnp->lock); /* irqs already disabled. */ 886 spin_unlock(&rnp->lock); /* irqs remain disabled. */
836 rnp = rnp->parent; 887 rnp = rnp->parent;
837 } while (rnp != NULL); 888 } while (rnp != NULL);
838 lastcomp = rsp->completed; 889 lastcomp = rsp->completed;
@@ -845,7 +896,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
845 /* 896 /*
846 * Move callbacks from the outgoing CPU to the running CPU. 897 * Move callbacks from the outgoing CPU to the running CPU.
 847 * Note that the outgoing CPU is now quiescent, so it is now 898 * Note that the outgoing CPU is now quiescent, so it is now
848 * (uncharacteristically) safe to access it rcu_data structure. 899 * (uncharacteristically) safe to access its rcu_data structure.
849 * Note also that we must carefully retain the order of the 900 * Note also that we must carefully retain the order of the
850 * outgoing CPU's callbacks in order for rcu_barrier() to work 901 * outgoing CPU's callbacks in order for rcu_barrier() to work
851 * correctly. Finally, note that we start all the callbacks 902 * correctly. Finally, note that we start all the callbacks
@@ -876,8 +927,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
876 */ 927 */
877static void rcu_offline_cpu(int cpu) 928static void rcu_offline_cpu(int cpu)
878{ 929{
879 __rcu_offline_cpu(cpu, &rcu_state); 930 __rcu_offline_cpu(cpu, &rcu_sched_state);
880 __rcu_offline_cpu(cpu, &rcu_bh_state); 931 __rcu_offline_cpu(cpu, &rcu_bh_state);
932 rcu_preempt_offline_cpu(cpu);
881} 933}
882 934
883#else /* #ifdef CONFIG_HOTPLUG_CPU */ 935#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -963,6 +1015,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
963 */ 1015 */
964void rcu_check_callbacks(int cpu, int user) 1016void rcu_check_callbacks(int cpu, int user)
965{ 1017{
1018 if (!rcu_pending(cpu))
1019 return; /* if nothing for RCU to do. */
966 if (user || 1020 if (user ||
967 (idle_cpu(cpu) && rcu_scheduler_active && 1021 (idle_cpu(cpu) && rcu_scheduler_active &&
968 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1022 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -971,17 +1025,16 @@ void rcu_check_callbacks(int cpu, int user)
971 * Get here if this CPU took its interrupt from user 1025 * Get here if this CPU took its interrupt from user
972 * mode or from the idle loop, and if this is not a 1026 * mode or from the idle loop, and if this is not a
973 * nested interrupt. In this case, the CPU is in 1027 * nested interrupt. In this case, the CPU is in
974 * a quiescent state, so count it. 1028 * a quiescent state, so note it.
975 * 1029 *
976 * No memory barrier is required here because both 1030 * No memory barrier is required here because both
977 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference 1031 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
978 * only CPU-local variables that other CPUs neither 1032 * variables that other CPUs neither access nor modify,
979 * access nor modify, at least not while the corresponding 1033 * at least not while the corresponding CPU is online.
980 * CPU is online.
981 */ 1034 */
982 1035
983 rcu_qsctr_inc(cpu); 1036 rcu_sched_qs(cpu);
984 rcu_bh_qsctr_inc(cpu); 1037 rcu_bh_qs(cpu);
985 1038
986 } else if (!in_softirq()) { 1039 } else if (!in_softirq()) {
987 1040
@@ -989,11 +1042,12 @@ void rcu_check_callbacks(int cpu, int user)
989 * Get here if this CPU did not take its interrupt from 1042 * Get here if this CPU did not take its interrupt from
990 * softirq, in other words, if it is not interrupting 1043 * softirq, in other words, if it is not interrupting
991 * a rcu_bh read-side critical section. This is an _bh 1044 * a rcu_bh read-side critical section. This is an _bh
992 * critical section, so count it. 1045 * critical section, so note it.
993 */ 1046 */
994 1047
995 rcu_bh_qsctr_inc(cpu); 1048 rcu_bh_qs(cpu);
996 } 1049 }
1050 rcu_preempt_check_callbacks(cpu);
997 raise_softirq(RCU_SOFTIRQ); 1051 raise_softirq(RCU_SOFTIRQ);
998} 1052}
999 1053
@@ -1132,6 +1186,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1132{ 1186{
1133 unsigned long flags; 1187 unsigned long flags;
1134 1188
1189 WARN_ON_ONCE(rdp->beenonline == 0);
1190
1135 /* 1191 /*
1136 * If an RCU GP has gone long enough, go check for dyntick 1192 * If an RCU GP has gone long enough, go check for dyntick
1137 * idle CPUs and, if needed, send resched IPIs. 1193 * idle CPUs and, if needed, send resched IPIs.
@@ -1170,8 +1226,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1170 */ 1226 */
1171 smp_mb(); /* See above block comment. */ 1227 smp_mb(); /* See above block comment. */
1172 1228
1173 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); 1229 __rcu_process_callbacks(&rcu_sched_state,
1230 &__get_cpu_var(rcu_sched_data));
1174 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1231 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1232 rcu_preempt_process_callbacks();
1175 1233
1176 /* 1234 /*
1177 * Memory references from any later RCU read-side critical sections 1235 * Memory references from any later RCU read-side critical sections
@@ -1227,13 +1285,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1227} 1285}
1228 1286
1229/* 1287/*
1230 * Queue an RCU callback for invocation after a grace period. 1288 * Queue an RCU-sched callback for invocation after a grace period.
1231 */ 1289 */
1232void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1290void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1233{ 1291{
1234 __call_rcu(head, func, &rcu_state); 1292 __call_rcu(head, func, &rcu_sched_state);
1235} 1293}
1236EXPORT_SYMBOL_GPL(call_rcu); 1294EXPORT_SYMBOL_GPL(call_rcu_sched);
1237 1295
1238/* 1296/*
1239 * Queue an RCU for invocation after a quicker grace period. 1297 * Queue an RCU for invocation after a quicker grace period.
@@ -1305,10 +1363,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1305 * by the current CPU, returning 1 if so. This function is part of the 1363 * by the current CPU, returning 1 if so. This function is part of the
1306 * RCU implementation; it is -not- an exported member of the RCU API. 1364 * RCU implementation; it is -not- an exported member of the RCU API.
1307 */ 1365 */
1308int rcu_pending(int cpu) 1366static int rcu_pending(int cpu)
1309{ 1367{
1310 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || 1368 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
1311 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); 1369 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
1370 rcu_preempt_pending(cpu);
1312} 1371}
1313 1372
1314/* 1373/*
@@ -1320,27 +1379,46 @@ int rcu_pending(int cpu)
1320int rcu_needs_cpu(int cpu) 1379int rcu_needs_cpu(int cpu)
1321{ 1380{
1322 /* RCU callbacks either ready or pending? */ 1381 /* RCU callbacks either ready or pending? */
1323 return per_cpu(rcu_data, cpu).nxtlist || 1382 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1324 per_cpu(rcu_bh_data, cpu).nxtlist; 1383 per_cpu(rcu_bh_data, cpu).nxtlist ||
1384 rcu_preempt_needs_cpu(cpu);
1325} 1385}
1326 1386
1327/* 1387/*
1328 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" 1388 * Do boot-time initialization of a CPU's per-CPU RCU data.
1329 * approach so that we don't have to worry about how long the CPU has
1330 * been gone, or whether it ever was online previously. We do trust the
1331 * ->mynode field, as it is constant for a given struct rcu_data and
1332 * initialized during early boot.
1333 *
1334 * Note that only one online or offline event can be happening at a given
1335 * time. Note also that we can accept some slop in the rsp->completed
1336 * access due to the fact that this CPU cannot possibly have any RCU
1337 * callbacks in flight yet.
1338 */ 1389 */
1339static void __cpuinit 1390static void __init
1340rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 1391rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1341{ 1392{
1342 unsigned long flags; 1393 unsigned long flags;
1343 int i; 1394 int i;
1395 struct rcu_data *rdp = rsp->rda[cpu];
1396 struct rcu_node *rnp = rcu_get_root(rsp);
1397
1398 /* Set up local state, ensuring consistent view of global state. */
1399 spin_lock_irqsave(&rnp->lock, flags);
1400 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1401 rdp->nxtlist = NULL;
1402 for (i = 0; i < RCU_NEXT_SIZE; i++)
1403 rdp->nxttail[i] = &rdp->nxtlist;
1404 rdp->qlen = 0;
1405#ifdef CONFIG_NO_HZ
1406 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1407#endif /* #ifdef CONFIG_NO_HZ */
1408 rdp->cpu = cpu;
1409 spin_unlock_irqrestore(&rnp->lock, flags);
1410}
1411
1412/*
1413 * Initialize a CPU's per-CPU RCU data. Note that only one online or
1414 * offline event can be happening at a given time. Note also that we
1415 * can accept some slop in the rsp->completed access due to the fact
1416 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1417 */
1418static void __cpuinit
1419rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1420{
1421 unsigned long flags;
1344 long lastcomp; 1422 long lastcomp;
1345 unsigned long mask; 1423 unsigned long mask;
1346 struct rcu_data *rdp = rsp->rda[cpu]; 1424 struct rcu_data *rdp = rsp->rda[cpu];
@@ -1354,17 +1432,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1354 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1432 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1355 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1433 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1356 rdp->beenonline = 1; /* We have now been online. */ 1434 rdp->beenonline = 1; /* We have now been online. */
1435 rdp->preemptable = preemptable;
1357 rdp->passed_quiesc_completed = lastcomp - 1; 1436 rdp->passed_quiesc_completed = lastcomp - 1;
1358 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1359 rdp->nxtlist = NULL;
1360 for (i = 0; i < RCU_NEXT_SIZE; i++)
1361 rdp->nxttail[i] = &rdp->nxtlist;
1362 rdp->qlen = 0;
1363 rdp->blimit = blimit; 1437 rdp->blimit = blimit;
1364#ifdef CONFIG_NO_HZ
1365 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1366#endif /* #ifdef CONFIG_NO_HZ */
1367 rdp->cpu = cpu;
1368 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1438 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1369 1439
1370 /* 1440 /*
@@ -1405,16 +1475,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1405 1475
1406static void __cpuinit rcu_online_cpu(int cpu) 1476static void __cpuinit rcu_online_cpu(int cpu)
1407{ 1477{
1408 rcu_init_percpu_data(cpu, &rcu_state); 1478 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1409 rcu_init_percpu_data(cpu, &rcu_bh_state); 1479 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
1410 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1480 rcu_preempt_init_percpu_data(cpu);
1411} 1481}
1412 1482
1413/* 1483/*
1414 * Handle CPU online/offline notifcation events. 1484 * Handle CPU online/offline notification events.
1415 */ 1485 */
1416static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1486int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1417 unsigned long action, void *hcpu) 1487 unsigned long action, void *hcpu)
1418{ 1488{
1419 long cpu = (long)hcpu; 1489 long cpu = (long)hcpu;
1420 1490
@@ -1486,6 +1556,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1486 rnp = rsp->level[i]; 1556 rnp = rsp->level[i];
1487 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1557 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1488 spin_lock_init(&rnp->lock); 1558 spin_lock_init(&rnp->lock);
1559 rnp->gpnum = 0;
1489 rnp->qsmask = 0; 1560 rnp->qsmask = 0;
1490 rnp->qsmaskinit = 0; 1561 rnp->qsmaskinit = 0;
1491 rnp->grplo = j * cpustride; 1562 rnp->grplo = j * cpustride;
@@ -1503,16 +1574,20 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1503 j / rsp->levelspread[i - 1]; 1574 j / rsp->levelspread[i - 1];
1504 } 1575 }
1505 rnp->level = i; 1576 rnp->level = i;
1577 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1578 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1506 } 1579 }
1507 } 1580 }
1508} 1581}
1509 1582
1510/* 1583/*
1511 * Helper macro for __rcu_init(). To be used nowhere else! 1584 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1512 * Assigns leaf node pointers into each CPU's rcu_data structure. 1585 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1586 * structure.
1513 */ 1587 */
1514#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ 1588#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1515do { \ 1589do { \
1590 rcu_init_one(rsp); \
1516 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1591 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1517 j = 0; \ 1592 j = 0; \
1518 for_each_possible_cpu(i) { \ 1593 for_each_possible_cpu(i) { \
@@ -1520,32 +1595,43 @@ do { \
1520 j++; \ 1595 j++; \
1521 per_cpu(rcu_data, i).mynode = &rnp[j]; \ 1596 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1522 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1597 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1598 rcu_boot_init_percpu_data(i, rsp); \
1523 } \ 1599 } \
1524} while (0) 1600} while (0)
1525 1601
1526static struct notifier_block __cpuinitdata rcu_nb = { 1602#ifdef CONFIG_TREE_PREEMPT_RCU
1527 .notifier_call = rcu_cpu_notify, 1603
1528}; 1604void __init __rcu_init_preempt(void)
1605{
1606 int i; /* All used by RCU_INIT_FLAVOR(). */
1607 int j;
1608 struct rcu_node *rnp;
1609
1610 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1611}
1612
1613#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1614
1615void __init __rcu_init_preempt(void)
1616{
1617}
1618
1619#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1529 1620
1530void __init __rcu_init(void) 1621void __init __rcu_init(void)
1531{ 1622{
1532 int i; /* All used by RCU_DATA_PTR_INIT(). */ 1623 int i; /* All used by RCU_INIT_FLAVOR(). */
1533 int j; 1624 int j;
1534 struct rcu_node *rnp; 1625 struct rcu_node *rnp;
1535 1626
1536 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 1627 rcu_bootup_announce();
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1628#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1629 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1630#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1540 rcu_init_one(&rcu_state); 1631 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1541 RCU_DATA_PTR_INIT(&rcu_state, rcu_data); 1632 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1542 rcu_init_one(&rcu_bh_state); 1633 __rcu_init_preempt();
1543 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); 1634 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1544
1545 for_each_online_cpu(i)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb);
1549} 1635}
1550 1636
1551module_param(blimit, int, 0); 1637module_param(blimit, int, 0);
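For reference, a self-contained sketch of the hierarchical reporting that cpu_quiet_msk() implements above: clear the reporting CPU's bit in its leaf node's qsmask and, whenever a node's mask empties, clear that node's bit in its parent, until the root empties and the grace period can end. Everything below is an illustrative userspace model, not the kernel code.

/* Illustrative userspace model of quiescent-state propagation up the tree. */
#include <stdio.h>

struct toy_node {
        unsigned long qsmask;     /* bits still blocking this grace period */
        unsigned long grpmask;    /* this node's bit in its parent's qsmask */
        struct toy_node *parent;
};

/* Report mask's quiescent state(s), propagating upward as masks empty. */
static void toy_cpu_quiet_msk(unsigned long mask, struct toy_node *rnp)
{
        for (;;) {
                rnp->qsmask &= ~mask;
                if (rnp->qsmask != 0 || rnp->parent == NULL)
                        break;              /* still waiting, or hit the root */
                mask = rnp->grpmask;        /* propagate to the parent node */
                rnp = rnp->parent;
        }
        if (rnp->parent == NULL && rnp->qsmask == 0)
                printf("grace period can end\n");
}

int main(void)
{
        struct toy_node root  = { .qsmask = 0x3 };
        struct toy_node leaf0 = { .qsmask = 0x3, .grpmask = 0x1, .parent = &root };
        struct toy_node leaf1 = { .qsmask = 0x1, .grpmask = 0x2, .parent = &root };

        toy_cpu_quiet_msk(0x1, &leaf0);   /* CPU 0 reports */
        toy_cpu_quiet_msk(0x2, &leaf0);   /* CPU 1 reports, leaf0 empties */
        toy_cpu_quiet_msk(0x1, &leaf1);   /* CPU 2 reports, root empties */
        return 0;
}

The kernel version additionally refuses to treat a node as done while rcu_preempted_readers(rnp) still reports blocked readers, which is exactly the extra test added to cpu_quiet_msk() above.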
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5e872bbf07f5..bf8a6f9f134d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -1,10 +1,259 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright IBM Corporation, 2008
20 *
21 * Author: Ingo Molnar <mingo@elte.hu>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#include <linux/cache.h>
26#include <linux/spinlock.h>
27#include <linux/threads.h>
28#include <linux/cpumask.h>
29#include <linux/seqlock.h>
30
31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere.
36 */
37#define MAX_RCU_LVLS 3
38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41
42#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1
44# define NUM_RCU_LVL_0 1
45# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0
48#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
59# define NUM_RCU_LVL_3 NR_CPUS
60#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66
67/*
68 * Dynticks per-CPU state.
69 */
70struct rcu_dynticks {
71 int dynticks_nesting; /* Track nesting level, sort of. */
72 int dynticks; /* Even value for dynticks-idle, else odd. */
73 int dynticks_nmi; /* Even value for either dynticks-idle or */
74 /* not in nmi handler, else odd. So this */
75 /* remains even for nmi from irq handler. */
76};
77
78/*
79 * Definition for node within the RCU grace-period-detection hierarchy.
80 */
81struct rcu_node {
82 spinlock_t lock;
83 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/
88 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */
91 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */
94 u8 level; /* root is at level 0. */
95 struct rcu_node *parent;
96 struct list_head blocked_tasks[2];
97 /* Tasks blocked in RCU read-side critsect. */
98} ____cacheline_internodealigned_in_smp;
99
100/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
103#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
104#define RCU_NEXT_TAIL 3
105#define RCU_NEXT_SIZE 4
106
107/* Per-CPU data for read-copy update. */
108struct rcu_data {
109 /* 1) quiescent-state and grace-period handling : */
110 long completed; /* Track rsp->completed gp number */
111 /* in order to detect GP end. */
112 long gpnum; /* Highest gp number that this CPU */
113 /* is aware of having started. */
114 long passed_quiesc_completed;
115 /* Value of completed at time of qs. */
116 bool passed_quiesc; /* User-mode/idle loop etc. */
117 bool qs_pending; /* Core waits for quiesc state. */
118 bool beenonline; /* CPU online at least once. */
119 bool preemptable; /* Preemptable RCU? */
120 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
121 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
122
123 /* 2) batch handling */
124 /*
125 * If nxtlist is not NULL, it is partitioned as follows.
126 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL.
130 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks().
142 */
143 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */
146 long blimit; /* Upper limit on a processed batch */
147
148#ifdef CONFIG_NO_HZ
149 /* 3) dynticks interface. */
150 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
151 int dynticks_snap; /* Per-GP tracking for dynticks. */
152 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
153#endif /* #ifdef CONFIG_NO_HZ */
154
155 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
156#ifdef CONFIG_NO_HZ
157 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
158#endif /* #ifdef CONFIG_NO_HZ */
159 unsigned long offline_fqs; /* Kicked due to being offline. */
160 unsigned long resched_ipi; /* Sent a resched IPI. */
161
162 /* 5) __rcu_pending() statistics. */
163 long n_rcu_pending; /* rcu_pending() calls since boot. */
164 long n_rp_qs_pending;
165 long n_rp_cb_ready;
166 long n_rp_cpu_needs_gp;
167 long n_rp_gp_completed;
168 long n_rp_gp_started;
169 long n_rp_need_fqs;
170 long n_rp_need_nothing;
171
172 int cpu;
173};
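For reference, a toy userspace model of the nxtlist/nxttail[] layout documented in the struct above: one singly linked list of callbacks partitioned by an array of tail pointer-pointers, so whole segments can advance without walking the list. Names below are simplified assumptions, not the kernel's.

/* Illustrative only: a minimal segmented callback list. */
#include <stdio.h>

struct toy_head { struct toy_head *next; };

#define TOY_SEGS 4

struct toy_cblist {
        struct toy_head *list;                 /* like rdp->nxtlist */
        struct toy_head **tail[TOY_SEGS];      /* like rdp->nxttail[] */
};

static void toy_cblist_init(struct toy_cblist *cl)
{
        int i;

        cl->list = NULL;
        for (i = 0; i < TOY_SEGS; i++)
                cl->tail[i] = &cl->list;       /* empty: all tails at the head */
}

/* Enqueue a new callback into the last ("next") segment. */
static void toy_cblist_enqueue(struct toy_cblist *cl, struct toy_head *cb)
{
        cb->next = NULL;
        *cl->tail[TOY_SEGS - 1] = cb;
        cl->tail[TOY_SEGS - 1] = &cb->next;
}

int main(void)
{
        struct toy_cblist cl;
        struct toy_head a, b;
        struct toy_head *p;
        int n = 0;

        toy_cblist_init(&cl);
        toy_cblist_enqueue(&cl, &a);
        toy_cblist_enqueue(&cl, &b);
        for (p = cl.list; p != NULL; p = p->next)
                n++;
        printf("%d callbacks queued\n", n);
        return 0;
}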
174
175/* Values for signaled field in struct rcu_state. */
176#define RCU_GP_INIT 0 /* Grace period being initialized. */
177#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
178#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
179#ifdef CONFIG_NO_HZ
180#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
181#else /* #ifdef CONFIG_NO_HZ */
182#define RCU_SIGNAL_INIT RCU_FORCE_QS
183#endif /* #else #ifdef CONFIG_NO_HZ */
184
185#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
188#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
189#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
190 /* to take at least one */
191 /* scheduling clock irq */
192 /* before ratting on them. */
193
194#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
195
196/*
197 * RCU global state, including node hierarchy. This hierarchy is
198 * represented in "heap" form in a dense array. The root (first level)
199 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
200 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
201 * and the third level in ->node[m+1] and following (->node[m+1] referenced
202 * by ->level[2]). The number of levels is determined by the number of
203 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
204 * consisting of a single rcu_node.
205 */
206struct rcu_state {
207 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
208 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
209 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
210 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
211 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
212
213 /* The following fields are guarded by the root rcu_node's lock. */
214
215 u8 signaled ____cacheline_internodealigned_in_smp;
216 /* Force QS state. */
217 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */
219 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */
221 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */
224 /* force_quiescent_state(). */
225 unsigned long n_force_qs; /* Number of calls to */
226 /* force_quiescent_state(). */
227 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
228 /* due to lock unavailable. */
229 unsigned long n_force_qs_ngp; /* Number of calls leaving */
230 /* due to no GP active. */
231#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
232 unsigned long gp_start; /* Time at which GP started, */
233 /* but in jiffies. */
234 unsigned long jiffies_stall; /* Time at which to check */
235 /* for CPU stalls. */
236#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237#ifdef CONFIG_NO_HZ
238 long dynticks_completed; /* Value of completed @ snap. */
239#endif /* #ifdef CONFIG_NO_HZ */
240};
241
242#ifdef RCU_TREE_NONCORE
1 243
2/* 244/*
3 * RCU implementation internal declarations: 245 * RCU implementation internal declarations:
4 */ 246 */
5extern struct rcu_state rcu_state; 247extern struct rcu_state rcu_sched_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data); 248DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
7 249
8extern struct rcu_state rcu_bh_state; 250extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 251DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10 252
253#ifdef CONFIG_TREE_PREEMPT_RCU
254extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257
258#endif /* #ifdef RCU_TREE_NONCORE */
259
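For reference, the NUM_RCU_LVL_* arithmetic above can be re-derived at run time; the sketch below is an illustrative userspace translation (assuming at most three levels, as MAX_RCU_LVLS declares), not part of the kernel build.

/* Illustrative only: count rcu_node structures for a given geometry. */
#include <stdio.h>

static int toy_num_rcu_nodes(int nr_cpus, int fanout)
{
        int sq = fanout * fanout;
        int cube = sq * fanout;

        if (nr_cpus <= fanout)                    /* one level: just the root */
                return 1;
        if (nr_cpus <= sq)                        /* root + leaves */
                return 1 + (nr_cpus + fanout - 1) / fanout;
        if (nr_cpus <= cube)                      /* root + middle + leaves */
                return 1 + (nr_cpus + sq - 1) / sq
                         + (nr_cpus + fanout - 1) / fanout;
        return -1;                                /* fanout insufficient */
}

int main(void)
{
        printf("64 CPUs, fanout 64: %d node(s)\n", toy_num_rcu_nodes(64, 64));
        printf("4096 CPUs, fanout 64: %d node(s)\n", toy_num_rcu_nodes(4096, 64));
        return 0;
}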
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
new file mode 100644
index 000000000000..47789369ea59
--- /dev/null
+++ b/kernel/rcutree_plugin.h
@@ -0,0 +1,532 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009
22 *
23 * Author: Ingo Molnar <mingo@elte.hu>
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */
26
27
28#ifdef CONFIG_TREE_PREEMPT_RCU
29
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32
33/*
34 * Tell them what RCU they are running.
35 */
36static inline void rcu_bootup_announce(void)
37{
38 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n");
40}
41
42/*
43 * Return the number of RCU-preempt batches processed thus far
44 * for debug and statistics.
45 */
46long rcu_batches_completed_preempt(void)
47{
48 return rcu_preempt_state.completed;
49}
50EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
51
52/*
53 * Return the number of RCU batches processed thus far for debug & stats.
54 */
55long rcu_batches_completed(void)
56{
57 return rcu_batches_completed_preempt();
58}
59EXPORT_SYMBOL_GPL(rcu_batches_completed);
60
61/*
62 * Record a preemptable-RCU quiescent state for the specified CPU. Note
63 * that this just means that the task currently running on the CPU is
64 * not in a quiescent state. There might be any number of tasks blocked
65 * while in an RCU read-side critical section.
66 */
67static void rcu_preempt_qs_record(int cpu)
68{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc = 1;
71 rdp->passed_quiesc_completed = rdp->completed;
72}
73
74/*
75 * We have entered the scheduler or are between softirqs in ksoftirqd.
76 * If we are in an RCU read-side critical section, we need to reflect
77 * that in the state of the rcu_node structure corresponding to this CPU.
78 * Caller must disable hardirqs.
79 */
80static void rcu_preempt_qs(int cpu)
81{
82 struct task_struct *t = current;
83 int phase;
84 struct rcu_data *rdp;
85 struct rcu_node *rnp;
86
87 if (t->rcu_read_lock_nesting &&
88 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
89
90 /* Possibly blocking in an RCU read-side critical section. */
91 rdp = rcu_preempt_state.rda[cpu];
92 rnp = rdp->mynode;
93 spin_lock(&rnp->lock);
94 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
95 t->rcu_blocked_node = rnp;
96
97 /*
98 * If this CPU has already checked in, then this task
99 * will hold up the next grace period rather than the
100 * current grace period. Queue the task accordingly.
101 * If the task is queued for the current grace period
102 * (i.e., this CPU has not yet passed through a quiescent
103 * state for the current grace period), then as long
104 * as that task remains queued, the current grace period
105 * cannot end.
106 */
107 phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1);
108 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
109 smp_mb(); /* Ensure later ctxt swtch seen after above. */
110 spin_unlock(&rnp->lock);
111 }
112
113 /*
114 * Either we were not in an RCU read-side critical section to
115 * begin with, or we have now recorded that critical section
116 * globally. Either way, we can now note a quiescent state
117 * for this CPU. Again, if we were in an RCU read-side critical
118 * section, and if that critical section was blocking the current
119 * grace period, then the fact that the task has been enqueued
120 * means that we continue to block the current grace period.
121 */
122 rcu_preempt_qs_record(cpu);
123 t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS |
124 RCU_READ_UNLOCK_GOT_QS);
125}
126
127/*
128 * Tree-preemptable RCU implementation for rcu_read_lock().
129 * Just increment ->rcu_read_lock_nesting, shared state will be updated
130 * if we block.
131 */
132void __rcu_read_lock(void)
133{
134 ACCESS_ONCE(current->rcu_read_lock_nesting)++;
135 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
136}
137EXPORT_SYMBOL_GPL(__rcu_read_lock);
138
139static void rcu_read_unlock_special(struct task_struct *t)
140{
141 int empty;
142 unsigned long flags;
143 unsigned long mask;
144 struct rcu_node *rnp;
145 int special;
146
147 /* NMI handlers cannot block and cannot safely manipulate state. */
148 if (in_nmi())
149 return;
150
151 local_irq_save(flags);
152
153 /*
154 * If RCU core is waiting for this CPU to exit critical section,
155 * let it know that we have done so.
156 */
157 special = t->rcu_read_unlock_special;
158 if (special & RCU_READ_UNLOCK_NEED_QS) {
159 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS;
161 }
162
163 /* Hardware IRQ handlers cannot block. */
164 if (in_irq()) {
165 local_irq_restore(flags);
166 return;
167 }
168
169 /* Clean up if blocked during RCU read-side critical section. */
170 if (special & RCU_READ_UNLOCK_BLOCKED) {
171 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
172
173 /*
174 * Remove this task from the list it blocked on. The
175 * task can migrate while we acquire the lock, but at
176 * most one time. So at most two passes through loop.
177 */
178 for (;;) {
179 rnp = t->rcu_blocked_node;
180 spin_lock(&rnp->lock);
181 if (rnp == t->rcu_blocked_node)
182 break;
183 spin_unlock(&rnp->lock);
184 }
185 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
186 list_del_init(&t->rcu_node_entry);
187 t->rcu_blocked_node = NULL;
188
189 /*
190 * If this was the last task on the current list, and if
191 * we aren't waiting on any CPUs, report the quiescent state.
192 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
193 * drop rnp->lock and restore irq.
194 */
195 if (!empty && rnp->qsmask == 0 &&
196 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
197 t->rcu_read_unlock_special &=
198 ~(RCU_READ_UNLOCK_NEED_QS |
199 RCU_READ_UNLOCK_GOT_QS);
200 if (rnp->parent == NULL) {
201 /* Only one rcu_node in the tree. */
202 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
203 return;
204 }
205 /* Report up the rest of the hierarchy. */
206 mask = rnp->grpmask;
207 spin_unlock_irqrestore(&rnp->lock, flags);
208 rnp = rnp->parent;
209 spin_lock_irqsave(&rnp->lock, flags);
210 cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags);
211 return;
212 }
213 spin_unlock(&rnp->lock);
214 }
215 local_irq_restore(flags);
216}
217
218/*
219 * Tree-preemptable RCU implementation for rcu_read_unlock().
220 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
221 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
222 * invoke rcu_read_unlock_special() to clean up after a context switch
223 * in an RCU read-side critical section and other special cases.
224 */
225void __rcu_read_unlock(void)
226{
227 struct task_struct *t = current;
228
229 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
230 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
231 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
232 rcu_read_unlock_special(t);
233}
234EXPORT_SYMBOL_GPL(__rcu_read_unlock);
235
236#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
237
238/*
239 * Scan the current list of tasks blocked within RCU read-side critical
240 * sections, printing out the tid of each.
241 */
242static void rcu_print_task_stall(struct rcu_node *rnp)
243{
244 unsigned long flags;
245 struct list_head *lp;
246 int phase = rnp->gpnum & 0x1;
247 struct task_struct *t;
248
249 if (!list_empty(&rnp->blocked_tasks[phase])) {
250 spin_lock_irqsave(&rnp->lock, flags);
251 phase = rnp->gpnum & 0x1; /* re-read under lock. */
252 lp = &rnp->blocked_tasks[phase];
253 list_for_each_entry(t, lp, rcu_node_entry)
254 printk(" P%d", t->pid);
255 spin_unlock_irqrestore(&rnp->lock, flags);
256 }
257}
258
259#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
260
261/*
262 * Check for preempted RCU readers for the specified rcu_node structure.
263 * If the caller needs a reliable answer, it must hold the rcu_node's
264 * ->lock.
265 */
266static int rcu_preempted_readers(struct rcu_node *rnp)
267{
268 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
269}
270
271#ifdef CONFIG_HOTPLUG_CPU
272
273/*
274 * Handle tasklist migration for case in which all CPUs covered by the
275 * specified rcu_node have gone offline. Move them up to the root
276 * rcu_node. The reason for not just moving them to the immediate
277 * parent is to remove the need for rcu_read_unlock_special() to
278 * make more than two attempts to acquire the target rcu_node's lock.
279 *
280 * The caller must hold rnp->lock with irqs disabled.
281 */
282static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
283 struct rcu_node *rnp)
284{
285 int i;
286 struct list_head *lp;
287 struct list_head *lp_root;
288 struct rcu_node *rnp_root = rcu_get_root(rsp);
289 struct task_struct *tp;
290
291 if (rnp == rnp_root) {
292 WARN_ONCE(1, "Last CPU thought to be offlined?");
293 return; /* Shouldn't happen: at least one CPU online. */
294 }
295
296 /*
297 * Move tasks up to root rcu_node. Rely on the fact that the
298 * root rcu_node can be at most one ahead of the rest of the
299 * rcu_nodes in terms of gp_num value. This fact allows us to
300 * move the blocked_tasks[] array directly, element by element.
301 */
302 for (i = 0; i < 2; i++) {
303 lp = &rnp->blocked_tasks[i];
304 lp_root = &rnp_root->blocked_tasks[i];
305 while (!list_empty(lp)) {
306 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
307 spin_lock(&rnp_root->lock); /* irqs already disabled */
308 list_del(&tp->rcu_node_entry);
309 tp->rcu_blocked_node = rnp_root;
310 list_add(&tp->rcu_node_entry, lp_root);
311 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
312 }
313 }
314}
315
316/*
317 * Do CPU-offline processing for preemptable RCU.
318 */
319static void rcu_preempt_offline_cpu(int cpu)
320{
321 __rcu_offline_cpu(cpu, &rcu_preempt_state);
322}
323
324#endif /* #ifdef CONFIG_HOTPLUG_CPU */
325
326/*
327 * Check for a quiescent state from the current CPU. When a task blocks,
328 * the task is recorded in the corresponding CPU's rcu_node structure,
329 * which is checked elsewhere.
330 *
331 * Caller must disable hard irqs.
332 */
333static void rcu_preempt_check_callbacks(int cpu)
334{
335 struct task_struct *t = current;
336
337 if (t->rcu_read_lock_nesting == 0) {
338 t->rcu_read_unlock_special &=
339 ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS);
340 rcu_preempt_qs_record(cpu);
341 return;
342 }
343 if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
344 if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) {
345 rcu_preempt_qs_record(cpu);
346 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
347 } else if (!(t->rcu_read_unlock_special &
348 RCU_READ_UNLOCK_NEED_QS)) {
349 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
350 }
351 }
352}
353
354/*
355 * Process callbacks for preemptable RCU.
356 */
357static void rcu_preempt_process_callbacks(void)
358{
359 __rcu_process_callbacks(&rcu_preempt_state,
360 &__get_cpu_var(rcu_preempt_data));
361}
362
363/*
364 * Queue a preemptable-RCU callback for invocation after a grace period.
365 */
366void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
367{
368 __call_rcu(head, func, &rcu_preempt_state);
369}
370EXPORT_SYMBOL_GPL(call_rcu);
371
372/*
373 * Check to see if there is any immediate preemptable-RCU-related work
374 * to be done.
375 */
376static int rcu_preempt_pending(int cpu)
377{
378 return __rcu_pending(&rcu_preempt_state,
379 &per_cpu(rcu_preempt_data, cpu));
380}
381
382/*
383 * Does preemptable RCU need the CPU to stay out of dynticks mode?
384 */
385static int rcu_preempt_needs_cpu(int cpu)
386{
387 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
388}
389
390/*
391 * Initialize preemptable RCU's per-CPU data.
392 */
393static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
394{
395 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
396}
397
398/*
399 * Check for a task exiting while in a preemptable-RCU read-side
400 * critical section, and clean up if so. No need to issue warnings,
401 * as debug_check_no_locks_held() already does this if lockdep
402 * is enabled.
403 */
404void exit_rcu(void)
405{
406 struct task_struct *t = current;
407
408 if (t->rcu_read_lock_nesting == 0)
409 return;
410 t->rcu_read_lock_nesting = 1;
411 rcu_read_unlock();
412}
413
414#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
415
416/*
417 * Tell them what RCU they are running.
418 */
419static inline void rcu_bootup_announce(void)
420{
421 printk(KERN_INFO "Hierarchical RCU implementation.\n");
422}
423
424/*
425 * Return the number of RCU batches processed thus far for debug & stats.
426 */
427long rcu_batches_completed(void)
428{
429 return rcu_batches_completed_sched();
430}
431EXPORT_SYMBOL_GPL(rcu_batches_completed);
432
433/*
434 * Because preemptable RCU does not exist, we never have to check for
435 * CPUs being in quiescent states.
436 */
437static void rcu_preempt_qs(int cpu)
438{
439}
440
441#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
442
443/*
444 * Because preemptable RCU does not exist, we never have to check for
445 * tasks blocked within RCU read-side critical sections.
446 */
447static void rcu_print_task_stall(struct rcu_node *rnp)
448{
449}
450
451#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
452
453/*
454 * Because preemptable RCU does not exist, there are never any preempted
455 * RCU readers.
456 */
457static int rcu_preempted_readers(struct rcu_node *rnp)
458{
459 return 0;
460}
461
462#ifdef CONFIG_HOTPLUG_CPU
463
464/*
465 * Because preemptable RCU does not exist, it never needs to migrate
466 * tasks that were blocked within RCU read-side critical sections.
467 */
468static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
469 struct rcu_node *rnp)
470{
471}
472
473/*
474 * Because preemptable RCU does not exist, it never needs CPU-offline
475 * processing.
476 */
477static void rcu_preempt_offline_cpu(int cpu)
478{
479}
480
481#endif /* #ifdef CONFIG_HOTPLUG_CPU */
482
483/*
484 * Because preemptable RCU does not exist, it never has any callbacks
485 * to check.
486 */
487void rcu_preempt_check_callbacks(int cpu)
488{
489}
490
491/*
492 * Because preemptable RCU does not exist, it never has any callbacks
493 * to process.
494 */
495void rcu_preempt_process_callbacks(void)
496{
497}
498
499/*
500 * In classic RCU, call_rcu() is just call_rcu_sched().
501 */
502void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
503{
504 call_rcu_sched(head, func);
505}
506EXPORT_SYMBOL_GPL(call_rcu);
507
508/*
509 * Because preemptable RCU does not exist, it never has any work to do.
510 */
511static int rcu_preempt_pending(int cpu)
512{
513 return 0;
514}
515
516/*
517 * Because preemptable RCU does not exist, it never needs any CPU.
518 */
519static int rcu_preempt_needs_cpu(int cpu)
520{
521 return 0;
522}
523
524/*
525 * Because preemptable RCU does not exist, there is no per-CPU
526 * data to initialize.
527 */
528static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
529{
530}
531
532#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index fe1dcdbf1ca3..0ea1bff69727 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,7 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE
46#include "rcutree.h" 47#include "rcutree.h"
47 48
48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
@@ -76,8 +77,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
76 77
77static int show_rcudata(struct seq_file *m, void *unused) 78static int show_rcudata(struct seq_file *m, void *unused)
78{ 79{
79 seq_puts(m, "rcu:\n"); 80#ifdef CONFIG_TREE_PREEMPT_RCU
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); 81 seq_puts(m, "rcu_preempt:\n");
82 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
83#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
84 seq_puts(m, "rcu_sched:\n");
85 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n"); 86 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 87 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0; 88 return 0;
@@ -102,7 +107,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
102 return; 107 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
104 rdp->cpu, 109 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
106 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed, 112 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending); 113 rdp->qs_pending);
@@ -124,8 +129,12 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
124 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
125#endif /* #ifdef CONFIG_NO_HZ */ 130#endif /* #ifdef CONFIG_NO_HZ */
126 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
127 seq_puts(m, "\"rcu:\"\n"); 132#ifdef CONFIG_TREE_PREEMPT_RCU
128 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); 133 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
135#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
136 seq_puts(m, "\"rcu_sched:\"\n");
137 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
129 seq_puts(m, "\"rcu_bh:\"\n"); 138 seq_puts(m, "\"rcu_bh:\"\n");
130 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); 139 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
131 return 0; 140 return 0;
@@ -171,8 +180,12 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 180
172static int show_rcuhier(struct seq_file *m, void *unused) 181static int show_rcuhier(struct seq_file *m, void *unused)
173{ 182{
174 seq_puts(m, "rcu:\n"); 183#ifdef CONFIG_TREE_PREEMPT_RCU
175 print_one_rcu_state(m, &rcu_state); 184 seq_puts(m, "rcu_preempt:\n");
185 print_one_rcu_state(m, &rcu_preempt_state);
186#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
187 seq_puts(m, "rcu_sched:\n");
188 print_one_rcu_state(m, &rcu_sched_state);
176 seq_puts(m, "rcu_bh:\n"); 189 seq_puts(m, "rcu_bh:\n");
177 print_one_rcu_state(m, &rcu_bh_state); 190 print_one_rcu_state(m, &rcu_bh_state);
178 return 0; 191 return 0;
@@ -193,8 +206,12 @@ static struct file_operations rcuhier_fops = {
193 206
194static int show_rcugp(struct seq_file *m, void *unused) 207static int show_rcugp(struct seq_file *m, void *unused)
195{ 208{
196 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", 209#ifdef CONFIG_TREE_PREEMPT_RCU
197 rcu_state.completed, rcu_state.gpnum); 210 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n",
211 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
212#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
213 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n",
214 rcu_sched_state.completed, rcu_sched_state.gpnum);
198 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 215 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
199 rcu_bh_state.completed, rcu_bh_state.gpnum); 216 rcu_bh_state.completed, rcu_bh_state.gpnum);
200 return 0; 217 return 0;
@@ -243,8 +260,12 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
243 260
244static int show_rcu_pending(struct seq_file *m, void *unused) 261static int show_rcu_pending(struct seq_file *m, void *unused)
245{ 262{
246 seq_puts(m, "rcu:\n"); 263#ifdef CONFIG_TREE_PREEMPT_RCU
247 print_rcu_pendings(m, &rcu_state); 264 seq_puts(m, "rcu_preempt:\n");
265 print_rcu_pendings(m, &rcu_preempt_state);
266#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
267 seq_puts(m, "rcu_sched:\n");
268 print_rcu_pendings(m, &rcu_sched_state);
248 seq_puts(m, "rcu_bh:\n"); 269 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state); 270 print_rcu_pendings(m, &rcu_bh_state);
250 return 0; 271 return 0;
@@ -264,62 +285,47 @@ static struct file_operations rcu_pending_fops = {
264}; 285};
265 286
266static struct dentry *rcudir; 287static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272 288
273static int __init rcuclassic_trace_init(void) 289static int __init rcuclassic_trace_init(void)
274{ 290{
291 struct dentry *retval;
292
275 rcudir = debugfs_create_dir("rcu", NULL); 293 rcudir = debugfs_create_dir("rcu", NULL);
276 if (!rcudir) 294 if (!rcudir)
277 goto out; 295 goto free_out;
278 296
279 datadir = debugfs_create_file("rcudata", 0444, rcudir, 297 retval = debugfs_create_file("rcudata", 0444, rcudir,
280 NULL, &rcudata_fops); 298 NULL, &rcudata_fops);
281 if (!datadir) 299 if (!retval)
282 goto free_out; 300 goto free_out;
283 301
284 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, 302 retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
285 NULL, &rcudata_csv_fops); 303 NULL, &rcudata_csv_fops);
286 if (!datadir_csv) 304 if (!retval)
287 goto free_out; 305 goto free_out;
288 306
289 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 307 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
290 if (!gpdir) 308 if (!retval)
291 goto free_out; 309 goto free_out;
292 310
293 hierdir = debugfs_create_file("rcuhier", 0444, rcudir, 311 retval = debugfs_create_file("rcuhier", 0444, rcudir,
294 NULL, &rcuhier_fops); 312 NULL, &rcuhier_fops);
295 if (!hierdir) 313 if (!retval)
296 goto free_out; 314 goto free_out;
297 315
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, 316 retval = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops); 317 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir) 318 if (!retval)
301 goto free_out; 319 goto free_out;
302 return 0; 320 return 0;
303free_out: 321free_out:
304 if (datadir) 322 debugfs_remove_recursive(rcudir);
305 debugfs_remove(datadir);
306 if (datadir_csv)
307 debugfs_remove(datadir_csv);
308 if (gpdir)
309 debugfs_remove(gpdir);
310 debugfs_remove(rcudir);
311out:
312 return 1; 323 return 1;
313} 324}
314 325
315static void __exit rcuclassic_trace_cleanup(void) 326static void __exit rcuclassic_trace_cleanup(void)
316{ 327{
317 debugfs_remove(datadir); 328 debugfs_remove_recursive(rcudir);
318 debugfs_remove(datadir_csv);
319 debugfs_remove(gpdir);
320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
322 debugfs_remove(rcudir);
323} 329}
324 330
325 331
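The error handling above gets simpler because debugfs_remove_recursive() tears down a directory and every file beneath it in one call, so the individual file dentries no longer need to be remembered. A minimal sketch of the resulting pattern, using a hypothetical "example" directory and example_fops (both illustrative, not from this patch):

static struct dentry *dir;

static int __init example_trace_init(void)
{
        dir = debugfs_create_dir("example", NULL);
        if (!dir)
                return 1;

        /* The returned dentry is only checked, never stored... */
        if (!debugfs_create_file("stats", 0444, dir, NULL, &example_fops))
                goto free_out;
        return 0;
free_out:
        /* ...because this removes the directory and all of its files. */
        debugfs_remove_recursive(dir);
        return 1;
}

static void __exit example_trace_cleanup(void)
{
        debugfs_remove_recursive(dir);
}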
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..e27a53685ed9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -64,7 +64,6 @@
64#include <linux/tsacct_kern.h> 64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h> 65#include <linux/kprobes.h>
66#include <linux/delayacct.h> 66#include <linux/delayacct.h>
67#include <linux/reciprocal_div.h>
68#include <linux/unistd.h> 67#include <linux/unistd.h>
69#include <linux/pagemap.h> 68#include <linux/pagemap.h>
70#include <linux/hrtimer.h> 69#include <linux/hrtimer.h>
@@ -120,30 +119,8 @@
120 */ 119 */
121#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
122 121
123#ifdef CONFIG_SMP
124
125static void double_rq_lock(struct rq *rq1, struct rq *rq2); 122static void double_rq_lock(struct rq *rq1, struct rq *rq2);
126 123
127/*
128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
129 * Since cpu_power is a 'constant', we can use a reciprocal divide.
130 */
131static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
132{
133 return reciprocal_divide(load, sg->reciprocal_cpu_power);
134}
135
136/*
137 * Each time a sched group cpu_power is changed,
138 * we must compute its reciprocal value
139 */
140static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
141{
142 sg->__cpu_power += val;
143 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
144}
145#endif
146
147static inline int rt_policy(int policy) 124static inline int rt_policy(int policy)
148{ 125{
149 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -309,8 +286,8 @@ void set_tg_uid(struct user_struct *user)
309 286
310/* 287/*
311 * Root task group. 288 * Root task group.
312 * Every UID task group (including init_task_group aka UID-0) will 289 * Every UID task group (including init_task_group aka UID-0) will
313 * be a child to this group. 290 * be a child to this group.
314 */ 291 */
315struct task_group root_task_group; 292struct task_group root_task_group;
316 293
@@ -318,7 +295,7 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 295/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 296static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 297/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 298static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 299#endif /* CONFIG_FAIR_GROUP_SCHED */
323 300
324#ifdef CONFIG_RT_GROUP_SCHED 301#ifdef CONFIG_RT_GROUP_SCHED
@@ -616,6 +593,7 @@ struct rq {
616 593
617 unsigned char idle_at_tick; 594 unsigned char idle_at_tick;
618 /* For active balancing */ 595 /* For active balancing */
596 int post_schedule;
619 int active_balance; 597 int active_balance;
620 int push_cpu; 598 int push_cpu;
621 /* cpu of this runqueue: */ 599 /* cpu of this runqueue: */
@@ -626,6 +604,9 @@ struct rq {
626 604
627 struct task_struct *migration_thread; 605 struct task_struct *migration_thread;
628 struct list_head migration_queue; 606 struct list_head migration_queue;
607
608 u64 rt_avg;
609 u64 age_stamp;
629#endif 610#endif
630 611
631 /* calc_load related fields */ 612 /* calc_load related fields */
@@ -693,6 +674,7 @@ static inline int cpu_of(struct rq *rq)
693#define this_rq() (&__get_cpu_var(runqueues)) 674#define this_rq() (&__get_cpu_var(runqueues))
694#define task_rq(p) cpu_rq(task_cpu(p)) 675#define task_rq(p) cpu_rq(task_cpu(p))
695#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 676#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
677#define raw_rq() (&__raw_get_cpu_var(runqueues))
696 678
697inline void update_rq_clock(struct rq *rq) 679inline void update_rq_clock(struct rq *rq)
698{ 680{
@@ -861,6 +843,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
861unsigned int sysctl_sched_shares_thresh = 4; 843unsigned int sysctl_sched_shares_thresh = 4;
862 844
863/* 845/*
846 * period over which we average the RT time consumption, measured
847 * in ms.
848 *
849 * default: 1s
850 */
851const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
852
853/*
864 * period over which we measure -rt task cpu usage in us. 854 * period over which we measure -rt task cpu usage in us.
865 * default: 1s 855 * default: 1s
866 */ 856 */
@@ -1278,12 +1268,37 @@ void wake_up_idle_cpu(int cpu)
1278} 1268}
1279#endif /* CONFIG_NO_HZ */ 1269#endif /* CONFIG_NO_HZ */
1280 1270
1271static u64 sched_avg_period(void)
1272{
1273 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1274}
1275
1276static void sched_avg_update(struct rq *rq)
1277{
1278 s64 period = sched_avg_period();
1279
1280 while ((s64)(rq->clock - rq->age_stamp) > period) {
1281 rq->age_stamp += period;
1282 rq->rt_avg /= 2;
1283 }
1284}
1285
1286static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1287{
1288 rq->rt_avg += rt_delta;
1289 sched_avg_update(rq);
1290}
1291
1281#else /* !CONFIG_SMP */ 1292#else /* !CONFIG_SMP */
1282static void resched_task(struct task_struct *p) 1293static void resched_task(struct task_struct *p)
1283{ 1294{
1284 assert_spin_locked(&task_rq(p)->lock); 1295 assert_spin_locked(&task_rq(p)->lock);
1285 set_tsk_need_resched(p); 1296 set_tsk_need_resched(p);
1286} 1297}
1298
1299static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1300{
1301}
1287#endif /* CONFIG_SMP */ 1302#endif /* CONFIG_SMP */
1288 1303
1289#if BITS_PER_LONG == 32 1304#if BITS_PER_LONG == 32
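sched_avg_update() keeps rq->rt_avg as a decaying sum: for each elapsed sched_avg_period() (half of sysctl_sched_time_avg, i.e. 500 ms with the 1 s default) the accumulated RT time is halved. The stand-alone C sketch below mirrors just that arithmetic with plain integers; fake_rq and the 100 ms-per-period workload are made up for illustration:

#include <stdint.h>
#include <stdio.h>

#define PERIOD_NS       (500ULL * 1000 * 1000)  /* sched_avg_period() with defaults */

struct fake_rq {
        uint64_t clock;         /* "now", in ns */
        uint64_t age_stamp;     /* start of the current averaging window */
        uint64_t rt_avg;        /* decayed RT runtime, in ns */
};

/* Same shape as sched_avg_update(): halve rt_avg once per elapsed period. */
static void fake_avg_update(struct fake_rq *rq)
{
        while ((int64_t)(rq->clock - rq->age_stamp) > (int64_t)PERIOD_NS) {
                rq->age_stamp += PERIOD_NS;
                rq->rt_avg /= 2;
        }
}

static void fake_rt_avg_update(struct fake_rq *rq, uint64_t rt_delta)
{
        rq->rt_avg += rt_delta;
        fake_avg_update(rq);
}

int main(void)
{
        struct fake_rq rq = { 0, 0, 0 };
        int i;

        /* 100 ms of RT execution reported once per 500 ms period. */
        for (i = 1; i <= 4; i++) {
                rq.clock = (uint64_t)i * PERIOD_NS;
                fake_rt_avg_update(&rq, 100ULL * 1000 * 1000);
                printf("period %d: rt_avg = %llu ms\n", i,
                       (unsigned long long)(rq.rt_avg / 1000000));
        }
        return 0;
}

Because the sum is halved every period, rt_avg stays bounded at roughly twice the RT time consumed per period rather than growing without bound.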
@@ -1513,28 +1528,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1513 1528
1514#ifdef CONFIG_FAIR_GROUP_SCHED 1529#ifdef CONFIG_FAIR_GROUP_SCHED
1515 1530
1531struct update_shares_data {
1532 unsigned long rq_weight[NR_CPUS];
1533};
1534
1535static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1536
1516static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1537static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1517 1538
1518/* 1539/*
1519 * Calculate and set the cpu's group shares. 1540 * Calculate and set the cpu's group shares.
1520 */ 1541 */
1521static void 1542static void update_group_shares_cpu(struct task_group *tg, int cpu,
1522update_group_shares_cpu(struct task_group *tg, int cpu, 1543 unsigned long sd_shares,
1523 unsigned long sd_shares, unsigned long sd_rq_weight) 1544 unsigned long sd_rq_weight,
1545 struct update_shares_data *usd)
1524{ 1546{
1525 unsigned long shares; 1547 unsigned long shares, rq_weight;
1526 unsigned long rq_weight; 1548 int boost = 0;
1527
1528 if (!tg->se[cpu])
1529 return;
1530 1549
1531 rq_weight = tg->cfs_rq[cpu]->rq_weight; 1550 rq_weight = usd->rq_weight[cpu];
1551 if (!rq_weight) {
1552 boost = 1;
1553 rq_weight = NICE_0_LOAD;
1554 }
1532 1555
1533 /* 1556 /*
1534 * \Sum shares * rq_weight 1557 * \Sum_j shares_j * rq_weight_i
1535 * shares = ----------------------- 1558 * shares_i = -----------------------------
1536 * \Sum rq_weight 1559 * \Sum_j rq_weight_j
1537 *
1538 */ 1560 */
1539 shares = (sd_shares * rq_weight) / sd_rq_weight; 1561 shares = (sd_shares * rq_weight) / sd_rq_weight;
1540 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1562 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
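Written out, update_group_shares_cpu() distributes the group's shares in proportion to each CPU's runqueue weight. With S standing for sd_shares, w_i for usd->rq_weight[i] and s_i for the resulting per-CPU share (before the clamp to [MIN_SHARES, MAX_SHARES]), the comment's formula is:

        s_i = \frac{S \, w_i}{\sum_j w_j}

For example, S = 1024 spread over per-CPU weights (2048, 1024, 1024) yields shares of (512, 256, 256).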
@@ -1545,8 +1567,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1545 unsigned long flags; 1567 unsigned long flags;
1546 1568
1547 spin_lock_irqsave(&rq->lock, flags); 1569 spin_lock_irqsave(&rq->lock, flags);
1548 tg->cfs_rq[cpu]->shares = shares; 1570 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1549 1571 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1550 __set_se_shares(tg->se[cpu], shares); 1572 __set_se_shares(tg->se[cpu], shares);
1551 spin_unlock_irqrestore(&rq->lock, flags); 1573 spin_unlock_irqrestore(&rq->lock, flags);
1552 } 1574 }
@@ -1559,22 +1581,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1559 */ 1581 */
1560static int tg_shares_up(struct task_group *tg, void *data) 1582static int tg_shares_up(struct task_group *tg, void *data)
1561{ 1583{
1562 unsigned long weight, rq_weight = 0; 1584 unsigned long weight, rq_weight = 0, shares = 0;
1563 unsigned long shares = 0; 1585 struct update_shares_data *usd;
1564 struct sched_domain *sd = data; 1586 struct sched_domain *sd = data;
1587 unsigned long flags;
1565 int i; 1588 int i;
1566 1589
1590 if (!tg->se[0])
1591 return 0;
1592
1593 local_irq_save(flags);
1594 usd = &__get_cpu_var(update_shares_data);
1595
1567 for_each_cpu(i, sched_domain_span(sd)) { 1596 for_each_cpu(i, sched_domain_span(sd)) {
1597 weight = tg->cfs_rq[i]->load.weight;
1598 usd->rq_weight[i] = weight;
1599
1568 /* 1600 /*
1569 * If there are currently no tasks on the cpu pretend there 1601 * If there are currently no tasks on the cpu pretend there
1570 * is one of average load so that when a new task gets to 1602 * is one of average load so that when a new task gets to
1571 * run here it will not get delayed by group starvation. 1603 * run here it will not get delayed by group starvation.
1572 */ 1604 */
1573 weight = tg->cfs_rq[i]->load.weight;
1574 if (!weight) 1605 if (!weight)
1575 weight = NICE_0_LOAD; 1606 weight = NICE_0_LOAD;
1576 1607
1577 tg->cfs_rq[i]->rq_weight = weight;
1578 rq_weight += weight; 1608 rq_weight += weight;
1579 shares += tg->cfs_rq[i]->shares; 1609 shares += tg->cfs_rq[i]->shares;
1580 } 1610 }
@@ -1586,7 +1616,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
1586 shares = tg->shares; 1616 shares = tg->shares;
1587 1617
1588 for_each_cpu(i, sched_domain_span(sd)) 1618 for_each_cpu(i, sched_domain_span(sd))
1589 update_group_shares_cpu(tg, i, shares, rq_weight); 1619 update_group_shares_cpu(tg, i, shares, rq_weight, usd);
1620
1621 local_irq_restore(flags);
1590 1622
1591 return 0; 1623 return 0;
1592} 1624}
@@ -1616,8 +1648,14 @@ static int tg_load_down(struct task_group *tg, void *data)
1616 1648
1617static void update_shares(struct sched_domain *sd) 1649static void update_shares(struct sched_domain *sd)
1618{ 1650{
1619 u64 now = cpu_clock(raw_smp_processor_id()); 1651 s64 elapsed;
1620 s64 elapsed = now - sd->last_update; 1652 u64 now;
1653
1654 if (root_task_group_empty())
1655 return;
1656
1657 now = cpu_clock(raw_smp_processor_id());
1658 elapsed = now - sd->last_update;
1621 1659
1622 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1660 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1623 sd->last_update = now; 1661 sd->last_update = now;
@@ -1627,6 +1665,9 @@ static void update_shares(struct sched_domain *sd)
1627 1665
1628static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1666static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1629{ 1667{
1668 if (root_task_group_empty())
1669 return;
1670
1630 spin_unlock(&rq->lock); 1671 spin_unlock(&rq->lock);
1631 update_shares(sd); 1672 update_shares(sd);
1632 spin_lock(&rq->lock); 1673 spin_lock(&rq->lock);
@@ -1634,6 +1675,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1634 1675
1635static void update_h_load(long cpu) 1676static void update_h_load(long cpu)
1636{ 1677{
1678 if (root_task_group_empty())
1679 return;
1680
1637 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1681 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1638} 1682}
1639 1683
@@ -2268,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2268 } 2312 }
2269 2313
2270 /* Adjust by relative CPU power of the group */ 2314 /* Adjust by relative CPU power of the group */
2271 avg_load = sg_div_cpu_power(group, 2315 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2272 avg_load * SCHED_LOAD_SCALE);
2273 2316
2274 if (local_group) { 2317 if (local_group) {
2275 this_load = avg_load; 2318 this_load = avg_load;
@@ -2637,9 +2680,32 @@ void sched_fork(struct task_struct *p, int clone_flags)
2637 set_task_cpu(p, cpu); 2680 set_task_cpu(p, cpu);
2638 2681
2639 /* 2682 /*
2640 * Make sure we do not leak PI boosting priority to the child: 2683 * Make sure we do not leak PI boosting priority to the child.
2641 */ 2684 */
2642 p->prio = current->normal_prio; 2685 p->prio = current->normal_prio;
2686
2687 /*
2688 * Revert to default priority/policy on fork if requested.
2689 */
2690 if (unlikely(p->sched_reset_on_fork)) {
2691 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
2692 p->policy = SCHED_NORMAL;
2693
2694 if (p->normal_prio < DEFAULT_PRIO)
2695 p->prio = DEFAULT_PRIO;
2696
2697 if (PRIO_TO_NICE(p->static_prio) < 0) {
2698 p->static_prio = NICE_TO_PRIO(0);
2699 set_load_weight(p);
2700 }
2701
2702 /*
2703 * We don't need the reset flag anymore after the fork. It has
2704 * fulfilled its duty:
2705 */
2706 p->sched_reset_on_fork = 0;
2707 }
2708
2643 if (!rt_prio(p->prio)) 2709 if (!rt_prio(p->prio))
2644 p->sched_class = &fair_sched_class; 2710 p->sched_class = &fair_sched_class;
2645 2711
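The sched_reset_on_fork path above is driven from user space by OR-ing SCHED_RESET_ON_FORK into the policy passed to sched_setscheduler(); the flag parsing is in the __sched_setscheduler() hunk further down. A hypothetical user-space sketch, assuming the kernel's value of 0x40000000 for the flag where the libc headers do not yet define it:

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK     0x40000000
#endif

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        /*
         * Run this task as SCHED_FIFO, but have any children it forks
         * start over as SCHED_NORMAL with default priority.
         */
        if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp) < 0) {
                perror("sched_setscheduler");
                return 1;
        }
        return 0;
}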
@@ -2796,12 +2862,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2796{ 2862{
2797 struct mm_struct *mm = rq->prev_mm; 2863 struct mm_struct *mm = rq->prev_mm;
2798 long prev_state; 2864 long prev_state;
2799#ifdef CONFIG_SMP
2800 int post_schedule = 0;
2801
2802 if (current->sched_class->needs_post_schedule)
2803 post_schedule = current->sched_class->needs_post_schedule(rq);
2804#endif
2805 2865
2806 rq->prev_mm = NULL; 2866 rq->prev_mm = NULL;
2807 2867
@@ -2820,10 +2880,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2820 finish_arch_switch(prev); 2880 finish_arch_switch(prev);
2821 perf_counter_task_sched_in(current, cpu_of(rq)); 2881 perf_counter_task_sched_in(current, cpu_of(rq));
2822 finish_lock_switch(rq, prev); 2882 finish_lock_switch(rq, prev);
2823#ifdef CONFIG_SMP
2824 if (post_schedule)
2825 current->sched_class->post_schedule(rq);
2826#endif
2827 2883
2828 fire_sched_in_preempt_notifiers(current); 2884 fire_sched_in_preempt_notifiers(current);
2829 if (mm) 2885 if (mm)
@@ -2838,6 +2894,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2838 } 2894 }
2839} 2895}
2840 2896
2897#ifdef CONFIG_SMP
2898
2899/* assumes rq->lock is held */
2900static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2901{
2902 if (prev->sched_class->pre_schedule)
2903 prev->sched_class->pre_schedule(rq, prev);
2904}
2905
2906/* rq->lock is NOT held, but preemption is disabled */
2907static inline void post_schedule(struct rq *rq)
2908{
2909 if (rq->post_schedule) {
2910 unsigned long flags;
2911
2912 spin_lock_irqsave(&rq->lock, flags);
2913 if (rq->curr->sched_class->post_schedule)
2914 rq->curr->sched_class->post_schedule(rq);
2915 spin_unlock_irqrestore(&rq->lock, flags);
2916
2917 rq->post_schedule = 0;
2918 }
2919}
2920
2921#else
2922
2923static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2924{
2925}
2926
2927static inline void post_schedule(struct rq *rq)
2928{
2929}
2930
2931#endif
2932
2841/** 2933/**
2842 * schedule_tail - first thing a freshly forked thread must call. 2934 * schedule_tail - first thing a freshly forked thread must call.
2843 * @prev: the thread we just switched away from. 2935 * @prev: the thread we just switched away from.
@@ -2848,6 +2940,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
2848 struct rq *rq = this_rq(); 2940 struct rq *rq = this_rq();
2849 2941
2850 finish_task_switch(rq, prev); 2942 finish_task_switch(rq, prev);
2943
2944 /*
2945 * FIXME: do we need to worry about rq being invalidated by the
2946 * task_switch?
2947 */
2948 post_schedule(rq);
2949
2851#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2950#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2852 /* In this case, finish_task_switch does not reenable preemption */ 2951 /* In this case, finish_task_switch does not reenable preemption */
2853 preempt_enable(); 2952 preempt_enable();
@@ -3379,9 +3478,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3379{ 3478{
3380 const struct sched_class *class; 3479 const struct sched_class *class;
3381 3480
3382 for (class = sched_class_highest; class; class = class->next) 3481 for_each_class(class) {
3383 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3482 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3384 return 1; 3483 return 1;
3484 }
3385 3485
3386 return 0; 3486 return 0;
3387} 3487}
@@ -3544,7 +3644,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3544 * capacity but still has some space to pick up some load 3644 * capacity but still has some space to pick up some load
3545 * from other group and save more power 3645 * from other group and save more power
3546 */ 3646 */
3547 if (sgs->sum_nr_running > sgs->group_capacity - 1) 3647 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3548 return; 3648 return;
3549 3649
3550 if (sgs->sum_nr_running > sds->leader_nr_running || 3650 if (sgs->sum_nr_running > sds->leader_nr_running ||
@@ -3611,6 +3711,77 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3611} 3711}
3612#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3712#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3613 3713
3714unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3715{
3716 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3717 unsigned long smt_gain = sd->smt_gain;
3718
3719 smt_gain /= weight;
3720
3721 return smt_gain;
3722}
3723
3724unsigned long scale_rt_power(int cpu)
3725{
3726 struct rq *rq = cpu_rq(cpu);
3727 u64 total, available;
3728
3729 sched_avg_update(rq);
3730
3731 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3732 available = total - rq->rt_avg;
3733
3734 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3735 total = SCHED_LOAD_SCALE;
3736
3737 total >>= SCHED_LOAD_SHIFT;
3738
3739 return div_u64(available, total);
3740}
3741
3742static void update_cpu_power(struct sched_domain *sd, int cpu)
3743{
3744 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3745 unsigned long power = SCHED_LOAD_SCALE;
3746 struct sched_group *sdg = sd->groups;
3747
3748 /* here we could scale based on cpufreq */
3749
3750 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3751 power *= arch_scale_smt_power(sd, cpu);
3752 power >>= SCHED_LOAD_SHIFT;
3753 }
3754
3755 power *= scale_rt_power(cpu);
3756 power >>= SCHED_LOAD_SHIFT;
3757
3758 if (!power)
3759 power = 1;
3760
3761 sdg->cpu_power = power;
3762}
3763
3764static void update_group_power(struct sched_domain *sd, int cpu)
3765{
3766 struct sched_domain *child = sd->child;
3767 struct sched_group *group, *sdg = sd->groups;
3768 unsigned long power;
3769
3770 if (!child) {
3771 update_cpu_power(sd, cpu);
3772 return;
3773 }
3774
3775 power = 0;
3776
3777 group = child->groups;
3778 do {
3779 power += group->cpu_power;
3780 group = group->next;
3781 } while (group != child->groups);
3782
3783 sdg->cpu_power = power;
3784}
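update_cpu_power() composes two fixed-point factors on top of SCHED_LOAD_SCALE: the per-thread SMT gain and the fraction of time left over after RT execution, as tracked by rt_avg. The stand-alone sketch below redoes that integer arithmetic for one assumed configuration (SCHED_LOAD_SCALE of 1024, the conventional smt_gain of about 1178, two hardware threads, 25% RT load over one 500 ms window); the resulting cpu_power of 441 is the kind of value that DIV_ROUND_CLOSEST() later rounds down to a capacity of 0 in find_busiest_queue():

#include <stdint.h>
#include <stdio.h>

#define SCHED_LOAD_SHIFT        10
#define SCHED_LOAD_SCALE        (1UL << SCHED_LOAD_SHIFT)

/* Assumed values: smt_gain about 15% above SCHED_LOAD_SCALE, 2 HW threads. */
#define SMT_GAIN        1178UL
#define SMT_WEIGHT      2UL

int main(void)
{
        /* One averaging window (ns) and the RT time consumed inside it. */
        uint64_t total = 500ULL * 1000 * 1000;
        uint64_t rt_avg = total / 4;                            /* 25% RT load */
        uint64_t scale_rt = (total - rt_avg) / (total >> SCHED_LOAD_SHIFT);

        unsigned long power = SCHED_LOAD_SCALE;

        power *= SMT_GAIN / SMT_WEIGHT;         /* arch_scale_smt_power() */
        power >>= SCHED_LOAD_SHIFT;

        power *= scale_rt;                      /* scale_rt_power() */
        power >>= SCHED_LOAD_SHIFT;

        printf("scale_rt = %llu/1024, cpu_power = %lu/1024\n",
               (unsigned long long)scale_rt, power);
        return 0;
}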
3614 3785
3615/** 3786/**
3616 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3787 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
@@ -3624,7 +3795,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3624 * @balance: Should we balance. 3795 * @balance: Should we balance.
3625 * @sgs: variable to hold the statistics for this group. 3796 * @sgs: variable to hold the statistics for this group.
3626 */ 3797 */
3627static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, 3798static inline void update_sg_lb_stats(struct sched_domain *sd,
3799 struct sched_group *group, int this_cpu,
3628 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3800 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3629 int local_group, const struct cpumask *cpus, 3801 int local_group, const struct cpumask *cpus,
3630 int *balance, struct sg_lb_stats *sgs) 3802 int *balance, struct sg_lb_stats *sgs)
@@ -3635,8 +3807,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3635 unsigned long sum_avg_load_per_task; 3807 unsigned long sum_avg_load_per_task;
3636 unsigned long avg_load_per_task; 3808 unsigned long avg_load_per_task;
3637 3809
3638 if (local_group) 3810 if (local_group) {
3639 balance_cpu = group_first_cpu(group); 3811 balance_cpu = group_first_cpu(group);
3812 if (balance_cpu == this_cpu)
3813 update_group_power(sd, this_cpu);
3814 }
3640 3815
3641 /* Tally up the load of all CPUs in the group */ 3816 /* Tally up the load of all CPUs in the group */
3642 sum_avg_load_per_task = avg_load_per_task = 0; 3817 sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3685,8 +3860,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3685 } 3860 }
3686 3861
3687 /* Adjust by relative CPU power of the group */ 3862 /* Adjust by relative CPU power of the group */
3688 sgs->avg_load = sg_div_cpu_power(group, 3863 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3689 sgs->group_load * SCHED_LOAD_SCALE);
3690 3864
3691 3865
3692 /* 3866 /*
@@ -3698,14 +3872,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3698 * normalized nr_running number somewhere that negates 3872 * normalized nr_running number somewhere that negates
3699 * the hierarchy? 3873 * the hierarchy?
3700 */ 3874 */
3701 avg_load_per_task = sg_div_cpu_power(group, 3875 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3702 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3876 group->cpu_power;
3703 3877
3704 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3878 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3705 sgs->group_imb = 1; 3879 sgs->group_imb = 1;
3706 3880
3707 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3881 sgs->group_capacity =
3708 3882 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3709} 3883}
3710 3884
3711/** 3885/**
@@ -3723,9 +3897,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3723 const struct cpumask *cpus, int *balance, 3897 const struct cpumask *cpus, int *balance,
3724 struct sd_lb_stats *sds) 3898 struct sd_lb_stats *sds)
3725{ 3899{
3900 struct sched_domain *child = sd->child;
3726 struct sched_group *group = sd->groups; 3901 struct sched_group *group = sd->groups;
3727 struct sg_lb_stats sgs; 3902 struct sg_lb_stats sgs;
3728 int load_idx; 3903 int load_idx, prefer_sibling = 0;
3904
3905 if (child && child->flags & SD_PREFER_SIBLING)
3906 prefer_sibling = 1;
3729 3907
3730 init_sd_power_savings_stats(sd, sds, idle); 3908 init_sd_power_savings_stats(sd, sds, idle);
3731 load_idx = get_sd_load_idx(sd, idle); 3909 load_idx = get_sd_load_idx(sd, idle);
@@ -3736,14 +3914,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3736 local_group = cpumask_test_cpu(this_cpu, 3914 local_group = cpumask_test_cpu(this_cpu,
3737 sched_group_cpus(group)); 3915 sched_group_cpus(group));
3738 memset(&sgs, 0, sizeof(sgs)); 3916 memset(&sgs, 0, sizeof(sgs));
3739 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, 3917 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3740 local_group, cpus, balance, &sgs); 3918 local_group, cpus, balance, &sgs);
3741 3919
3742 if (local_group && balance && !(*balance)) 3920 if (local_group && balance && !(*balance))
3743 return; 3921 return;
3744 3922
3745 sds->total_load += sgs.group_load; 3923 sds->total_load += sgs.group_load;
3746 sds->total_pwr += group->__cpu_power; 3924 sds->total_pwr += group->cpu_power;
3925
3926 /*
3927 * In case the child domain prefers tasks go to siblings
3928 * first, lower the group capacity to one so that we'll try
3929 * and move all the excess tasks away.
3930 */
3931 if (prefer_sibling)
3932 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3747 3933
3748 if (local_group) { 3934 if (local_group) {
3749 sds->this_load = sgs.avg_load; 3935 sds->this_load = sgs.avg_load;
@@ -3763,7 +3949,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3763 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3949 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3764 group = group->next; 3950 group = group->next;
3765 } while (group != sd->groups); 3951 } while (group != sd->groups);
3766
3767} 3952}
3768 3953
3769/** 3954/**
@@ -3801,28 +3986,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3801 * moving them. 3986 * moving them.
3802 */ 3987 */
3803 3988
3804 pwr_now += sds->busiest->__cpu_power * 3989 pwr_now += sds->busiest->cpu_power *
3805 min(sds->busiest_load_per_task, sds->max_load); 3990 min(sds->busiest_load_per_task, sds->max_load);
3806 pwr_now += sds->this->__cpu_power * 3991 pwr_now += sds->this->cpu_power *
3807 min(sds->this_load_per_task, sds->this_load); 3992 min(sds->this_load_per_task, sds->this_load);
3808 pwr_now /= SCHED_LOAD_SCALE; 3993 pwr_now /= SCHED_LOAD_SCALE;
3809 3994
3810 /* Amount of load we'd subtract */ 3995 /* Amount of load we'd subtract */
3811 tmp = sg_div_cpu_power(sds->busiest, 3996 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3812 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3997 sds->busiest->cpu_power;
3813 if (sds->max_load > tmp) 3998 if (sds->max_load > tmp)
3814 pwr_move += sds->busiest->__cpu_power * 3999 pwr_move += sds->busiest->cpu_power *
3815 min(sds->busiest_load_per_task, sds->max_load - tmp); 4000 min(sds->busiest_load_per_task, sds->max_load - tmp);
3816 4001
3817 /* Amount of load we'd add */ 4002 /* Amount of load we'd add */
3818 if (sds->max_load * sds->busiest->__cpu_power < 4003 if (sds->max_load * sds->busiest->cpu_power <
3819 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 4004 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3820 tmp = sg_div_cpu_power(sds->this, 4005 tmp = (sds->max_load * sds->busiest->cpu_power) /
3821 sds->max_load * sds->busiest->__cpu_power); 4006 sds->this->cpu_power;
3822 else 4007 else
3823 tmp = sg_div_cpu_power(sds->this, 4008 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3824 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 4009 sds->this->cpu_power;
3825 pwr_move += sds->this->__cpu_power * 4010 pwr_move += sds->this->cpu_power *
3826 min(sds->this_load_per_task, sds->this_load + tmp); 4011 min(sds->this_load_per_task, sds->this_load + tmp);
3827 pwr_move /= SCHED_LOAD_SCALE; 4012 pwr_move /= SCHED_LOAD_SCALE;
3828 4013
@@ -3857,8 +4042,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3857 sds->max_load - sds->busiest_load_per_task); 4042 sds->max_load - sds->busiest_load_per_task);
3858 4043
3859 /* How much load to actually move to equalise the imbalance */ 4044 /* How much load to actually move to equalise the imbalance */
3860 *imbalance = min(max_pull * sds->busiest->__cpu_power, 4045 *imbalance = min(max_pull * sds->busiest->cpu_power,
3861 (sds->avg_load - sds->this_load) * sds->this->__cpu_power) 4046 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3862 / SCHED_LOAD_SCALE; 4047 / SCHED_LOAD_SCALE;
3863 4048
3864 /* 4049 /*
@@ -3976,6 +4161,26 @@ ret:
3976 return NULL; 4161 return NULL;
3977} 4162}
3978 4163
4164static struct sched_group *group_of(int cpu)
4165{
4166 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
4167
4168 if (!sd)
4169 return NULL;
4170
4171 return sd->groups;
4172}
4173
4174static unsigned long power_of(int cpu)
4175{
4176 struct sched_group *group = group_of(cpu);
4177
4178 if (!group)
4179 return SCHED_LOAD_SCALE;
4180
4181 return group->cpu_power;
4182}
4183
3979/* 4184/*
3980 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4185 * find_busiest_queue - find the busiest runqueue among the cpus in group.
3981 */ 4186 */
@@ -3988,15 +4193,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3988 int i; 4193 int i;
3989 4194
3990 for_each_cpu(i, sched_group_cpus(group)) { 4195 for_each_cpu(i, sched_group_cpus(group)) {
4196 unsigned long power = power_of(i);
4197 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3991 unsigned long wl; 4198 unsigned long wl;
3992 4199
3993 if (!cpumask_test_cpu(i, cpus)) 4200 if (!cpumask_test_cpu(i, cpus))
3994 continue; 4201 continue;
3995 4202
3996 rq = cpu_rq(i); 4203 rq = cpu_rq(i);
3997 wl = weighted_cpuload(i); 4204 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4205 wl /= power;
3998 4206
3999 if (rq->nr_running == 1 && wl > imbalance) 4207 if (capacity && rq->nr_running == 1 && wl > imbalance)
4000 continue; 4208 continue;
4001 4209
4002 if (wl > max_load) { 4210 if (wl > max_load) {
@@ -5325,7 +5533,7 @@ need_resched:
5325 preempt_disable(); 5533 preempt_disable();
5326 cpu = smp_processor_id(); 5534 cpu = smp_processor_id();
5327 rq = cpu_rq(cpu); 5535 rq = cpu_rq(cpu);
5328 rcu_qsctr_inc(cpu); 5536 rcu_sched_qs(cpu);
5329 prev = rq->curr; 5537 prev = rq->curr;
5330 switch_count = &prev->nivcsw; 5538 switch_count = &prev->nivcsw;
5331 5539
@@ -5349,10 +5557,7 @@ need_resched_nonpreemptible:
5349 switch_count = &prev->nvcsw; 5557 switch_count = &prev->nvcsw;
5350 } 5558 }
5351 5559
5352#ifdef CONFIG_SMP 5560 pre_schedule(rq, prev);
5353 if (prev->sched_class->pre_schedule)
5354 prev->sched_class->pre_schedule(rq, prev);
5355#endif
5356 5561
5357 if (unlikely(!rq->nr_running)) 5562 if (unlikely(!rq->nr_running))
5358 idle_balance(cpu, rq); 5563 idle_balance(cpu, rq);
@@ -5378,6 +5583,8 @@ need_resched_nonpreemptible:
5378 } else 5583 } else
5379 spin_unlock_irq(&rq->lock); 5584 spin_unlock_irq(&rq->lock);
5380 5585
5586 post_schedule(rq);
5587
5381 if (unlikely(reacquire_kernel_lock(current) < 0)) 5588 if (unlikely(reacquire_kernel_lock(current) < 0))
5382 goto need_resched_nonpreemptible; 5589 goto need_resched_nonpreemptible;
5383 5590
@@ -6123,17 +6330,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6123 unsigned long flags; 6330 unsigned long flags;
6124 const struct sched_class *prev_class = p->sched_class; 6331 const struct sched_class *prev_class = p->sched_class;
6125 struct rq *rq; 6332 struct rq *rq;
6333 int reset_on_fork;
6126 6334
6127 /* may grab non-irq protected spin_locks */ 6335 /* may grab non-irq protected spin_locks */
6128 BUG_ON(in_interrupt()); 6336 BUG_ON(in_interrupt());
6129recheck: 6337recheck:
6130 /* double check policy once rq lock held */ 6338 /* double check policy once rq lock held */
6131 if (policy < 0) 6339 if (policy < 0) {
6340 reset_on_fork = p->sched_reset_on_fork;
6132 policy = oldpolicy = p->policy; 6341 policy = oldpolicy = p->policy;
6133 else if (policy != SCHED_FIFO && policy != SCHED_RR && 6342 } else {
6134 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6343 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6135 policy != SCHED_IDLE) 6344 policy &= ~SCHED_RESET_ON_FORK;
6136 return -EINVAL; 6345
6346 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6347 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6348 policy != SCHED_IDLE)
6349 return -EINVAL;
6350 }
6351
6137 /* 6352 /*
6138 * Valid priorities for SCHED_FIFO and SCHED_RR are 6353 * Valid priorities for SCHED_FIFO and SCHED_RR are
6139 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6354 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6392,10 @@ recheck:
6177 /* can't change other user's priorities */ 6392 /* can't change other user's priorities */
6178 if (!check_same_owner(p)) 6393 if (!check_same_owner(p))
6179 return -EPERM; 6394 return -EPERM;
6395
6396 /* Normal users shall not reset the sched_reset_on_fork flag */
6397 if (p->sched_reset_on_fork && !reset_on_fork)
6398 return -EPERM;
6180 } 6399 }
6181 6400
6182 if (user) { 6401 if (user) {
@@ -6220,6 +6439,8 @@ recheck:
6220 if (running) 6439 if (running)
6221 p->sched_class->put_prev_task(rq, p); 6440 p->sched_class->put_prev_task(rq, p);
6222 6441
6442 p->sched_reset_on_fork = reset_on_fork;
6443
6223 oldprio = p->prio; 6444 oldprio = p->prio;
6224 __setscheduler(rq, p, policy, param->sched_priority); 6445 __setscheduler(rq, p, policy, param->sched_priority);
6225 6446
@@ -6336,14 +6557,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6336 if (p) { 6557 if (p) {
6337 retval = security_task_getscheduler(p); 6558 retval = security_task_getscheduler(p);
6338 if (!retval) 6559 if (!retval)
6339 retval = p->policy; 6560 retval = p->policy
6561 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6340 } 6562 }
6341 read_unlock(&tasklist_lock); 6563 read_unlock(&tasklist_lock);
6342 return retval; 6564 return retval;
6343} 6565}
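Since the returned policy can now carry SCHED_RESET_ON_FORK in its high bits, callers that compare the result against SCHED_FIFO and friends should mask the flag off first. A small hypothetical sketch, again assuming the 0x40000000 value where the libc headers lack it:

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK     0x40000000
#endif

int main(void)
{
        int policy = sched_getscheduler(0);

        if (policy < 0) {
                perror("sched_getscheduler");
                return 1;
        }
        printf("policy %d, reset-on-fork %s\n",
               policy & ~SCHED_RESET_ON_FORK,
               (policy & SCHED_RESET_ON_FORK) ? "yes" : "no");
        return 0;
}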
6344 6566
6345/** 6567/**
6346 * sys_sched_getscheduler - get the RT priority of a thread 6568 * sys_sched_getparam - get the RT priority of a thread
6347 * @pid: the pid in question. 6569 * @pid: the pid in question.
6348 * @param: structure containing the RT priority. 6570 * @param: structure containing the RT priority.
6349 */ 6571 */
@@ -6571,19 +6793,9 @@ static inline int should_resched(void)
6571 6793
6572static void __cond_resched(void) 6794static void __cond_resched(void)
6573{ 6795{
6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6796 add_preempt_count(PREEMPT_ACTIVE);
6575 __might_sleep(__FILE__, __LINE__); 6797 schedule();
6576#endif 6798 sub_preempt_count(PREEMPT_ACTIVE);
6577 /*
6578 * The BKS might be reacquired before we have dropped
6579 * PREEMPT_ACTIVE, which could trigger a second
6580 * cond_resched() call.
6581 */
6582 do {
6583 add_preempt_count(PREEMPT_ACTIVE);
6584 schedule();
6585 sub_preempt_count(PREEMPT_ACTIVE);
6586 } while (need_resched());
6587} 6799}
6588 6800
6589int __sched _cond_resched(void) 6801int __sched _cond_resched(void)
@@ -6597,18 +6809,20 @@ int __sched _cond_resched(void)
6597EXPORT_SYMBOL(_cond_resched); 6809EXPORT_SYMBOL(_cond_resched);
6598 6810
6599/* 6811/*
6600 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 6812 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6601 * call schedule, and on return reacquire the lock. 6813 * call schedule, and on return reacquire the lock.
6602 * 6814 *
6603 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6815 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6604 * operations here to prevent schedule() from being called twice (once via 6816 * operations here to prevent schedule() from being called twice (once via
6605 * spin_unlock(), once by hand). 6817 * spin_unlock(), once by hand).
6606 */ 6818 */
6607int cond_resched_lock(spinlock_t *lock) 6819int __cond_resched_lock(spinlock_t *lock)
6608{ 6820{
6609 int resched = should_resched(); 6821 int resched = should_resched();
6610 int ret = 0; 6822 int ret = 0;
6611 6823
6824 lockdep_assert_held(lock);
6825
6612 if (spin_needbreak(lock) || resched) { 6826 if (spin_needbreak(lock) || resched) {
6613 spin_unlock(lock); 6827 spin_unlock(lock);
6614 if (resched) 6828 if (resched)
@@ -6620,9 +6834,9 @@ int cond_resched_lock(spinlock_t *lock)
6620 } 6834 }
6621 return ret; 6835 return ret;
6622} 6836}
6623EXPORT_SYMBOL(cond_resched_lock); 6837EXPORT_SYMBOL(__cond_resched_lock);
6624 6838
6625int __sched cond_resched_softirq(void) 6839int __sched __cond_resched_softirq(void)
6626{ 6840{
6627 BUG_ON(!in_softirq()); 6841 BUG_ON(!in_softirq());
6628 6842
@@ -6634,7 +6848,7 @@ int __sched cond_resched_softirq(void)
6634 } 6848 }
6635 return 0; 6849 return 0;
6636} 6850}
6637EXPORT_SYMBOL(cond_resched_softirq); 6851EXPORT_SYMBOL(__cond_resched_softirq);
6638 6852
6639/** 6853/**
6640 * yield - yield the current processor to other threads. 6854 * yield - yield the current processor to other threads.
@@ -6658,11 +6872,13 @@ EXPORT_SYMBOL(yield);
6658 */ 6872 */
6659void __sched io_schedule(void) 6873void __sched io_schedule(void)
6660{ 6874{
6661 struct rq *rq = &__raw_get_cpu_var(runqueues); 6875 struct rq *rq = raw_rq();
6662 6876
6663 delayacct_blkio_start(); 6877 delayacct_blkio_start();
6664 atomic_inc(&rq->nr_iowait); 6878 atomic_inc(&rq->nr_iowait);
6879 current->in_iowait = 1;
6665 schedule(); 6880 schedule();
6881 current->in_iowait = 0;
6666 atomic_dec(&rq->nr_iowait); 6882 atomic_dec(&rq->nr_iowait);
6667 delayacct_blkio_end(); 6883 delayacct_blkio_end();
6668} 6884}
@@ -6670,12 +6886,14 @@ EXPORT_SYMBOL(io_schedule);
6670 6886
6671long __sched io_schedule_timeout(long timeout) 6887long __sched io_schedule_timeout(long timeout)
6672{ 6888{
6673 struct rq *rq = &__raw_get_cpu_var(runqueues); 6889 struct rq *rq = raw_rq();
6674 long ret; 6890 long ret;
6675 6891
6676 delayacct_blkio_start(); 6892 delayacct_blkio_start();
6677 atomic_inc(&rq->nr_iowait); 6893 atomic_inc(&rq->nr_iowait);
6894 current->in_iowait = 1;
6678 ret = schedule_timeout(timeout); 6895 ret = schedule_timeout(timeout);
6896 current->in_iowait = 0;
6679 atomic_dec(&rq->nr_iowait); 6897 atomic_dec(&rq->nr_iowait);
6680 delayacct_blkio_end(); 6898 delayacct_blkio_end();
6681 return ret; 6899 return ret;
@@ -6992,8 +7210,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6992 7210
6993 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7211 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6994 /* Need help from migration thread: drop lock and wait. */ 7212 /* Need help from migration thread: drop lock and wait. */
7213 struct task_struct *mt = rq->migration_thread;
7214
7215 get_task_struct(mt);
6995 task_rq_unlock(rq, &flags); 7216 task_rq_unlock(rq, &flags);
6996 wake_up_process(rq->migration_thread); 7217 wake_up_process(rq->migration_thread);
7218 put_task_struct(mt);
6997 wait_for_completion(&req.done); 7219 wait_for_completion(&req.done);
6998 tlb_migrate_finish(p->mm); 7220 tlb_migrate_finish(p->mm);
6999 return 0; 7221 return 0;
@@ -7051,6 +7273,11 @@ fail:
7051 return ret; 7273 return ret;
7052} 7274}
7053 7275
7276#define RCU_MIGRATION_IDLE 0
7277#define RCU_MIGRATION_NEED_QS 1
7278#define RCU_MIGRATION_GOT_QS 2
7279#define RCU_MIGRATION_MUST_SYNC 3
7280
7054/* 7281/*
7055 * migration_thread - this is a highprio system thread that performs 7282 * migration_thread - this is a highprio system thread that performs
7056 * thread migration by bumping thread off CPU then 'pushing' onto 7283 * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7058,6 +7285,7 @@ fail:
7058 */ 7285 */
7059static int migration_thread(void *data) 7286static int migration_thread(void *data)
7060{ 7287{
7288 int badcpu;
7061 int cpu = (long)data; 7289 int cpu = (long)data;
7062 struct rq *rq; 7290 struct rq *rq;
7063 7291
@@ -7092,8 +7320,17 @@ static int migration_thread(void *data)
7092 req = list_entry(head->next, struct migration_req, list); 7320 req = list_entry(head->next, struct migration_req, list);
7093 list_del_init(head->next); 7321 list_del_init(head->next);
7094 7322
7095 spin_unlock(&rq->lock); 7323 if (req->task != NULL) {
7096 __migrate_task(req->task, cpu, req->dest_cpu); 7324 spin_unlock(&rq->lock);
7325 __migrate_task(req->task, cpu, req->dest_cpu);
7326 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7327 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7328 spin_unlock(&rq->lock);
7329 } else {
7330 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7331 spin_unlock(&rq->lock);
7332 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7333 }
7097 local_irq_enable(); 7334 local_irq_enable();
7098 7335
7099 complete(&req->done); 7336 complete(&req->done);
@@ -7625,7 +7862,7 @@ static int __init migration_init(void)
7625 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7862 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7626 register_cpu_notifier(&migration_notifier); 7863 register_cpu_notifier(&migration_notifier);
7627 7864
7628 return err; 7865 return 0;
7629} 7866}
7630early_initcall(migration_init); 7867early_initcall(migration_init);
7631#endif 7868#endif
@@ -7672,7 +7909,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7672 break; 7909 break;
7673 } 7910 }
7674 7911
7675 if (!group->__cpu_power) { 7912 if (!group->cpu_power) {
7676 printk(KERN_CONT "\n"); 7913 printk(KERN_CONT "\n");
7677 printk(KERN_ERR "ERROR: domain->cpu_power not " 7914 printk(KERN_ERR "ERROR: domain->cpu_power not "
7678 "set\n"); 7915 "set\n");
@@ -7696,9 +7933,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7696 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7933 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7697 7934
7698 printk(KERN_CONT " %s", str); 7935 printk(KERN_CONT " %s", str);
7699 if (group->__cpu_power != SCHED_LOAD_SCALE) { 7936 if (group->cpu_power != SCHED_LOAD_SCALE) {
7700 printk(KERN_CONT " (__cpu_power = %d)", 7937 printk(KERN_CONT " (cpu_power = %d)",
7701 group->__cpu_power); 7938 group->cpu_power);
7702 } 7939 }
7703 7940
7704 group = group->next; 7941 group = group->next;
@@ -7841,7 +8078,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7841 rq->rd = rd; 8078 rq->rd = rd;
7842 8079
7843 cpumask_set_cpu(rq->cpu, rd->span); 8080 cpumask_set_cpu(rq->cpu, rd->span);
7844 if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) 8081 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7845 set_rq_online(rq); 8082 set_rq_online(rq);
7846 8083
7847 spin_unlock_irqrestore(&rq->lock, flags); 8084 spin_unlock_irqrestore(&rq->lock, flags);
@@ -7983,7 +8220,7 @@ init_sched_build_groups(const struct cpumask *span,
7983 continue; 8220 continue;
7984 8221
7985 cpumask_clear(sched_group_cpus(sg)); 8222 cpumask_clear(sched_group_cpus(sg));
7986 sg->__cpu_power = 0; 8223 sg->cpu_power = 0;
7987 8224
7988 for_each_cpu(j, span) { 8225 for_each_cpu(j, span) {
7989 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8226 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8091,6 +8328,39 @@ struct static_sched_domain {
8091 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8328 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8092}; 8329};
8093 8330
8331struct s_data {
8332#ifdef CONFIG_NUMA
8333 int sd_allnodes;
8334 cpumask_var_t domainspan;
8335 cpumask_var_t covered;
8336 cpumask_var_t notcovered;
8337#endif
8338 cpumask_var_t nodemask;
8339 cpumask_var_t this_sibling_map;
8340 cpumask_var_t this_core_map;
8341 cpumask_var_t send_covered;
8342 cpumask_var_t tmpmask;
8343 struct sched_group **sched_group_nodes;
8344 struct root_domain *rd;
8345};
8346
8347enum s_alloc {
8348 sa_sched_groups = 0,
8349 sa_rootdomain,
8350 sa_tmpmask,
8351 sa_send_covered,
8352 sa_this_core_map,
8353 sa_this_sibling_map,
8354 sa_nodemask,
8355 sa_sched_group_nodes,
8356#ifdef CONFIG_NUMA
8357 sa_notcovered,
8358 sa_covered,
8359 sa_domainspan,
8360#endif
8361 sa_none,
8362};
8363
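The s_alloc values above index a staged-unwind switch (see __free_domain_allocs() further down): each case frees the resource it is named after and then falls through to the cases for everything allocated before it, so a single call cleans up exactly as much as was set up. A stand-alone sketch of the same pattern, with hypothetical ctx, a and b:

#include <stdlib.h>

struct ctx {
        char *a;        /* allocated first */
        char *b;        /* allocated second */
};

/* Ordered from last-allocated down to "nothing allocated yet"; each value
 * names what its switch case frees before falling through. */
enum stage { st_b, st_a, st_none };

static void unwind(struct ctx *c, enum stage what)
{
        switch (what) {
        case st_b:
                free(c->b);     /* fall through */
        case st_a:
                free(c->a);     /* fall through */
        case st_none:
                break;
        }
}

/* Returns 0 on success; on failure, everything already allocated is freed. */
static int setup(struct ctx *c)
{
        enum stage got = st_none;

        c->a = malloc(64);
        if (!c->a)
                goto fail;
        got = st_a;

        c->b = malloc(64);
        if (!c->b)
                goto fail;
        return 0;
fail:
        unwind(c, got);
        return -1;
}

int main(void)
{
        struct ctx c;

        if (setup(&c) == 0)
                unwind(&c, st_b);       /* normal teardown: free b, then a */
        return 0;
}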
8094/* 8364/*
8095 * SMT sched-domains: 8365 * SMT sched-domains:
8096 */ 8366 */
@@ -8208,11 +8478,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
8208 continue; 8478 continue;
8209 } 8479 }
8210 8480
8211 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 8481 sg->cpu_power += sd->groups->cpu_power;
8212 } 8482 }
8213 sg = sg->next; 8483 sg = sg->next;
8214 } while (sg != group_head); 8484 } while (sg != group_head);
8215} 8485}
8486
8487static int build_numa_sched_groups(struct s_data *d,
8488 const struct cpumask *cpu_map, int num)
8489{
8490 struct sched_domain *sd;
8491 struct sched_group *sg, *prev;
8492 int n, j;
8493
8494 cpumask_clear(d->covered);
8495 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8496 if (cpumask_empty(d->nodemask)) {
8497 d->sched_group_nodes[num] = NULL;
8498 goto out;
8499 }
8500
8501 sched_domain_node_span(num, d->domainspan);
8502 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8503
8504 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8505 GFP_KERNEL, num);
8506 if (!sg) {
8507 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8508 num);
8509 return -ENOMEM;
8510 }
8511 d->sched_group_nodes[num] = sg;
8512
8513 for_each_cpu(j, d->nodemask) {
8514 sd = &per_cpu(node_domains, j).sd;
8515 sd->groups = sg;
8516 }
8517
8518 sg->cpu_power = 0;
8519 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8520 sg->next = sg;
8521 cpumask_or(d->covered, d->covered, d->nodemask);
8522
8523 prev = sg;
8524 for (j = 0; j < nr_node_ids; j++) {
8525 n = (num + j) % nr_node_ids;
8526 cpumask_complement(d->notcovered, d->covered);
8527 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8528 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8529 if (cpumask_empty(d->tmpmask))
8530 break;
8531 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8532 if (cpumask_empty(d->tmpmask))
8533 continue;
8534 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8535 GFP_KERNEL, num);
8536 if (!sg) {
8537 printk(KERN_WARNING
8538 "Can not alloc domain group for node %d\n", j);
8539 return -ENOMEM;
8540 }
8541 sg->cpu_power = 0;
8542 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8543 sg->next = prev->next;
8544 cpumask_or(d->covered, d->covered, d->tmpmask);
8545 prev->next = sg;
8546 prev = sg;
8547 }
8548out:
8549 return 0;
8550}
8216#endif /* CONFIG_NUMA */ 8551#endif /* CONFIG_NUMA */
8217 8552
8218#ifdef CONFIG_NUMA 8553#ifdef CONFIG_NUMA
@@ -8266,15 +8601,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
8266 * there are asymmetries in the topology. If there are asymmetries, group 8601 * there are asymmetries in the topology. If there are asymmetries, group
8267 * having more cpu_power will pickup more load compared to the group having 8602 * having more cpu_power will pickup more load compared to the group having
8268 * less cpu_power. 8603 * less cpu_power.
8269 *
8270 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
8271 * the maximum number of tasks a group can handle in the presence of other idle
8272 * or lightly loaded groups in the same sched domain.
8273 */ 8604 */
8274static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8605static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8275{ 8606{
8276 struct sched_domain *child; 8607 struct sched_domain *child;
8277 struct sched_group *group; 8608 struct sched_group *group;
8609 long power;
8610 int weight;
8278 8611
8279 WARN_ON(!sd || !sd->groups); 8612 WARN_ON(!sd || !sd->groups);
8280 8613
@@ -8283,28 +8616,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8283 8616
8284 child = sd->child; 8617 child = sd->child;
8285 8618
8286 sd->groups->__cpu_power = 0; 8619 sd->groups->cpu_power = 0;
8287 8620
8288 /* 8621 if (!child) {
8289 * For perf policy, if the groups in child domain share resources 8622 power = SCHED_LOAD_SCALE;
8290 * (for example cores sharing some portions of the cache hierarchy 8623 weight = cpumask_weight(sched_domain_span(sd));
8291 * or SMT), then set this domain groups cpu_power such that each group 8624 /*
8292 * can handle only one task, when there are other idle groups in the 8625 * SMT siblings share the power of a single core.
8293 * same sched domain. 8626 * Usually multiple threads get a better yield out of
8294 */ 8627 * that one core than a single thread would have,
8295 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 8628 * reflect that in sd->smt_gain.
8296 (child->flags & 8629 */
8297 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 8630 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8298 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 8631 power *= sd->smt_gain;
8632 power /= weight;
8633 power >>= SCHED_LOAD_SHIFT;
8634 }
8635 sd->groups->cpu_power += power;
8299 return; 8636 return;
8300 } 8637 }
8301 8638
8302 /* 8639 /*
8303 * add cpu_power of each child group to this groups cpu_power 8640 * Add cpu_power of each child group to this groups cpu_power.
8304 */ 8641 */
8305 group = child->groups; 8642 group = child->groups;
8306 do { 8643 do {
8307 sg_inc_cpu_power(sd->groups, group->__cpu_power); 8644 sd->groups->cpu_power += group->cpu_power;
8308 group = group->next; 8645 group = group->next;
8309 } while (group != child->groups); 8646 } while (group != child->groups);
8310} 8647}
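
The SMT branch above takes one core's worth of SCHED_LOAD_SCALE, scales it by sd->smt_gain and divides it among the hardware threads. A worked instance of that arithmetic, assuming SCHED_LOAD_SCALE = 1024 (SCHED_LOAD_SHIFT = 10), a two-thread sibling domain and a hypothetical smt_gain of 1178:

#include <stdio.h>

int main(void)
{
        long power = 1024;              /* SCHED_LOAD_SCALE, assumed */
        unsigned int smt_gain = 1178;   /* hypothetical gain, about 1.15 * 1024 */
        int weight = 2;                 /* two SMT siblings in the domain */

        power *= smt_gain;              /* 1206272 */
        power /= weight;                /* 603136 */
        power >>= 10;                   /* SCHED_LOAD_SHIFT: 589 */

        /*
         * Each sibling contributes ~589 instead of 1024, so the pair sums
         * to ~1178: two threads are worth a bit more than one core, not two.
         */
        printf("per-sibling cpu_power = %ld\n", power);
        return 0;
}
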
@@ -8378,280 +8715,285 @@ static void set_domain_attribute(struct sched_domain *sd,
8378 } 8715 }
8379} 8716}
8380 8717
8381/* 8718static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8382 * Build sched domains for a given set of cpus and attach the sched domains 8719 const struct cpumask *cpu_map)
8383 * to the individual cpus 8720{
8384 */ 8721 switch (what) {
8385static int __build_sched_domains(const struct cpumask *cpu_map, 8722 case sa_sched_groups:
8386 struct sched_domain_attr *attr) 8723 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8387{ 8724 d->sched_group_nodes = NULL;
8388 int i, err = -ENOMEM; 8725 case sa_rootdomain:
8389 struct root_domain *rd; 8726 free_rootdomain(d->rd); /* fall through */
8390 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, 8727 case sa_tmpmask:
8391 tmpmask; 8728 free_cpumask_var(d->tmpmask); /* fall through */
8729 case sa_send_covered:
8730 free_cpumask_var(d->send_covered); /* fall through */
8731 case sa_this_core_map:
8732 free_cpumask_var(d->this_core_map); /* fall through */
8733 case sa_this_sibling_map:
8734 free_cpumask_var(d->this_sibling_map); /* fall through */
8735 case sa_nodemask:
8736 free_cpumask_var(d->nodemask); /* fall through */
8737 case sa_sched_group_nodes:
8392#ifdef CONFIG_NUMA 8738#ifdef CONFIG_NUMA
8393 cpumask_var_t domainspan, covered, notcovered; 8739 kfree(d->sched_group_nodes); /* fall through */
8394 struct sched_group **sched_group_nodes = NULL; 8740 case sa_notcovered:
8395 int sd_allnodes = 0; 8741 free_cpumask_var(d->notcovered); /* fall through */
8396 8742 case sa_covered:
8397 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) 8743 free_cpumask_var(d->covered); /* fall through */
8398 goto out; 8744 case sa_domainspan:
8399 if (!alloc_cpumask_var(&covered, GFP_KERNEL)) 8745 free_cpumask_var(d->domainspan); /* fall through */
8400 goto free_domainspan; 8746#endif
8401 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL)) 8747 case sa_none:
8402 goto free_covered; 8748 break;
8403#endif 8749 }
8404 8750}
8405 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
8406 goto free_notcovered;
8407 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
8408 goto free_nodemask;
8409 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
8410 goto free_this_sibling_map;
8411 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
8412 goto free_this_core_map;
8413 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
8414 goto free_send_covered;
8415 8751
8752static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8753 const struct cpumask *cpu_map)
8754{
8416#ifdef CONFIG_NUMA 8755#ifdef CONFIG_NUMA
8417 /* 8756 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8418 * Allocate the per-node list of sched groups 8757 return sa_none;
8419 */ 8758 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8420 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), 8759 return sa_domainspan;
8421 GFP_KERNEL); 8760 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8422 if (!sched_group_nodes) { 8761 return sa_covered;
8762 /* Allocate the per-node list of sched groups */
8763 d->sched_group_nodes = kcalloc(nr_node_ids,
8764 sizeof(struct sched_group *), GFP_KERNEL);
8765 if (!d->sched_group_nodes) {
8423 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8766 printk(KERN_WARNING "Can not alloc sched group node list\n");
8424 goto free_tmpmask; 8767 return sa_notcovered;
8425 } 8768 }
8426#endif 8769 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8427 8770#endif
8428 rd = alloc_rootdomain(); 8771 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8429 if (!rd) { 8772 return sa_sched_group_nodes;
8773 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8774 return sa_nodemask;
8775 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8776 return sa_this_sibling_map;
8777 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8778 return sa_this_core_map;
8779 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8780 return sa_send_covered;
8781 d->rd = alloc_rootdomain();
8782 if (!d->rd) {
8430 printk(KERN_WARNING "Cannot alloc root domain\n"); 8783 printk(KERN_WARNING "Cannot alloc root domain\n");
8431 goto free_sched_groups; 8784 return sa_tmpmask;
8432 } 8785 }
8786 return sa_rootdomain;
8787}
8433 8788
8789static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8790 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8791{
8792 struct sched_domain *sd = NULL;
8434#ifdef CONFIG_NUMA 8793#ifdef CONFIG_NUMA
8435 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; 8794 struct sched_domain *parent;
8436#endif
8437
8438 /*
8439 * Set up domains for cpus specified by the cpu_map.
8440 */
8441 for_each_cpu(i, cpu_map) {
8442 struct sched_domain *sd = NULL, *p;
8443
8444 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
8445
8446#ifdef CONFIG_NUMA
8447 if (cpumask_weight(cpu_map) >
8448 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
8449 sd = &per_cpu(allnodes_domains, i).sd;
8450 SD_INIT(sd, ALLNODES);
8451 set_domain_attribute(sd, attr);
8452 cpumask_copy(sched_domain_span(sd), cpu_map);
8453 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8454 p = sd;
8455 sd_allnodes = 1;
8456 } else
8457 p = NULL;
8458 8795
8459 sd = &per_cpu(node_domains, i).sd; 8796 d->sd_allnodes = 0;
8460 SD_INIT(sd, NODE); 8797 if (cpumask_weight(cpu_map) >
8798 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8799 sd = &per_cpu(allnodes_domains, i).sd;
8800 SD_INIT(sd, ALLNODES);
8461 set_domain_attribute(sd, attr); 8801 set_domain_attribute(sd, attr);
8462 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8802 cpumask_copy(sched_domain_span(sd), cpu_map);
8463 sd->parent = p; 8803 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8464 if (p) 8804 d->sd_allnodes = 1;
8465 p->child = sd; 8805 }
8466 cpumask_and(sched_domain_span(sd), 8806 parent = sd;
8467 sched_domain_span(sd), cpu_map); 8807
8808 sd = &per_cpu(node_domains, i).sd;
8809 SD_INIT(sd, NODE);
8810 set_domain_attribute(sd, attr);
8811 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8812 sd->parent = parent;
8813 if (parent)
8814 parent->child = sd;
8815 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8468#endif 8816#endif
8817 return sd;
8818}
8469 8819
8470 p = sd; 8820static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8471 sd = &per_cpu(phys_domains, i).sd; 8821 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8472 SD_INIT(sd, CPU); 8822 struct sched_domain *parent, int i)
8473 set_domain_attribute(sd, attr); 8823{
8474 cpumask_copy(sched_domain_span(sd), nodemask); 8824 struct sched_domain *sd;
8475 sd->parent = p; 8825 sd = &per_cpu(phys_domains, i).sd;
8476 if (p) 8826 SD_INIT(sd, CPU);
8477 p->child = sd; 8827 set_domain_attribute(sd, attr);
8478 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); 8828 cpumask_copy(sched_domain_span(sd), d->nodemask);
8829 sd->parent = parent;
8830 if (parent)
8831 parent->child = sd;
8832 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8833 return sd;
8834}
8479 8835
8836static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8837 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8838 struct sched_domain *parent, int i)
8839{
8840 struct sched_domain *sd = parent;
8480#ifdef CONFIG_SCHED_MC 8841#ifdef CONFIG_SCHED_MC
8481 p = sd; 8842 sd = &per_cpu(core_domains, i).sd;
8482 sd = &per_cpu(core_domains, i).sd; 8843 SD_INIT(sd, MC);
8483 SD_INIT(sd, MC); 8844 set_domain_attribute(sd, attr);
8484 set_domain_attribute(sd, attr); 8845 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8485 cpumask_and(sched_domain_span(sd), cpu_map, 8846 sd->parent = parent;
8486 cpu_coregroup_mask(i)); 8847 parent->child = sd;
8487 sd->parent = p; 8848 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8488 p->child = sd;
8489 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8490#endif 8849#endif
8850 return sd;
8851}
8491 8852
8853static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8854 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8855 struct sched_domain *parent, int i)
8856{
8857 struct sched_domain *sd = parent;
8492#ifdef CONFIG_SCHED_SMT 8858#ifdef CONFIG_SCHED_SMT
8493 p = sd; 8859 sd = &per_cpu(cpu_domains, i).sd;
8494 sd = &per_cpu(cpu_domains, i).sd; 8860 SD_INIT(sd, SIBLING);
8495 SD_INIT(sd, SIBLING); 8861 set_domain_attribute(sd, attr);
8496 set_domain_attribute(sd, attr); 8862 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8497 cpumask_and(sched_domain_span(sd), 8863 sd->parent = parent;
8498 topology_thread_cpumask(i), cpu_map); 8864 parent->child = sd;
8499 sd->parent = p; 8865 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8500 p->child = sd;
8501 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8502#endif 8866#endif
8503 } 8867 return sd;
8868}
8504 8869
8870static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8871 const struct cpumask *cpu_map, int cpu)
8872{
8873 switch (l) {
8505#ifdef CONFIG_SCHED_SMT 8874#ifdef CONFIG_SCHED_SMT
8506 /* Set up CPU (sibling) groups */ 8875 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8507 for_each_cpu(i, cpu_map) { 8876 cpumask_and(d->this_sibling_map, cpu_map,
8508 cpumask_and(this_sibling_map, 8877 topology_thread_cpumask(cpu));
8509 topology_thread_cpumask(i), cpu_map); 8878 if (cpu == cpumask_first(d->this_sibling_map))
8510 if (i != cpumask_first(this_sibling_map)) 8879 init_sched_build_groups(d->this_sibling_map, cpu_map,
8511 continue; 8880 &cpu_to_cpu_group,
8512 8881 d->send_covered, d->tmpmask);
8513 init_sched_build_groups(this_sibling_map, cpu_map, 8882 break;
8514 &cpu_to_cpu_group,
8515 send_covered, tmpmask);
8516 }
8517#endif 8883#endif
8518
8519#ifdef CONFIG_SCHED_MC 8884#ifdef CONFIG_SCHED_MC
8520 /* Set up multi-core groups */ 8885 case SD_LV_MC: /* set up multi-core groups */
8521 for_each_cpu(i, cpu_map) { 8886 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8522 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); 8887 if (cpu == cpumask_first(d->this_core_map))
8523 if (i != cpumask_first(this_core_map)) 8888 init_sched_build_groups(d->this_core_map, cpu_map,
8524 continue; 8889 &cpu_to_core_group,
8525 8890 d->send_covered, d->tmpmask);
8526 init_sched_build_groups(this_core_map, cpu_map, 8891 break;
8527 &cpu_to_core_group,
8528 send_covered, tmpmask);
8529 }
8530#endif 8892#endif
8531 8893 case SD_LV_CPU: /* set up physical groups */
8532 /* Set up physical groups */ 8894 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8533 for (i = 0; i < nr_node_ids; i++) { 8895 if (!cpumask_empty(d->nodemask))
8534 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8896 init_sched_build_groups(d->nodemask, cpu_map,
8535 if (cpumask_empty(nodemask)) 8897 &cpu_to_phys_group,
8536 continue; 8898 d->send_covered, d->tmpmask);
8537 8899 break;
8538 init_sched_build_groups(nodemask, cpu_map,
8539 &cpu_to_phys_group,
8540 send_covered, tmpmask);
8541 }
8542
8543#ifdef CONFIG_NUMA 8900#ifdef CONFIG_NUMA
8544 /* Set up node groups */ 8901 case SD_LV_ALLNODES:
8545 if (sd_allnodes) { 8902 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8546 init_sched_build_groups(cpu_map, cpu_map, 8903 d->send_covered, d->tmpmask);
8547 &cpu_to_allnodes_group, 8904 break;
8548 send_covered, tmpmask); 8905#endif
8906 default:
8907 break;
8549 } 8908 }
8909}
8550 8910
8551 for (i = 0; i < nr_node_ids; i++) { 8911/*
8552 /* Set up node groups */ 8912 * Build sched domains for a given set of cpus and attach the sched domains
8553 struct sched_group *sg, *prev; 8913 * to the individual cpus
8554 int j; 8914 */
8555 8915static int __build_sched_domains(const struct cpumask *cpu_map,
8556 cpumask_clear(covered); 8916 struct sched_domain_attr *attr)
8557 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8917{
8558 if (cpumask_empty(nodemask)) { 8918 enum s_alloc alloc_state = sa_none;
8559 sched_group_nodes[i] = NULL; 8919 struct s_data d;
8560 continue; 8920 struct sched_domain *sd;
8561 } 8921 int i;
8922#ifdef CONFIG_NUMA
8923 d.sd_allnodes = 0;
8924#endif
8562 8925
8563 sched_domain_node_span(i, domainspan); 8926 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8564 cpumask_and(domainspan, domainspan, cpu_map); 8927 if (alloc_state != sa_rootdomain)
8928 goto error;
8929 alloc_state = sa_sched_groups;
8565 8930
8566 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8931 /*
8567 GFP_KERNEL, i); 8932 * Set up domains for cpus specified by the cpu_map.
8568 if (!sg) { 8933 */
8569 printk(KERN_WARNING "Can not alloc domain group for " 8934 for_each_cpu(i, cpu_map) {
8570 "node %d\n", i); 8935 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8571 goto error; 8936 cpu_map);
8572 }
8573 sched_group_nodes[i] = sg;
8574 for_each_cpu(j, nodemask) {
8575 struct sched_domain *sd;
8576 8937
8577 sd = &per_cpu(node_domains, j).sd; 8938 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8578 sd->groups = sg; 8939 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8579 } 8940 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8580 sg->__cpu_power = 0; 8941 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8581 cpumask_copy(sched_group_cpus(sg), nodemask); 8942 }
8582 sg->next = sg;
8583 cpumask_or(covered, covered, nodemask);
8584 prev = sg;
8585 8943
8586 for (j = 0; j < nr_node_ids; j++) { 8944 for_each_cpu(i, cpu_map) {
8587 int n = (i + j) % nr_node_ids; 8945 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8946 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8947 }
8588 8948
8589 cpumask_complement(notcovered, covered); 8949 /* Set up physical groups */
8590 cpumask_and(tmpmask, notcovered, cpu_map); 8950 for (i = 0; i < nr_node_ids; i++)
8591 cpumask_and(tmpmask, tmpmask, domainspan); 8951 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8592 if (cpumask_empty(tmpmask))
8593 break;
8594 8952
8595 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); 8953#ifdef CONFIG_NUMA
8596 if (cpumask_empty(tmpmask)) 8954 /* Set up node groups */
8597 continue; 8955 if (d.sd_allnodes)
8956 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8598 8957
8599 sg = kmalloc_node(sizeof(struct sched_group) + 8958 for (i = 0; i < nr_node_ids; i++)
8600 cpumask_size(), 8959 if (build_numa_sched_groups(&d, cpu_map, i))
8601 GFP_KERNEL, i); 8960 goto error;
8602 if (!sg) {
8603 printk(KERN_WARNING
8604 "Can not alloc domain group for node %d\n", j);
8605 goto error;
8606 }
8607 sg->__cpu_power = 0;
8608 cpumask_copy(sched_group_cpus(sg), tmpmask);
8609 sg->next = prev->next;
8610 cpumask_or(covered, covered, tmpmask);
8611 prev->next = sg;
8612 prev = sg;
8613 }
8614 }
8615#endif 8961#endif
8616 8962
8617 /* Calculate CPU power for physical packages and nodes */ 8963 /* Calculate CPU power for physical packages and nodes */
8618#ifdef CONFIG_SCHED_SMT 8964#ifdef CONFIG_SCHED_SMT
8619 for_each_cpu(i, cpu_map) { 8965 for_each_cpu(i, cpu_map) {
8620 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; 8966 sd = &per_cpu(cpu_domains, i).sd;
8621
8622 init_sched_groups_power(i, sd); 8967 init_sched_groups_power(i, sd);
8623 } 8968 }
8624#endif 8969#endif
8625#ifdef CONFIG_SCHED_MC 8970#ifdef CONFIG_SCHED_MC
8626 for_each_cpu(i, cpu_map) { 8971 for_each_cpu(i, cpu_map) {
8627 struct sched_domain *sd = &per_cpu(core_domains, i).sd; 8972 sd = &per_cpu(core_domains, i).sd;
8628
8629 init_sched_groups_power(i, sd); 8973 init_sched_groups_power(i, sd);
8630 } 8974 }
8631#endif 8975#endif
8632 8976
8633 for_each_cpu(i, cpu_map) { 8977 for_each_cpu(i, cpu_map) {
8634 struct sched_domain *sd = &per_cpu(phys_domains, i).sd; 8978 sd = &per_cpu(phys_domains, i).sd;
8635
8636 init_sched_groups_power(i, sd); 8979 init_sched_groups_power(i, sd);
8637 } 8980 }
8638 8981
8639#ifdef CONFIG_NUMA 8982#ifdef CONFIG_NUMA
8640 for (i = 0; i < nr_node_ids; i++) 8983 for (i = 0; i < nr_node_ids; i++)
8641 init_numa_sched_groups_power(sched_group_nodes[i]); 8984 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8642 8985
8643 if (sd_allnodes) { 8986 if (d.sd_allnodes) {
8644 struct sched_group *sg; 8987 struct sched_group *sg;
8645 8988
8646 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8989 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8647 tmpmask); 8990 d.tmpmask);
8648 init_numa_sched_groups_power(sg); 8991 init_numa_sched_groups_power(sg);
8649 } 8992 }
8650#endif 8993#endif
8651 8994
8652 /* Attach the domains */ 8995 /* Attach the domains */
8653 for_each_cpu(i, cpu_map) { 8996 for_each_cpu(i, cpu_map) {
8654 struct sched_domain *sd;
8655#ifdef CONFIG_SCHED_SMT 8997#ifdef CONFIG_SCHED_SMT
8656 sd = &per_cpu(cpu_domains, i).sd; 8998 sd = &per_cpu(cpu_domains, i).sd;
8657#elif defined(CONFIG_SCHED_MC) 8999#elif defined(CONFIG_SCHED_MC)
@@ -8659,44 +9001,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8659#else 9001#else
8660 sd = &per_cpu(phys_domains, i).sd; 9002 sd = &per_cpu(phys_domains, i).sd;
8661#endif 9003#endif
8662 cpu_attach_domain(sd, rd, i); 9004 cpu_attach_domain(sd, d.rd, i);
8663 } 9005 }
8664 9006
8665 err = 0; 9007 d.sched_group_nodes = NULL; /* don't free this we still need it */
8666 9008 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8667free_tmpmask: 9009 return 0;
8668 free_cpumask_var(tmpmask);
8669free_send_covered:
8670 free_cpumask_var(send_covered);
8671free_this_core_map:
8672 free_cpumask_var(this_core_map);
8673free_this_sibling_map:
8674 free_cpumask_var(this_sibling_map);
8675free_nodemask:
8676 free_cpumask_var(nodemask);
8677free_notcovered:
8678#ifdef CONFIG_NUMA
8679 free_cpumask_var(notcovered);
8680free_covered:
8681 free_cpumask_var(covered);
8682free_domainspan:
8683 free_cpumask_var(domainspan);
8684out:
8685#endif
8686 return err;
8687
8688free_sched_groups:
8689#ifdef CONFIG_NUMA
8690 kfree(sched_group_nodes);
8691#endif
8692 goto free_tmpmask;
8693 9010
8694#ifdef CONFIG_NUMA
8695error: 9011error:
8696 free_sched_groups(cpu_map, tmpmask); 9012 __free_domain_allocs(&d, alloc_state, cpu_map);
8697 free_rootdomain(rd); 9013 return -ENOMEM;
8698 goto free_tmpmask;
8699#endif
8700} 9014}
8701 9015
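
Each __build_*_sched_domain() helper above takes the domain built so far as parent, links the new level beneath it and returns it, so the per-cpu loop threads a single sd pointer through NODE, CPU, MC and SIBLING. A compact sketch of that chaining pattern with a hypothetical struct dom:

#include <stdio.h>

struct dom {
        const char *name;
        struct dom *parent, *child;
};

/* link one level under its parent and return it, as the helpers do */
static struct dom *build_level(struct dom *d, const char *name, struct dom *parent)
{
        d->name = name;
        d->parent = parent;
        d->child = NULL;
        if (parent)
                parent->child = d;
        return d;
}

int main(void)
{
        struct dom node, phys, mc, smt, *sd = NULL;

        sd = build_level(&node, "NODE", sd);
        sd = build_level(&phys, "CPU", sd);
        sd = build_level(&mc, "MC", sd);
        sd = build_level(&smt, "SIBLING", sd);

        /* walk upward from the lowest level, as a for_each_domain() walk would */
        for (; sd; sd = sd->parent)
                printf("%s\n", sd->name);
        return 0;
}
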
8702static int build_sched_domains(const struct cpumask *cpu_map) 9016static int build_sched_domains(const struct cpumask *cpu_map)
@@ -9304,11 +9618,11 @@ void __init sched_init(void)
9304 * system cpu resource, based on the weight assigned to root 9618 * system cpu resource, based on the weight assigned to root
9305 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9619 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9306 * by letting tasks of init_task_group sit in a separate cfs_rq 9620 * by letting tasks of init_task_group sit in a separate cfs_rq
9307 * (init_cfs_rq) and having one entity represent this group of 9621 * (init_tg_cfs_rq) and having one entity represent this group of
9308 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9622 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9309 */ 9623 */
9310 init_tg_cfs_entry(&init_task_group, 9624 init_tg_cfs_entry(&init_task_group,
9311 &per_cpu(init_cfs_rq, i), 9625 &per_cpu(init_tg_cfs_rq, i),
9312 &per_cpu(init_sched_entity, i), i, 1, 9626 &per_cpu(init_sched_entity, i), i, 1,
9313 root_task_group.se[i]); 9627 root_task_group.se[i]);
9314 9628
@@ -9334,6 +9648,7 @@ void __init sched_init(void)
9334#ifdef CONFIG_SMP 9648#ifdef CONFIG_SMP
9335 rq->sd = NULL; 9649 rq->sd = NULL;
9336 rq->rd = NULL; 9650 rq->rd = NULL;
9651 rq->post_schedule = 0;
9337 rq->active_balance = 0; 9652 rq->active_balance = 0;
9338 rq->next_balance = jiffies; 9653 rq->next_balance = jiffies;
9339 rq->push_cpu = 0; 9654 rq->push_cpu = 0;
@@ -9398,13 +9713,20 @@ void __init sched_init(void)
9398} 9713}
9399 9714
9400#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9715#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9401void __might_sleep(char *file, int line) 9716static inline int preempt_count_equals(int preempt_offset)
9717{
9718 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9719
9720 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9721}
9722
9723void __might_sleep(char *file, int line, int preempt_offset)
9402{ 9724{
9403#ifdef in_atomic 9725#ifdef in_atomic
9404 static unsigned long prev_jiffy; /* ratelimiting */ 9726 static unsigned long prev_jiffy; /* ratelimiting */
9405 9727
9406 if ((!in_atomic() && !irqs_disabled()) || 9728 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9407 system_state != SYSTEM_RUNNING || oops_in_progress) 9729 system_state != SYSTEM_RUNNING || oops_in_progress)
9408 return; 9730 return;
9409 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9731 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9410 return; 9732 return;
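
preempt_count_equals() lets __might_sleep() accept a caller-supplied offset, so a path that legitimately holds a known amount of preempt count (for example one spinlock) can still be checked for sleeping any deeper than that. A rough illustration of the comparison with made-up counter values; the real masks and bases live in the preempt headers:

#include <stdio.h>

#define DEMO_PREEMPT_ACTIVE     0x10000000      /* illustrative mask value only */

/* mirrors preempt_count_equals(): ignore PREEMPT_ACTIVE, compare the rest */
static int count_equals(unsigned int preempt_count, int base, int offset)
{
        unsigned int nested = preempt_count & ~DEMO_PREEMPT_ACTIVE;

        return nested == (unsigned int)(base + offset);
}

int main(void)
{
        /* hypothetical numbers: base 0, caller declares one held lock worth 1 */
        printf("%d\n", count_equals(1, 0, 1));  /* 1: expected depth, no warning */
        printf("%d\n", count_equals(2, 0, 1));  /* 0: deeper nesting, would warn */
        return 0;
}
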
@@ -10581,3 +10903,113 @@ struct cgroup_subsys cpuacct_subsys = {
10581 .subsys_id = cpuacct_subsys_id, 10903 .subsys_id = cpuacct_subsys_id,
10582}; 10904};
10583#endif /* CONFIG_CGROUP_CPUACCT */ 10905#endif /* CONFIG_CGROUP_CPUACCT */
10906
10907#ifndef CONFIG_SMP
10908
10909int rcu_expedited_torture_stats(char *page)
10910{
10911 return 0;
10912}
10913EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10914
10915void synchronize_sched_expedited(void)
10916{
10917}
10918EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10919
10920#else /* #ifndef CONFIG_SMP */
10921
10922static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10923static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10924
10925#define RCU_EXPEDITED_STATE_POST -2
10926#define RCU_EXPEDITED_STATE_IDLE -1
10927
10928static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10929
10930int rcu_expedited_torture_stats(char *page)
10931{
10932 int cnt = 0;
10933 int cpu;
10934
10935 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10936 for_each_online_cpu(cpu) {
10937 cnt += sprintf(&page[cnt], " %d:%d",
10938 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10939 }
10940 cnt += sprintf(&page[cnt], "\n");
10941 return cnt;
10942}
10943EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10944
10945static long synchronize_sched_expedited_count;
10946
10947/*
10948 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10949 * approach to force grace period to end quickly. This consumes
10950 * significant time on all CPUs, and is thus not recommended for
10951 * any sort of common-case code.
10952 *
10953 * Note that it is illegal to call this function while holding any
10954 * lock that is acquired by a CPU-hotplug notifier. Failing to
10955 * observe this restriction will result in deadlock.
10956 */
10957void synchronize_sched_expedited(void)
10958{
10959 int cpu;
10960 unsigned long flags;
10961 bool need_full_sync = 0;
10962 struct rq *rq;
10963 struct migration_req *req;
10964 long snap;
10965 int trycount = 0;
10966
10967 smp_mb(); /* ensure prior mod happens before capturing snap. */
10968 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10969 get_online_cpus();
10970 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10971 put_online_cpus();
10972 if (trycount++ < 10)
10973 udelay(trycount * num_online_cpus());
10974 else {
10975 synchronize_sched();
10976 return;
10977 }
10978 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10979 smp_mb(); /* ensure test happens before caller kfree */
10980 return;
10981 }
10982 get_online_cpus();
10983 }
10984 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10985 for_each_online_cpu(cpu) {
10986 rq = cpu_rq(cpu);
10987 req = &per_cpu(rcu_migration_req, cpu);
10988 init_completion(&req->done);
10989 req->task = NULL;
10990 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10991 spin_lock_irqsave(&rq->lock, flags);
10992 list_add(&req->list, &rq->migration_queue);
10993 spin_unlock_irqrestore(&rq->lock, flags);
10994 wake_up_process(rq->migration_thread);
10995 }
10996 for_each_online_cpu(cpu) {
10997 rcu_expedited_state = cpu;
10998 req = &per_cpu(rcu_migration_req, cpu);
10999 rq = cpu_rq(cpu);
11000 wait_for_completion(&req->done);
11001 spin_lock_irqsave(&rq->lock, flags);
11002 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
11003 need_full_sync = 1;
11004 req->dest_cpu = RCU_MIGRATION_IDLE;
11005 spin_unlock_irqrestore(&rq->lock, flags);
11006 }
11007 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
11008 mutex_unlock(&rcu_sched_expedited_mutex);
11009 put_online_cpus();
11010 if (need_full_sync)
11011 synchronize_sched();
11012}
11013EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
11014
11015#endif /* #else #ifndef CONFIG_SMP */
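
The comment block above positions synchronize_sched_expedited() as a drop-in for synchronize_sched() when grace-period latency matters more than CPU time, with the stated restriction about CPU-hotplug notifier locks. A hedged usage sketch, with a hypothetical RCU-sched protected list as the update side (not a complete module):

#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        struct foo *next;
        int data;
};

/* readers traverse this under rcu_read_lock_sched() / disabled preemption */
static struct foo *global_list;
static DEFINE_SPINLOCK(list_lock);

static void remove_first_foo(void)
{
        struct foo *old;

        spin_lock(&list_lock);
        old = global_list;
        if (old)
                rcu_assign_pointer(global_list, old->next);
        spin_unlock(&list_lock);

        if (!old)
                return;

        /*
         * Same guarantee as synchronize_sched(), but the grace period is
         * forced through quickly at the cost of disturbing every CPU, so
         * keep it off common paths and out of hotplug-notifier locks.
         */
        synchronize_sched_expedited();
        kfree(old);
}
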
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index d014efbf947a..0f052fc674d5 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
127 127
128 /* 128 /*
129 * If the cpu was currently mapped to a different value, we 129 * If the cpu was currently mapped to a different value, we
130 * first need to unmap the old value 130 * need to map it to the new value then remove the old value.
131 * Note, we must add the new value first, otherwise we risk the
132 * cpu being cleared from pri_active, and this cpu could be
133 * missed for a push or pull.
131 */ 134 */
132 if (likely(oldpri != CPUPRI_INVALID)) {
133 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
134
135 spin_lock_irqsave(&vec->lock, flags);
136
137 vec->count--;
138 if (!vec->count)
139 clear_bit(oldpri, cp->pri_active);
140 cpumask_clear_cpu(cpu, vec->mask);
141
142 spin_unlock_irqrestore(&vec->lock, flags);
143 }
144
145 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
146 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
147 137
@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
154 144
155 spin_unlock_irqrestore(&vec->lock, flags); 145 spin_unlock_irqrestore(&vec->lock, flags);
156 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149
150 spin_lock_irqsave(&vec->lock, flags);
151
152 vec->count--;
153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask);
156
157 spin_unlock_irqrestore(&vec->lock, flags);
158 }
157 159
158 *currpri = newpri; 160 *currpri = newpri;
159} 161}
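
The reordered cpupri_set() above inserts the cpu at its new priority before removing it from the old one; with the previous order there was a window in which the cpu sat in neither vector and a concurrent scan could miss it. A schematic interleaving of that window, written as a comment rather than actual kernel code:

/*
 * Old order (remove, then add) racing with a concurrent cpupri_find():
 *
 *     CPU A: cpupri_set()                 CPU B: cpupri_find()
 *     -------------------                 --------------------
 *     clear cpu from pri_to_cpu[old]
 *                                         scan the priority vectors:
 *                                         cpu is in neither vector,
 *                                         so it is skipped for push/pull
 *     set cpu in pri_to_cpu[new]
 *
 * New order (add, then remove): the cpu is always present in at least one
 * vector, so the worst case is a stale hit that the caller re-validates,
 * never a missed target.
 */
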
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 70c7e0b79946..5ddbd0891267 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
409 PN(se.wait_max); 409 PN(se.wait_max);
410 PN(se.wait_sum); 410 PN(se.wait_sum);
411 P(se.wait_count); 411 P(se.wait_count);
412 PN(se.iowait_sum);
413 P(se.iowait_count);
412 P(sched_info.bkl_count); 414 P(sched_info.bkl_count);
413 P(se.nr_migrations); 415 P(se.nr_migrations);
414 P(se.nr_migrations_cold); 416 P(se.nr_migrations_cold);
@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p)
479 p->se.wait_max = 0; 481 p->se.wait_max = 0;
480 p->se.wait_sum = 0; 482 p->se.wait_sum = 0;
481 p->se.wait_count = 0; 483 p->se.wait_count = 0;
484 p->se.iowait_sum = 0;
485 p->se.iowait_count = 0;
482 p->se.sleep_max = 0; 486 p->se.sleep_max = 0;
483 p->se.sum_sleep_runtime = 0; 487 p->se.sum_sleep_runtime = 0;
484 p->se.block_max = 0; 488 p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 652e8bdef9aa..aa7f84121016 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
24 24
25/* 25/*
26 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) 27 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
28 * 28 *
29 * NOTE: this latency value is not the same as the concept of 29 * NOTE: this latency value is not the same as the concept of
30 * 'timeslice length' - timeslices in CFS are of variable length 30 * 'timeslice length' - timeslices in CFS are of variable length
@@ -34,13 +34,13 @@
34 * (to see the precise effective timeslice length of your workload, 34 * (to see the precise effective timeslice length of your workload,
35 * run vmstat and monitor the context-switches (cs) field) 35 * run vmstat and monitor the context-switches (cs) field)
36 */ 36 */
37unsigned int sysctl_sched_latency = 20000000ULL; 37unsigned int sysctl_sched_latency = 5000000ULL;
38 38
39/* 39/*
40 * Minimal preemption granularity for CPU-bound tasks: 40 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) 41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 42 */
43unsigned int sysctl_sched_min_granularity = 4000000ULL; 43unsigned int sysctl_sched_min_granularity = 1000000ULL;
44 44
45/* 45/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
48static unsigned int sched_nr_latency = 5; 48static unsigned int sched_nr_latency = 5;
49 49
50/* 50/*
51 * After fork, child runs first. (default) If set to 0 then 51 * After fork, child runs first. If set to 0 (default) then
52 * parent will (try to) run first. 52 * parent will (try to) run first.
53 */ 53 */
54const_debug unsigned int sysctl_sched_child_runs_first = 1; 54unsigned int sysctl_sched_child_runs_first __read_mostly;
55 55
56/* 56/*
57 * sys_sched_yield() compat mode 57 * sys_sched_yield() compat mode
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 5000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
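
The values changed above are base defaults; as the comments say, they are scaled by (1 + ilog(ncpus)) when the scheduler sets itself up. A quick calculation of the effective defaults under that assumption, for example on an 8-CPU machine:

#include <stdio.h>

/* integer log2, standing in for the kernel's ilog2() */
static unsigned int ilog2_u(unsigned int x)
{
        unsigned int r = 0;

        while (x >>= 1)
                r++;
        return r;
}

int main(void)
{
        unsigned int ncpus = 8;
        unsigned int factor = 1 + ilog2_u(ncpus);       /* 1 + 3 = 4 */

        /* base defaults from this patch, expressed in milliseconds */
        printf("effective latency          = %u ms\n", 5 * factor);  /* 20 ms */
        printf("effective min granularity  = %u ms\n", 1 * factor);  /*  4 ms */
        printf("effective wakeup gran      = %u ms\n", 1 * factor);  /*  4 ms */
        return 0;
}
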
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
79 * CFS operations on generic schedulable entities: 79 * CFS operations on generic schedulable entities:
80 */ 80 */
81 81
82static inline struct task_struct *task_of(struct sched_entity *se)
83{
84 return container_of(se, struct task_struct, se);
85}
86
87#ifdef CONFIG_FAIR_GROUP_SCHED 82#ifdef CONFIG_FAIR_GROUP_SCHED
88 83
89/* cpu runqueue to which this cfs_rq is attached */ 84/* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
95/* An entity is a task if it doesn't "own" a runqueue */ 90/* An entity is a task if it doesn't "own" a runqueue */
96#define entity_is_task(se) (!se->my_q) 91#define entity_is_task(se) (!se->my_q)
97 92
93static inline struct task_struct *task_of(struct sched_entity *se)
94{
95#ifdef CONFIG_SCHED_DEBUG
96 WARN_ON_ONCE(!entity_is_task(se));
97#endif
98 return container_of(se, struct task_struct, se);
99}
100
98/* Walk up scheduling entities hierarchy */ 101/* Walk up scheduling entities hierarchy */
99#define for_each_sched_entity(se) \ 102#define for_each_sched_entity(se) \
100 for (; se; se = se->parent) 103 for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
186 } 189 }
187} 190}
188 191
189#else /* CONFIG_FAIR_GROUP_SCHED */ 192#else /* !CONFIG_FAIR_GROUP_SCHED */
193
194static inline struct task_struct *task_of(struct sched_entity *se)
195{
196 return container_of(se, struct task_struct, se);
197}
190 198
191static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 199static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
192{ 200{
@@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
537 schedstat_set(se->wait_count, se->wait_count + 1); 545 schedstat_set(se->wait_count, se->wait_count + 1);
538 schedstat_set(se->wait_sum, se->wait_sum + 546 schedstat_set(se->wait_sum, se->wait_sum +
539 rq_of(cfs_rq)->clock - se->wait_start); 547 rq_of(cfs_rq)->clock - se->wait_start);
548#ifdef CONFIG_SCHEDSTATS
549 if (entity_is_task(se)) {
550 trace_sched_stat_wait(task_of(se),
551 rq_of(cfs_rq)->clock - se->wait_start);
552 }
553#endif
540 schedstat_set(se->wait_start, 0); 554 schedstat_set(se->wait_start, 0);
541} 555}
542 556
@@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
628 se->sleep_start = 0; 642 se->sleep_start = 0;
629 se->sum_sleep_runtime += delta; 643 se->sum_sleep_runtime += delta;
630 644
631 if (tsk) 645 if (tsk) {
632 account_scheduler_latency(tsk, delta >> 10, 1); 646 account_scheduler_latency(tsk, delta >> 10, 1);
647 trace_sched_stat_sleep(tsk, delta);
648 }
633 } 649 }
634 if (se->block_start) { 650 if (se->block_start) {
635 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 651 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
@@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
644 se->sum_sleep_runtime += delta; 660 se->sum_sleep_runtime += delta;
645 661
646 if (tsk) { 662 if (tsk) {
663 if (tsk->in_iowait) {
664 se->iowait_sum += delta;
665 se->iowait_count++;
666 trace_sched_stat_iowait(tsk, delta);
667 }
668
647 /* 669 /*
648 * Blocking time is in units of nanosecs, so shift by 670 * Blocking time is in units of nanosecs, so shift by
649 * 20 to get a milliseconds-range estimation of the 671 * 20 to get a milliseconds-range estimation of the
@@ -705,11 +727,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
705 727
706 vruntime -= thresh; 728 vruntime -= thresh;
707 } 729 }
708
709 /* ensure we never gain time by being placed backwards. */
710 vruntime = max_vruntime(se->vruntime, vruntime);
711 } 730 }
712 731
732 /* ensure we never gain time by being placed backwards. */
733 vruntime = max_vruntime(se->vruntime, vruntime);
734
713 se->vruntime = vruntime; 735 se->vruntime = vruntime;
714} 736}
715 737
@@ -1046,17 +1068,21 @@ static void yield_task_fair(struct rq *rq)
1046 * search starts with cpus closest then further out as needed, 1068 * search starts with cpus closest then further out as needed,
1047 * so we always favor a closer, idle cpu. 1069 * so we always favor a closer, idle cpu.
1048 * Domains may include CPUs that are not usable for migration, 1070 * Domains may include CPUs that are not usable for migration,
1049 * hence we need to mask them out (cpu_active_mask) 1071 * hence we need to mask them out (rq->rd->online)
1050 * 1072 *
1051 * Returns the CPU we should wake onto. 1073 * Returns the CPU we should wake onto.
1052 */ 1074 */
1053#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1075#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1076
1077#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
1078
1054static int wake_idle(int cpu, struct task_struct *p) 1079static int wake_idle(int cpu, struct task_struct *p)
1055{ 1080{
1056 struct sched_domain *sd; 1081 struct sched_domain *sd;
1057 int i; 1082 int i;
1058 unsigned int chosen_wakeup_cpu; 1083 unsigned int chosen_wakeup_cpu;
1059 int this_cpu; 1084 int this_cpu;
1085 struct rq *task_rq = task_rq(p);
1060 1086
1061 /* 1087 /*
1062 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu 1088 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
@@ -1089,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p)
1089 for_each_domain(cpu, sd) { 1115 for_each_domain(cpu, sd) {
1090 if ((sd->flags & SD_WAKE_IDLE) 1116 if ((sd->flags & SD_WAKE_IDLE)
1091 || ((sd->flags & SD_WAKE_IDLE_FAR) 1117 || ((sd->flags & SD_WAKE_IDLE_FAR)
1092 && !task_hot(p, task_rq(p)->clock, sd))) { 1118 && !task_hot(p, task_rq->clock, sd))) {
1093 for_each_cpu_and(i, sched_domain_span(sd), 1119 for_each_cpu_and(i, sched_domain_span(sd),
1094 &p->cpus_allowed) { 1120 &p->cpus_allowed) {
1095 if (cpu_active(i) && idle_cpu(i)) { 1121 if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
1096 if (i != task_cpu(p)) { 1122 if (i != task_cpu(p)) {
1097 schedstat_inc(p, 1123 schedstat_inc(p,
1098 se.nr_wakeups_idle); 1124 se.nr_wakeups_idle);
@@ -1235,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1235 tg = task_group(p); 1261 tg = task_group(p);
1236 weight = p->se.load.weight; 1262 weight = p->se.load.weight;
1237 1263
1238 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1264 /*
1265 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1266 * due to the sync cause above having dropped tl to 0, we'll always have
1267 * an imbalance, but there's really nothing you can do about that, so
1268 * that's good too.
1269 *
1270 * Otherwise check if either cpus are near enough in load to allow this
1271 * task to be woken on this_cpu.
1272 */
1273 balanced = !tl ||
1274 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
1239 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1275 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1240 1276
1241 /* 1277 /*
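
The rewritten condition above calls the wakeup "balanced" when either this_cpu's load tl has already dropped to zero, or this_cpu's load plus the woken task stays within the imbalance percentage of prev_cpu's load. A worked instance with made-up numbers, treating effective_load() as simply returning its weight-delta argument (roughly the case without group scheduling) and assuming an imbalance factor of 125, i.e. 25% slack:

#include <stdio.h>

int main(void)
{
        unsigned long weight = 1024;    /* nice-0 task weight */
        unsigned long tl = 1024;        /* this_cpu load after the sync adjustment */
        unsigned long load = 2048;      /* prev_cpu load: two nice-0 tasks */
        unsigned int imbalance = 125;   /* hypothetical 100 + 25% slack */

        int balanced = !tl ||
                100 * (tl + weight) <= imbalance * (load + 0);

        /* 100 * 2048 = 204800 <= 125 * 2048 = 256000, so the affine wakeup is allowed */
        printf("balanced = %d\n", balanced);
        return 0;
}
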
@@ -1278,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1278 this_rq = cpu_rq(this_cpu); 1314 this_rq = cpu_rq(this_cpu);
1279 new_cpu = prev_cpu; 1315 new_cpu = prev_cpu;
1280 1316
1281 if (prev_cpu == this_cpu)
1282 goto out;
1283 /* 1317 /*
1284 * 'this_sd' is the first domain that both 1318 * 'this_sd' is the first domain that both
1285 * this_cpu and prev_cpu are present in: 1319 * this_cpu and prev_cpu are present in:
@@ -1721,6 +1755,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1721 sched_info_queued(p); 1755 sched_info_queued(p);
1722 1756
1723 update_curr(cfs_rq); 1757 update_curr(cfs_rq);
1758 if (curr)
1759 se->vruntime = curr->vruntime;
1724 place_entity(cfs_rq, se, 1); 1760 place_entity(cfs_rq, se, 1);
1725 1761
1726 /* 'curr' will be NULL if the child belongs to a different group */ 1762 /* 'curr' will be NULL if the child belongs to a different group */
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 4569bfa7df9b..e2dc63a5815d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,4 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 2SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 3SCHED_FEAT(ADAPTIVE_GRAN, 1)
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 4SCHED_FEAT(WAKEUP_PREEMPT, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3918e01994e0..2eb4bd6a526c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,15 +3,18 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_RT_GROUP_SCHED
7
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9
6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7{ 11{
12#ifdef CONFIG_SCHED_DEBUG
13 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
14#endif
8 return container_of(rt_se, struct task_struct, rt); 15 return container_of(rt_se, struct task_struct, rt);
9} 16}
10 17
11#ifdef CONFIG_RT_GROUP_SCHED
12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
16{ 19{
17 return rt_rq->rq; 20 return rt_rq->rq;
@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
26 29
27#define rt_entity_is_task(rt_se) (1) 30#define rt_entity_is_task(rt_se) (1)
28 31
32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
33{
34 return container_of(rt_se, struct task_struct, rt);
35}
36
29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
30{ 38{
31 return container_of(rt_rq, struct rq, rt); 39 return container_of(rt_rq, struct rq, rt);
@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
128 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
129} 137}
130 138
139static inline int has_pushable_tasks(struct rq *rq)
140{
141 return !plist_head_empty(&rq->rt.pushable_tasks);
142}
143
131#else 144#else
132 145
133static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 146static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
602 curr->se.exec_start = rq->clock; 615 curr->se.exec_start = rq->clock;
603 cpuacct_charge(curr, delta_exec); 616 cpuacct_charge(curr, delta_exec);
604 617
618 sched_rt_avg_update(rq, delta_exec);
619
605 if (!rt_bandwidth_enabled()) 620 if (!rt_bandwidth_enabled())
606 return; 621 return;
607 622
@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
874 889
875 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
876 enqueue_pushable_task(rq, p); 891 enqueue_pushable_task(rq, p);
877
878 inc_cpu_load(rq, p->se.load.weight);
879} 892}
880 893
881static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 894static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
886 dequeue_rt_entity(rt_se); 899 dequeue_rt_entity(rt_se);
887 900
888 dequeue_pushable_task(rq, p); 901 dequeue_pushable_task(rq, p);
889
890 dec_cpu_load(rq, p->se.load.weight);
891} 902}
892 903
893/* 904/*
@@ -1064,6 +1075,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1064 if (p) 1075 if (p)
1065 dequeue_pushable_task(rq, p); 1076 dequeue_pushable_task(rq, p);
1066 1077
1078#ifdef CONFIG_SMP
1079 /*
1080 * We detect this state here so that we can avoid taking the RQ
1081 * lock again later if there is no need to push
1082 */
1083 rq->post_schedule = has_pushable_tasks(rq);
1084#endif
1085
1067 return p; 1086 return p;
1068} 1087}
1069 1088
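
The new rq->post_schedule assignment above caches, while the runqueue lock is already held in pick_next_task_rt(), whether any post-context-switch pushing will be needed, so the common case never re-takes the lock afterwards. A toy sketch of that "decide under the lock, act on the cached decision later" pattern, not the scheduler's actual locking:

#include <stdio.h>

struct runq {
        int post_schedule;      /* decision cached while the queue lock is held */
        int pushable;           /* stand-in for has_pushable_tasks() */
};

static void pick_next(struct runq *rq)
{
        /* the caller already holds the runqueue lock at this point */
        rq->post_schedule = rq->pushable;
}

static void finish_switch(struct runq *rq)
{
        if (!rq->post_schedule)
                return;                 /* common case: no extra lock round-trip */
        /* only now would the lock be re-taken to push tasks away */
        printf("pushing tasks\n");
        rq->post_schedule = 0;
}

int main(void)
{
        struct runq rq = { .post_schedule = 0, .pushable = 1 };

        pick_next(&rq);
        finish_switch(&rq);
        return 0;
}
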
@@ -1162,13 +1181,6 @@ static int find_lowest_rq(struct task_struct *task)
1162 return -1; /* No targets found */ 1181 return -1; /* No targets found */
1163 1182
1164 /* 1183 /*
1165 * Only consider CPUs that are usable for migration.
1166 * I guess we might want to change cpupri_find() to ignore those
1167 * in the first place.
1168 */
1169 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1170
1171 /*
1172 * At this point we have built a mask of cpus representing the 1184 * At this point we have built a mask of cpus representing the
1173 * lowest priority tasks in the system. Now we want to elect 1185 * lowest priority tasks in the system. Now we want to elect
1174 * the best one based on our affinity and topology. 1186 * the best one based on our affinity and topology.
@@ -1262,11 +1274,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1262 return lowest_rq; 1274 return lowest_rq;
1263} 1275}
1264 1276
1265static inline int has_pushable_tasks(struct rq *rq)
1266{
1267 return !plist_head_empty(&rq->rt.pushable_tasks);
1268}
1269
1270static struct task_struct *pick_next_pushable_task(struct rq *rq) 1277static struct task_struct *pick_next_pushable_task(struct rq *rq)
1271{ 1278{
1272 struct task_struct *p; 1279 struct task_struct *p;
@@ -1466,23 +1473,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1466 pull_rt_task(rq); 1473 pull_rt_task(rq);
1467} 1474}
1468 1475
1469/*
1470 * assumes rq->lock is held
1471 */
1472static int needs_post_schedule_rt(struct rq *rq)
1473{
1474 return has_pushable_tasks(rq);
1475}
1476
1477static void post_schedule_rt(struct rq *rq) 1476static void post_schedule_rt(struct rq *rq)
1478{ 1477{
1479 /*
1480 * This is only called if needs_post_schedule_rt() indicates that
1481 * we need to push tasks away
1482 */
1483 spin_lock_irq(&rq->lock);
1484 push_rt_tasks(rq); 1478 push_rt_tasks(rq);
1485 spin_unlock_irq(&rq->lock);
1486} 1479}
1487 1480
1488/* 1481/*
@@ -1758,7 +1751,6 @@ static const struct sched_class rt_sched_class = {
1758 .rq_online = rq_online_rt, 1751 .rq_online = rq_online_rt,
1759 .rq_offline = rq_offline_rt, 1752 .rq_offline = rq_offline_rt,
1760 .pre_schedule = pre_schedule_rt, 1753 .pre_schedule = pre_schedule_rt,
1761 .needs_post_schedule = needs_post_schedule_rt,
1762 .post_schedule = post_schedule_rt, 1754 .post_schedule = post_schedule_rt,
1763 .task_wake_up = task_wake_up_rt, 1755 .task_wake_up = task_wake_up_rt,
1764 .switched_from = switched_from_rt, 1756 .switched_from = switched_from_rt,
diff --git a/kernel/softirq.c b/kernel/softirq.c
index eb5e131a0485..7db25067cd2d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -227,7 +227,7 @@ restart:
227 preempt_count() = prev_count; 227 preempt_count() = prev_count;
228 } 228 }
229 229
230 rcu_bh_qsctr_inc(cpu); 230 rcu_bh_qs(cpu);
231 } 231 }
232 h++; 232 h++;
233 pending >>= 1; 233 pending >>= 1;
@@ -721,7 +721,7 @@ static int ksoftirqd(void * __bind_cpu)
721 preempt_enable_no_resched(); 721 preempt_enable_no_resched();
722 cond_resched(); 722 cond_resched();
723 preempt_disable(); 723 preempt_disable();
724 rcu_qsctr_inc((long)__bind_cpu); 724 rcu_sched_qs((long)__bind_cpu);
725 } 725 }
726 preempt_enable(); 726 preempt_enable();
727 set_current_state(TASK_INTERRUPTIBLE); 727 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 7932653c4ebd..5ddab730cb2f 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,44 +21,29 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
24int __lockfunc _spin_trylock(spinlock_t *lock) 25int __lockfunc _spin_trylock(spinlock_t *lock)
25{ 26{
26 preempt_disable(); 27 return __spin_trylock(lock);
27 if (_raw_spin_trylock(lock)) {
28 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
29 return 1;
30 }
31
32 preempt_enable();
33 return 0;
34} 28}
35EXPORT_SYMBOL(_spin_trylock); 29EXPORT_SYMBOL(_spin_trylock);
30#endif
36 31
32#ifndef _read_trylock
37int __lockfunc _read_trylock(rwlock_t *lock) 33int __lockfunc _read_trylock(rwlock_t *lock)
38{ 34{
39 preempt_disable(); 35 return __read_trylock(lock);
40 if (_raw_read_trylock(lock)) {
41 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
42 return 1;
43 }
44
45 preempt_enable();
46 return 0;
47} 36}
48EXPORT_SYMBOL(_read_trylock); 37EXPORT_SYMBOL(_read_trylock);
38#endif
49 39
40#ifndef _write_trylock
50int __lockfunc _write_trylock(rwlock_t *lock) 41int __lockfunc _write_trylock(rwlock_t *lock)
51{ 42{
52 preempt_disable(); 43 return __write_trylock(lock);
53 if (_raw_write_trylock(lock)) {
54 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
55 return 1;
56 }
57
58 preempt_enable();
59 return 0;
60} 44}
61EXPORT_SYMBOL(_write_trylock); 45EXPORT_SYMBOL(_write_trylock);
46#endif
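
Each out-of-line lock function is now a thin wrapper around a double-underscore helper and is only compiled when nothing has already defined the guard macro of the same name; the bodies removed above presumably move, essentially unchanged, into shared inline helpers in a spinlock API header so they can be inlined or overridden per architecture. A sketch of one such helper, reconstructed from the code removed above:

static inline int __write_trylock(rwlock_t *lock)
{
        preempt_disable();
        if (_raw_write_trylock(lock)) {
                rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
                return 1;
        }
        preempt_enable();
        return 0;
}

With that helper available, a configuration that wants the fast path inlined can define _write_trylock (and friends) itself, and the generic copy guarded by #ifndef above is simply skipped.
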
62 47
63/* 48/*
64 * If lockdep is enabled then we use the non-preemption spin-ops 49 * If lockdep is enabled then we use the non-preemption spin-ops
@@ -67,132 +52,101 @@ EXPORT_SYMBOL(_write_trylock);
67 */ 52 */
68#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 54
55#ifndef _read_lock
70void __lockfunc _read_lock(rwlock_t *lock) 56void __lockfunc _read_lock(rwlock_t *lock)
71{ 57{
72 preempt_disable(); 58 __read_lock(lock);
73 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
74 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
75} 59}
76EXPORT_SYMBOL(_read_lock); 60EXPORT_SYMBOL(_read_lock);
61#endif
77 62
63#ifndef _spin_lock_irqsave
78unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) 64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
79{ 65{
80 unsigned long flags; 66 return __spin_lock_irqsave(lock);
81
82 local_irq_save(flags);
83 preempt_disable();
84 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
85 /*
86 * On lockdep we dont want the hand-coded irq-enable of
87 * _raw_spin_lock_flags() code, because lockdep assumes
88 * that interrupts are not re-enabled during lock-acquire:
89 */
90#ifdef CONFIG_LOCKDEP
91 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
92#else
93 _raw_spin_lock_flags(lock, &flags);
94#endif
95 return flags;
96} 67}
97EXPORT_SYMBOL(_spin_lock_irqsave); 68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
98 70
71#ifndef _spin_lock_irq
99void __lockfunc _spin_lock_irq(spinlock_t *lock) 72void __lockfunc _spin_lock_irq(spinlock_t *lock)
100{ 73{
101 local_irq_disable(); 74 __spin_lock_irq(lock);
102 preempt_disable();
103 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
104 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
105} 75}
106EXPORT_SYMBOL(_spin_lock_irq); 76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
107 78
79#ifndef _spin_lock_bh
108void __lockfunc _spin_lock_bh(spinlock_t *lock) 80void __lockfunc _spin_lock_bh(spinlock_t *lock)
109{ 81{
110 local_bh_disable(); 82 __spin_lock_bh(lock);
111 preempt_disable();
112 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
113 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
114} 83}
115EXPORT_SYMBOL(_spin_lock_bh); 84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
116 86
87#ifndef _read_lock_irqsave
117unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
118{ 89{
119 unsigned long flags; 90 return __read_lock_irqsave(lock);
120
121 local_irq_save(flags);
122 preempt_disable();
123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
124 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
125 _raw_read_lock_flags, &flags);
126 return flags;
127} 91}
128EXPORT_SYMBOL(_read_lock_irqsave); 92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
129 94
95#ifndef _read_lock_irq
130void __lockfunc _read_lock_irq(rwlock_t *lock) 96void __lockfunc _read_lock_irq(rwlock_t *lock)
131{ 97{
132 local_irq_disable(); 98 __read_lock_irq(lock);
133 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
135 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
136} 99}
137EXPORT_SYMBOL(_read_lock_irq); 100EXPORT_SYMBOL(_read_lock_irq);
101#endif
138 102
103#ifndef _read_lock_bh
139void __lockfunc _read_lock_bh(rwlock_t *lock) 104void __lockfunc _read_lock_bh(rwlock_t *lock)
140{ 105{
141 local_bh_disable(); 106 __read_lock_bh(lock);
142 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
144 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
145} 107}
146EXPORT_SYMBOL(_read_lock_bh); 108EXPORT_SYMBOL(_read_lock_bh);
109#endif
147 110
111#ifndef _write_lock_irqsave
148unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) 112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
149{ 113{
150 unsigned long flags; 114 return __write_lock_irqsave(lock);
151
152 local_irq_save(flags);
153 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
155 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
156 _raw_write_lock_flags, &flags);
157 return flags;
158} 115}
159EXPORT_SYMBOL(_write_lock_irqsave); 116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
160 118
119#ifndef _write_lock_irq
161void __lockfunc _write_lock_irq(rwlock_t *lock) 120void __lockfunc _write_lock_irq(rwlock_t *lock)
162{ 121{
163 local_irq_disable(); 122 __write_lock_irq(lock);
164 preempt_disable();
165 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
166 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
167} 123}
168EXPORT_SYMBOL(_write_lock_irq); 124EXPORT_SYMBOL(_write_lock_irq);
125#endif
169 126
127#ifndef _write_lock_bh
170void __lockfunc _write_lock_bh(rwlock_t *lock) 128void __lockfunc _write_lock_bh(rwlock_t *lock)
171{ 129{
172 local_bh_disable(); 130 __write_lock_bh(lock);
173 preempt_disable();
174 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
175 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
176} 131}
177EXPORT_SYMBOL(_write_lock_bh); 132EXPORT_SYMBOL(_write_lock_bh);
133#endif
178 134
135#ifndef _spin_lock
179void __lockfunc _spin_lock(spinlock_t *lock) 136void __lockfunc _spin_lock(spinlock_t *lock)
180{ 137{
181 preempt_disable(); 138 __spin_lock(lock);
182 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
183 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
184} 139}
185
186EXPORT_SYMBOL(_spin_lock); 140EXPORT_SYMBOL(_spin_lock);
141#endif
187 142
143#ifndef _write_lock
188void __lockfunc _write_lock(rwlock_t *lock) 144void __lockfunc _write_lock(rwlock_t *lock)
189{ 145{
190 preempt_disable(); 146 __write_lock(lock);
191 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
192 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
193} 147}
194
195EXPORT_SYMBOL(_write_lock); 148EXPORT_SYMBOL(_write_lock);
149#endif
196 150
197#else /* CONFIG_PREEMPT: */ 151#else /* CONFIG_PREEMPT: */
198 152
@@ -318,125 +272,109 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
318 272
319#endif 273#endif
320 274
275#ifndef _spin_unlock
321void __lockfunc _spin_unlock(spinlock_t *lock) 276void __lockfunc _spin_unlock(spinlock_t *lock)
322{ 277{
323 spin_release(&lock->dep_map, 1, _RET_IP_); 278 __spin_unlock(lock);
324 _raw_spin_unlock(lock);
325 preempt_enable();
326} 279}
327EXPORT_SYMBOL(_spin_unlock); 280EXPORT_SYMBOL(_spin_unlock);
281#endif
328 282
283#ifndef _write_unlock
329void __lockfunc _write_unlock(rwlock_t *lock) 284void __lockfunc _write_unlock(rwlock_t *lock)
330{ 285{
331 rwlock_release(&lock->dep_map, 1, _RET_IP_); 286 __write_unlock(lock);
332 _raw_write_unlock(lock);
333 preempt_enable();
334} 287}
335EXPORT_SYMBOL(_write_unlock); 288EXPORT_SYMBOL(_write_unlock);
289#endif
336 290
291#ifndef _read_unlock
337void __lockfunc _read_unlock(rwlock_t *lock) 292void __lockfunc _read_unlock(rwlock_t *lock)
338{ 293{
339 rwlock_release(&lock->dep_map, 1, _RET_IP_); 294 __read_unlock(lock);
340 _raw_read_unlock(lock);
341 preempt_enable();
342} 295}
343EXPORT_SYMBOL(_read_unlock); 296EXPORT_SYMBOL(_read_unlock);
297#endif
344 298
299#ifndef _spin_unlock_irqrestore
345void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
346{ 301{
347 spin_release(&lock->dep_map, 1, _RET_IP_); 302 __spin_unlock_irqrestore(lock, flags);
348 _raw_spin_unlock(lock);
349 local_irq_restore(flags);
350 preempt_enable();
351} 303}
352EXPORT_SYMBOL(_spin_unlock_irqrestore); 304EXPORT_SYMBOL(_spin_unlock_irqrestore);
305#endif
353 306
307#ifndef _spin_unlock_irq
354void __lockfunc _spin_unlock_irq(spinlock_t *lock) 308void __lockfunc _spin_unlock_irq(spinlock_t *lock)
355{ 309{
356 spin_release(&lock->dep_map, 1, _RET_IP_); 310 __spin_unlock_irq(lock);
357 _raw_spin_unlock(lock);
358 local_irq_enable();
359 preempt_enable();
360} 311}
361EXPORT_SYMBOL(_spin_unlock_irq); 312EXPORT_SYMBOL(_spin_unlock_irq);
313#endif
362 314
315#ifndef _spin_unlock_bh
363void __lockfunc _spin_unlock_bh(spinlock_t *lock) 316void __lockfunc _spin_unlock_bh(spinlock_t *lock)
364{ 317{
365 spin_release(&lock->dep_map, 1, _RET_IP_); 318 __spin_unlock_bh(lock);
366 _raw_spin_unlock(lock);
367 preempt_enable_no_resched();
368 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
369} 319}
370EXPORT_SYMBOL(_spin_unlock_bh); 320EXPORT_SYMBOL(_spin_unlock_bh);
321#endif
371 322
323#ifndef _read_unlock_irqrestore
372void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
373{ 325{
374 rwlock_release(&lock->dep_map, 1, _RET_IP_); 326 __read_unlock_irqrestore(lock, flags);
375 _raw_read_unlock(lock);
376 local_irq_restore(flags);
377 preempt_enable();
378} 327}
379EXPORT_SYMBOL(_read_unlock_irqrestore); 328EXPORT_SYMBOL(_read_unlock_irqrestore);
329#endif
380 330
331#ifndef _read_unlock_irq
381void __lockfunc _read_unlock_irq(rwlock_t *lock) 332void __lockfunc _read_unlock_irq(rwlock_t *lock)
382{ 333{
383 rwlock_release(&lock->dep_map, 1, _RET_IP_); 334 __read_unlock_irq(lock);
384 _raw_read_unlock(lock);
385 local_irq_enable();
386 preempt_enable();
387} 335}
388EXPORT_SYMBOL(_read_unlock_irq); 336EXPORT_SYMBOL(_read_unlock_irq);
337#endif
389 338
339#ifndef _read_unlock_bh
390void __lockfunc _read_unlock_bh(rwlock_t *lock) 340void __lockfunc _read_unlock_bh(rwlock_t *lock)
391{ 341{
392 rwlock_release(&lock->dep_map, 1, _RET_IP_); 342 __read_unlock_bh(lock);
393 _raw_read_unlock(lock);
394 preempt_enable_no_resched();
395 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
396} 343}
397EXPORT_SYMBOL(_read_unlock_bh); 344EXPORT_SYMBOL(_read_unlock_bh);
345#endif
398 346
347#ifndef _write_unlock_irqrestore
399void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
400{ 349{
401 rwlock_release(&lock->dep_map, 1, _RET_IP_); 350 __write_unlock_irqrestore(lock, flags);
402 _raw_write_unlock(lock);
403 local_irq_restore(flags);
404 preempt_enable();
405} 351}
406EXPORT_SYMBOL(_write_unlock_irqrestore); 352EXPORT_SYMBOL(_write_unlock_irqrestore);
353#endif
407 354
355#ifndef _write_unlock_irq
408void __lockfunc _write_unlock_irq(rwlock_t *lock) 356void __lockfunc _write_unlock_irq(rwlock_t *lock)
409{ 357{
410 rwlock_release(&lock->dep_map, 1, _RET_IP_); 358 __write_unlock_irq(lock);
411 _raw_write_unlock(lock);
412 local_irq_enable();
413 preempt_enable();
414} 359}
415EXPORT_SYMBOL(_write_unlock_irq); 360EXPORT_SYMBOL(_write_unlock_irq);
361#endif
416 362
363#ifndef _write_unlock_bh
417void __lockfunc _write_unlock_bh(rwlock_t *lock) 364void __lockfunc _write_unlock_bh(rwlock_t *lock)
418{ 365{
419 rwlock_release(&lock->dep_map, 1, _RET_IP_); 366 __write_unlock_bh(lock);
420 _raw_write_unlock(lock);
421 preempt_enable_no_resched();
422 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
423} 367}
424EXPORT_SYMBOL(_write_unlock_bh); 368EXPORT_SYMBOL(_write_unlock_bh);
369#endif
425 370
371#ifndef _spin_trylock_bh
426int __lockfunc _spin_trylock_bh(spinlock_t *lock) 372int __lockfunc _spin_trylock_bh(spinlock_t *lock)
427{ 373{
428 local_bh_disable(); 374 return __spin_trylock_bh(lock);
429 preempt_disable();
430 if (_raw_spin_trylock(lock)) {
431 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
432 return 1;
433 }
434
435 preempt_enable_no_resched();
436 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
437 return 0;
438} 375}
439EXPORT_SYMBOL(_spin_trylock_bh); 376EXPORT_SYMBOL(_spin_trylock_bh);
377#endif
440 378
441notrace int in_lock_functions(unsigned long addr) 379notrace int in_lock_functions(unsigned long addr)
442{ 380{
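The kernel/spinlock.c hunks above all follow one pattern: each out-of-line lock function becomes a thin wrapper around an always-inlined double-underscore helper, and the definition is guarded by #ifndef so that a configuration which pre-defines the symbol can drop the out-of-line copy. Below is a minimal self-contained sketch of that override mechanism; demo_lock()/__demo_lock() are illustrative stand-ins, and where the kernel's __-helpers actually live is not shown in this hunk.

#include <stdio.h>

/* always-inline fast path; in the real code this sits in a shared header */
static inline void __demo_lock(int *lock)
{
        *lock = 1;              /* stand-in for the real acquire sequence */
}

/*
 * Out-of-line default.  A build that wants every call site inlined can
 * "#define demo_lock(l) __demo_lock(l)" before this point, and the
 * guarded definition below is then never compiled.
 */
#ifndef demo_lock
void demo_lock(int *lock)
{
        __demo_lock(lock);
}
#endif

int main(void)
{
        int lock = 0;

        demo_lock(&lock);
        printf("lock taken: %d\n", lock);
        return 0;
}
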
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 71d8dc7f9920..3125cff1c570 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -245,6 +245,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
245#endif 245#endif
246 246
247static struct ctl_table kern_table[] = { 247static struct ctl_table kern_table[] = {
248 {
249 .ctl_name = CTL_UNNUMBERED,
250 .procname = "sched_child_runs_first",
251 .data = &sysctl_sched_child_runs_first,
252 .maxlen = sizeof(unsigned int),
253 .mode = 0644,
254 .proc_handler = &proc_dointvec,
255 },
248#ifdef CONFIG_SCHED_DEBUG 256#ifdef CONFIG_SCHED_DEBUG
249 { 257 {
250 .ctl_name = CTL_UNNUMBERED, 258 .ctl_name = CTL_UNNUMBERED,
@@ -299,14 +307,6 @@ static struct ctl_table kern_table[] = {
299 }, 307 },
300 { 308 {
301 .ctl_name = CTL_UNNUMBERED, 309 .ctl_name = CTL_UNNUMBERED,
302 .procname = "sched_child_runs_first",
303 .data = &sysctl_sched_child_runs_first,
304 .maxlen = sizeof(unsigned int),
305 .mode = 0644,
306 .proc_handler = &proc_dointvec,
307 },
308 {
309 .ctl_name = CTL_UNNUMBERED,
310 .procname = "sched_features", 310 .procname = "sched_features",
311 .data = &sysctl_sched_features, 311 .data = &sysctl_sched_features,
312 .maxlen = sizeof(unsigned int), 312 .maxlen = sizeof(unsigned int),
@@ -331,6 +331,14 @@ static struct ctl_table kern_table[] = {
331 }, 331 },
332 { 332 {
333 .ctl_name = CTL_UNNUMBERED, 333 .ctl_name = CTL_UNNUMBERED,
334 .procname = "sched_time_avg",
335 .data = &sysctl_sched_time_avg,
336 .maxlen = sizeof(unsigned int),
337 .mode = 0644,
338 .proc_handler = &proc_dointvec,
339 },
340 {
341 .ctl_name = CTL_UNNUMBERED,
334 .procname = "timer_migration", 342 .procname = "timer_migration",
335 .data = &sysctl_timer_migration, 343 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int), 344 .maxlen = sizeof(unsigned int),
diff --git a/kernel/timer.c b/kernel/timer.c
index a7f07d5a6241..a3d25f415019 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1156,8 +1156,7 @@ void update_process_times(int user_tick)
1156 /* Note: this timer irq context must be accounted for as well. */ 1156 /* Note: this timer irq context must be accounted for as well. */
1157 account_process_tick(p, user_tick); 1157 account_process_tick(p, user_tick);
1158 run_local_timers(); 1158 run_local_timers();
1159 if (rcu_pending(cpu)) 1159 rcu_check_callbacks(cpu, user_tick);
1160 rcu_check_callbacks(cpu, user_tick);
1161 printk_tick(); 1160 printk_tick();
1162 scheduler_tick(); 1161 scheduler_tick();
1163 run_posix_cpu_timers(p); 1162 run_posix_cpu_timers(p);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 019f380fd764..1ea0d1234f4a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -41,7 +41,7 @@ config HAVE_FTRACE_MCOUNT_RECORD
41config HAVE_HW_BRANCH_TRACER 41config HAVE_HW_BRANCH_TRACER
42 bool 42 bool
43 43
44config HAVE_FTRACE_SYSCALLS 44config HAVE_SYSCALL_TRACEPOINTS
45 bool 45 bool
46 46
47config TRACER_MAX_TRACE 47config TRACER_MAX_TRACE
@@ -60,9 +60,14 @@ config EVENT_TRACING
60 bool 60 bool
61 61
62config CONTEXT_SWITCH_TRACER 62config CONTEXT_SWITCH_TRACER
63 select MARKERS
64 bool 63 bool
65 64
65config RING_BUFFER_ALLOW_SWAP
66 bool
67 help
68 Allow the use of ring_buffer_swap_cpu.
69 Adds a very slight overhead to tracing when enabled.
70
66# All tracer options should select GENERIC_TRACER. For those options that are 71# All tracer options should select GENERIC_TRACER. For those options that are
67# enabled by all tracers (context switch and event tracer) they select TRACING. 72# enabled by all tracers (context switch and event tracer) they select TRACING.
68# This allows those options to appear when no other tracer is selected. But the 73# This allows those options to appear when no other tracer is selected. But the
@@ -147,6 +152,7 @@ config IRQSOFF_TRACER
147 select TRACE_IRQFLAGS 152 select TRACE_IRQFLAGS
148 select GENERIC_TRACER 153 select GENERIC_TRACER
149 select TRACER_MAX_TRACE 154 select TRACER_MAX_TRACE
155 select RING_BUFFER_ALLOW_SWAP
150 help 156 help
151 This option measures the time spent in irqs-off critical 157 This option measures the time spent in irqs-off critical
152 sections, with microsecond accuracy. 158 sections, with microsecond accuracy.
@@ -168,6 +174,7 @@ config PREEMPT_TRACER
168 depends on PREEMPT 174 depends on PREEMPT
169 select GENERIC_TRACER 175 select GENERIC_TRACER
170 select TRACER_MAX_TRACE 176 select TRACER_MAX_TRACE
177 select RING_BUFFER_ALLOW_SWAP
171 help 178 help
172 This option measures the time spent in preemption off critical 179 This option measures the time spent in preemption off critical
173 sections, with microsecond accuracy. 180 sections, with microsecond accuracy.
@@ -211,7 +218,7 @@ config ENABLE_DEFAULT_TRACERS
211 218
212config FTRACE_SYSCALLS 219config FTRACE_SYSCALLS
213 bool "Trace syscalls" 220 bool "Trace syscalls"
214 depends on HAVE_FTRACE_SYSCALLS 221 depends on HAVE_SYSCALL_TRACEPOINTS
215 select GENERIC_TRACER 222 select GENERIC_TRACER
216 select KALLSYMS 223 select KALLSYMS
217 help 224 help
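The new RING_BUFFER_ALLOW_SWAP option is selected only by the two latency tracers that need ring_buffer_swap_cpu(), which suggests the swap path (and its slight overhead) is compiled out whenever nothing selects it; how ring_buffer.c actually tests the symbol is not part of this diff, so treat the guard below as an assumption. A compilable userspace sketch of the same compile-out idea, with CONFIG_DEMO_ALLOW_SWAP standing in for the real option (build with -DCONFIG_DEMO_ALLOW_SWAP to include the swap path):

#include <stdio.h>

struct demo_buffer { int per_cpu[2]; };

#ifdef CONFIG_DEMO_ALLOW_SWAP
/* only built when some "tracer" selected the option */
static void demo_swap_cpu(struct demo_buffer *a, struct demo_buffer *b, int cpu)
{
        int tmp = a->per_cpu[cpu];

        a->per_cpu[cpu] = b->per_cpu[cpu];
        b->per_cpu[cpu] = tmp;
}
#endif

int main(void)
{
        struct demo_buffer a = { { 1, 2 } }, b = { { 3, 4 } };

#ifdef CONFIG_DEMO_ALLOW_SWAP
        demo_swap_cpu(&a, &b, 0);
#endif
        printf("a[0]=%d b[0]=%d\n", a.per_cpu[0], b.per_cpu[0]);
        return 0;
}
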
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7a34cb563fec..3eb159c277c8 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -65,13 +65,15 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
65{ 65{
66 struct blk_io_trace *t; 66 struct blk_io_trace *t;
67 struct ring_buffer_event *event = NULL; 67 struct ring_buffer_event *event = NULL;
68 struct ring_buffer *buffer = NULL;
68 int pc = 0; 69 int pc = 0;
69 int cpu = smp_processor_id(); 70 int cpu = smp_processor_id();
70 bool blk_tracer = blk_tracer_enabled; 71 bool blk_tracer = blk_tracer_enabled;
71 72
72 if (blk_tracer) { 73 if (blk_tracer) {
74 buffer = blk_tr->buffer;
73 pc = preempt_count(); 75 pc = preempt_count();
74 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 76 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
75 sizeof(*t) + len, 77 sizeof(*t) + len,
76 0, pc); 78 0, pc);
77 if (!event) 79 if (!event)
@@ -96,7 +98,7 @@ record_it:
96 memcpy((void *) t + sizeof(*t), data, len); 98 memcpy((void *) t + sizeof(*t), data, len);
97 99
98 if (blk_tracer) 100 if (blk_tracer)
99 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 101 trace_buffer_unlock_commit(buffer, event, 0, pc);
100 } 102 }
101} 103}
102 104
@@ -179,6 +181,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
179{ 181{
180 struct task_struct *tsk = current; 182 struct task_struct *tsk = current;
181 struct ring_buffer_event *event = NULL; 183 struct ring_buffer_event *event = NULL;
184 struct ring_buffer *buffer = NULL;
182 struct blk_io_trace *t; 185 struct blk_io_trace *t;
183 unsigned long flags = 0; 186 unsigned long flags = 0;
184 unsigned long *sequence; 187 unsigned long *sequence;
@@ -204,8 +207,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
204 if (blk_tracer) { 207 if (blk_tracer) {
205 tracing_record_cmdline(current); 208 tracing_record_cmdline(current);
206 209
210 buffer = blk_tr->buffer;
207 pc = preempt_count(); 211 pc = preempt_count();
208 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 212 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
209 sizeof(*t) + pdu_len, 213 sizeof(*t) + pdu_len,
210 0, pc); 214 0, pc);
211 if (!event) 215 if (!event)
@@ -252,7 +256,7 @@ record_it:
252 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); 256 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
253 257
254 if (blk_tracer) { 258 if (blk_tracer) {
255 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 259 trace_buffer_unlock_commit(buffer, event, 0, pc);
256 return; 260 return;
257 } 261 }
258 } 262 }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 25edd5cc5935..8c804e24f96f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1016,71 +1016,35 @@ static int
1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1017{ 1017{
1018 unsigned long ftrace_addr; 1018 unsigned long ftrace_addr;
1019 unsigned long ip, fl; 1019 unsigned long flag = 0UL;
1020 1020
1021 ftrace_addr = (unsigned long)FTRACE_ADDR; 1021 ftrace_addr = (unsigned long)FTRACE_ADDR;
1022 1022
1023 ip = rec->ip;
1024
1025 /* 1023 /*
1026 * If this record is not to be traced and 1024 * If this record is not to be traced or we want to disable it,
1027 * it is not enabled then do nothing. 1025 * then disable it.
1028 * 1026 *
1029 * If this record is not to be traced and 1027 * If we want to enable it and filtering is off, then enable it.
1030 * it is enabled then disable it.
1031 * 1028 *
1029 * If we want to enable it and filtering is on, enable it only if
1030 * it's filtered
1032 */ 1031 */
1033 if (rec->flags & FTRACE_FL_NOTRACE) { 1032 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
1034 if (rec->flags & FTRACE_FL_ENABLED) 1033 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
1035 rec->flags &= ~FTRACE_FL_ENABLED; 1034 flag = FTRACE_FL_ENABLED;
1036 else 1035 }
1037 return 0;
1038
1039 } else if (ftrace_filtered && enable) {
1040 /*
1041 * Filtering is on:
1042 */
1043
1044 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
1045
1046 /* Record is filtered and enabled, do nothing */
1047 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
1048 return 0;
1049
1050 /* Record is not filtered or enabled, do nothing */
1051 if (!fl)
1052 return 0;
1053
1054 /* Record is not filtered but enabled, disable it */
1055 if (fl == FTRACE_FL_ENABLED)
1056 rec->flags &= ~FTRACE_FL_ENABLED;
1057 else
1058 /* Otherwise record is filtered but not enabled, enable it */
1059 rec->flags |= FTRACE_FL_ENABLED;
1060 } else {
1061 /* Disable or not filtered */
1062
1063 if (enable) {
1064 /* if record is enabled, do nothing */
1065 if (rec->flags & FTRACE_FL_ENABLED)
1066 return 0;
1067
1068 rec->flags |= FTRACE_FL_ENABLED;
1069
1070 } else {
1071 1036
1072 /* if record is not enabled, do nothing */ 1037 /* If the state of this record hasn't changed, then do nothing */
1073 if (!(rec->flags & FTRACE_FL_ENABLED)) 1038 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1074 return 0; 1039 return 0;
1075 1040
1076 rec->flags &= ~FTRACE_FL_ENABLED; 1041 if (flag) {
1077 } 1042 rec->flags |= FTRACE_FL_ENABLED;
1043 return ftrace_make_call(rec, ftrace_addr);
1078 } 1044 }
1079 1045
1080 if (rec->flags & FTRACE_FL_ENABLED) 1046 rec->flags &= ~FTRACE_FL_ENABLED;
1081 return ftrace_make_call(rec, ftrace_addr); 1047 return ftrace_make_nop(NULL, rec, ftrace_addr);
1082 else
1083 return ftrace_make_nop(NULL, rec, ftrace_addr);
1084} 1048}
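The rewritten __ftrace_replace_code() above boils the old case analysis down to two steps: compute what FTRACE_FL_ENABLED should be for this record, and only patch the call site if that differs from the current state. A small standalone sketch of the decision; the flag values and the want_enabled() helper are invented here purely for illustration:

#include <stdio.h>

#define FL_FILTER       0x1     /* record matches set_ftrace_filter   */
#define FL_ENABLED      0x2     /* call site is currently patched in  */
#define FL_NOTRACE      0x4     /* record matches set_ftrace_notrace  */

/* desired FL_ENABLED value, mirroring the logic in the hunk above */
static unsigned int want_enabled(unsigned int flags, int enable, int filtered)
{
        if (enable && !(flags & FL_NOTRACE)) {
                if (!filtered || (flags & FL_FILTER))
                        return FL_ENABLED;
        }
        return 0;
}

int main(void)
{
        unsigned int cases[] = { 0, FL_FILTER, FL_NOTRACE, FL_ENABLED,
                                 FL_FILTER | FL_ENABLED };
        unsigned int i;

        /* enable=1 with filtering on: only filtered records get enabled */
        for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
                unsigned int flags = cases[i];
                unsigned int flag = want_enabled(flags, 1, 1);

                printf("flags=%#x -> want=%#x, change needed: %s\n", flags,
                       flag, (flags & FL_ENABLED) == flag ? "no" : "yes");
        }
        return 0;
}
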
1085 1049
1086static void ftrace_replace_code(int enable) 1050static void ftrace_replace_code(int enable)
@@ -1375,7 +1339,6 @@ struct ftrace_iterator {
1375 unsigned flags; 1339 unsigned flags;
1376 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1340 unsigned char buffer[FTRACE_BUFF_MAX+1];
1377 unsigned buffer_idx; 1341 unsigned buffer_idx;
1378 unsigned filtered;
1379}; 1342};
1380 1343
1381static void * 1344static void *
@@ -1438,18 +1401,13 @@ static int t_hash_show(struct seq_file *m, void *v)
1438{ 1401{
1439 struct ftrace_func_probe *rec; 1402 struct ftrace_func_probe *rec;
1440 struct hlist_node *hnd = v; 1403 struct hlist_node *hnd = v;
1441 char str[KSYM_SYMBOL_LEN];
1442 1404
1443 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1405 rec = hlist_entry(hnd, struct ftrace_func_probe, node);
1444 1406
1445 if (rec->ops->print) 1407 if (rec->ops->print)
1446 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1408 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1447 1409
1448 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1410 seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func);
1449 seq_printf(m, "%s:", str);
1450
1451 kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
1452 seq_printf(m, "%s", str);
1453 1411
1454 if (rec->data) 1412 if (rec->data)
1455 seq_printf(m, ":%p", rec->data); 1413 seq_printf(m, ":%p", rec->data);
@@ -1547,7 +1505,6 @@ static int t_show(struct seq_file *m, void *v)
1547{ 1505{
1548 struct ftrace_iterator *iter = m->private; 1506 struct ftrace_iterator *iter = m->private;
1549 struct dyn_ftrace *rec = v; 1507 struct dyn_ftrace *rec = v;
1550 char str[KSYM_SYMBOL_LEN];
1551 1508
1552 if (iter->flags & FTRACE_ITER_HASH) 1509 if (iter->flags & FTRACE_ITER_HASH)
1553 return t_hash_show(m, v); 1510 return t_hash_show(m, v);
@@ -1560,9 +1517,7 @@ static int t_show(struct seq_file *m, void *v)
1560 if (!rec) 1517 if (!rec)
1561 return 0; 1518 return 0;
1562 1519
1563 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1520 seq_printf(m, "%pf\n", (void *)rec->ip);
1564
1565 seq_printf(m, "%s\n", str);
1566 1521
1567 return 0; 1522 return 0;
1568} 1523}
@@ -1601,17 +1556,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1601 return ret; 1556 return ret;
1602} 1557}
1603 1558
1604int ftrace_avail_release(struct inode *inode, struct file *file)
1605{
1606 struct seq_file *m = (struct seq_file *)file->private_data;
1607 struct ftrace_iterator *iter = m->private;
1608
1609 seq_release(inode, file);
1610 kfree(iter);
1611
1612 return 0;
1613}
1614
1615static int 1559static int
1616ftrace_failures_open(struct inode *inode, struct file *file) 1560ftrace_failures_open(struct inode *inode, struct file *file)
1617{ 1561{
@@ -2317,7 +2261,6 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2317 } 2261 }
2318 2262
2319 if (isspace(ch)) { 2263 if (isspace(ch)) {
2320 iter->filtered++;
2321 iter->buffer[iter->buffer_idx] = 0; 2264 iter->buffer[iter->buffer_idx] = 0;
2322 ret = ftrace_process_regex(iter->buffer, 2265 ret = ftrace_process_regex(iter->buffer,
2323 iter->buffer_idx, enable); 2266 iter->buffer_idx, enable);
@@ -2448,7 +2391,6 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2448 iter = file->private_data; 2391 iter = file->private_data;
2449 2392
2450 if (iter->buffer_idx) { 2393 if (iter->buffer_idx) {
2451 iter->filtered++;
2452 iter->buffer[iter->buffer_idx] = 0; 2394 iter->buffer[iter->buffer_idx] = 0;
2453 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2395 ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
2454 } 2396 }
@@ -2479,14 +2421,14 @@ static const struct file_operations ftrace_avail_fops = {
2479 .open = ftrace_avail_open, 2421 .open = ftrace_avail_open,
2480 .read = seq_read, 2422 .read = seq_read,
2481 .llseek = seq_lseek, 2423 .llseek = seq_lseek,
2482 .release = ftrace_avail_release, 2424 .release = seq_release_private,
2483}; 2425};
2484 2426
2485static const struct file_operations ftrace_failures_fops = { 2427static const struct file_operations ftrace_failures_fops = {
2486 .open = ftrace_failures_open, 2428 .open = ftrace_failures_open,
2487 .read = seq_read, 2429 .read = seq_read,
2488 .llseek = seq_lseek, 2430 .llseek = seq_lseek,
2489 .release = ftrace_avail_release, 2431 .release = seq_release_private,
2490}; 2432};
2491 2433
2492static const struct file_operations ftrace_filter_fops = { 2434static const struct file_operations ftrace_filter_fops = {
@@ -2548,7 +2490,6 @@ static void g_stop(struct seq_file *m, void *p)
2548static int g_show(struct seq_file *m, void *v) 2490static int g_show(struct seq_file *m, void *v)
2549{ 2491{
2550 unsigned long *ptr = v; 2492 unsigned long *ptr = v;
2551 char str[KSYM_SYMBOL_LEN];
2552 2493
2553 if (!ptr) 2494 if (!ptr)
2554 return 0; 2495 return 0;
@@ -2558,9 +2499,7 @@ static int g_show(struct seq_file *m, void *v)
2558 return 0; 2499 return 0;
2559 } 2500 }
2560 2501
2561 kallsyms_lookup(*ptr, NULL, NULL, NULL, str); 2502 seq_printf(m, "%pf\n", v);
2562
2563 seq_printf(m, "%s\n", str);
2564 2503
2565 return 0; 2504 return 0;
2566} 2505}
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 1edaa9516e81..81b1645c8549 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -183,11 +183,9 @@ static void kmemtrace_stop_probes(void)
183 183
184static int kmem_trace_init(struct trace_array *tr) 184static int kmem_trace_init(struct trace_array *tr)
185{ 185{
186 int cpu;
187 kmemtrace_array = tr; 186 kmemtrace_array = tr;
188 187
189 for_each_cpu(cpu, cpu_possible_mask) 188 tracing_reset_online_cpus(tr);
190 tracing_reset(tr, cpu);
191 189
192 kmemtrace_start_probes(); 190 kmemtrace_start_probes();
193 191
@@ -239,12 +237,52 @@ struct kmemtrace_user_event_alloc {
239}; 237};
240 238
241static enum print_line_t 239static enum print_line_t
242kmemtrace_print_alloc_user(struct trace_iterator *iter, 240kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
243 struct kmemtrace_alloc_entry *entry)
244{ 241{
245 struct kmemtrace_user_event_alloc *ev_alloc;
246 struct trace_seq *s = &iter->seq; 242 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry;
244 int ret;
245
246 trace_assign_type(entry, iter->ent);
247
248 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
249 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
250 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
251 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
252 (unsigned long)entry->gfp_flags, entry->node);
253
254 if (!ret)
255 return TRACE_TYPE_PARTIAL_LINE;
256 return TRACE_TYPE_HANDLED;
257}
258
259static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags)
261{
262 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry;
264 int ret;
265
266 trace_assign_type(entry, iter->ent);
267
268 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
269 entry->type_id, (void *)entry->call_site,
270 (unsigned long)entry->ptr);
271
272 if (!ret)
273 return TRACE_TYPE_PARTIAL_LINE;
274 return TRACE_TYPE_HANDLED;
275}
276
277static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
279{
280 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry;
247 struct kmemtrace_user_event *ev; 282 struct kmemtrace_user_event *ev;
283 struct kmemtrace_user_event_alloc *ev_alloc;
284
285 trace_assign_type(entry, iter->ent);
248 286
249 ev = trace_seq_reserve(s, sizeof(*ev)); 287 ev = trace_seq_reserve(s, sizeof(*ev));
250 if (!ev) 288 if (!ev)
@@ -271,12 +309,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter,
271} 309}
272 310
273static enum print_line_t 311static enum print_line_t
274kmemtrace_print_free_user(struct trace_iterator *iter, 312kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
275 struct kmemtrace_free_entry *entry)
276{ 313{
277 struct trace_seq *s = &iter->seq; 314 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry;
278 struct kmemtrace_user_event *ev; 316 struct kmemtrace_user_event *ev;
279 317
318 trace_assign_type(entry, iter->ent);
319
280 ev = trace_seq_reserve(s, sizeof(*ev)); 320 ev = trace_seq_reserve(s, sizeof(*ev));
281 if (!ev) 321 if (!ev)
282 return TRACE_TYPE_PARTIAL_LINE; 322 return TRACE_TYPE_PARTIAL_LINE;
@@ -294,12 +334,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
294 334
295/* The two other following provide a more minimalistic output */ 335/* The two other following provide a more minimalistic output */
296static enum print_line_t 336static enum print_line_t
297kmemtrace_print_alloc_compress(struct trace_iterator *iter, 337kmemtrace_print_alloc_compress(struct trace_iterator *iter)
298 struct kmemtrace_alloc_entry *entry)
299{ 338{
339 struct kmemtrace_alloc_entry *entry;
300 struct trace_seq *s = &iter->seq; 340 struct trace_seq *s = &iter->seq;
301 int ret; 341 int ret;
302 342
343 trace_assign_type(entry, iter->ent);
344
303 /* Alloc entry */ 345 /* Alloc entry */
304 ret = trace_seq_printf(s, " + "); 346 ret = trace_seq_printf(s, " + ");
305 if (!ret) 347 if (!ret)
@@ -345,29 +387,24 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
345 if (!ret) 387 if (!ret)
346 return TRACE_TYPE_PARTIAL_LINE; 388 return TRACE_TYPE_PARTIAL_LINE;
347 389
348 /* Node */ 390 /* Node and call site */
349 ret = trace_seq_printf(s, "%4d ", entry->node); 391 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
350 if (!ret) 392 (void *)entry->call_site);
351 return TRACE_TYPE_PARTIAL_LINE;
352
353 /* Call site */
354 ret = seq_print_ip_sym(s, entry->call_site, 0);
355 if (!ret) 393 if (!ret)
356 return TRACE_TYPE_PARTIAL_LINE; 394 return TRACE_TYPE_PARTIAL_LINE;
357 395
358 if (!trace_seq_printf(s, "\n"))
359 return TRACE_TYPE_PARTIAL_LINE;
360
361 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
362} 397}
363 398
364static enum print_line_t 399static enum print_line_t
365kmemtrace_print_free_compress(struct trace_iterator *iter, 400kmemtrace_print_free_compress(struct trace_iterator *iter)
366 struct kmemtrace_free_entry *entry)
367{ 401{
402 struct kmemtrace_free_entry *entry;
368 struct trace_seq *s = &iter->seq; 403 struct trace_seq *s = &iter->seq;
369 int ret; 404 int ret;
370 405
406 trace_assign_type(entry, iter->ent);
407
371 /* Free entry */ 408 /* Free entry */
372 ret = trace_seq_printf(s, " - "); 409 ret = trace_seq_printf(s, " - ");
373 if (!ret) 410 if (!ret)
@@ -401,19 +438,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter,
401 if (!ret) 438 if (!ret)
402 return TRACE_TYPE_PARTIAL_LINE; 439 return TRACE_TYPE_PARTIAL_LINE;
403 440
404 /* Skip node */ 441 /* Skip node and print call site */
405 ret = trace_seq_printf(s, " "); 442 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
406 if (!ret) 443 if (!ret)
407 return TRACE_TYPE_PARTIAL_LINE; 444 return TRACE_TYPE_PARTIAL_LINE;
408 445
409 /* Call site */
410 ret = seq_print_ip_sym(s, entry->call_site, 0);
411 if (!ret)
412 return TRACE_TYPE_PARTIAL_LINE;
413
414 if (!trace_seq_printf(s, "\n"))
415 return TRACE_TYPE_PARTIAL_LINE;
416
417 return TRACE_TYPE_HANDLED; 446 return TRACE_TYPE_HANDLED;
418} 447}
419 448
@@ -421,32 +450,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
421{ 450{
422 struct trace_entry *entry = iter->ent; 451 struct trace_entry *entry = iter->ent;
423 452
424 switch (entry->type) { 453 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
425 case TRACE_KMEM_ALLOC: { 454 return TRACE_TYPE_UNHANDLED;
426 struct kmemtrace_alloc_entry *field;
427
428 trace_assign_type(field, entry);
429 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
430 return kmemtrace_print_alloc_compress(iter, field);
431 else
432 return kmemtrace_print_alloc_user(iter, field);
433 }
434
435 case TRACE_KMEM_FREE: {
436 struct kmemtrace_free_entry *field;
437
438 trace_assign_type(field, entry);
439 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
440 return kmemtrace_print_free_compress(iter, field);
441 else
442 return kmemtrace_print_free_user(iter, field);
443 }
444 455
456 switch (entry->type) {
457 case TRACE_KMEM_ALLOC:
458 return kmemtrace_print_alloc_compress(iter);
459 case TRACE_KMEM_FREE:
460 return kmemtrace_print_free_compress(iter);
445 default: 461 default:
446 return TRACE_TYPE_UNHANDLED; 462 return TRACE_TYPE_UNHANDLED;
447 } 463 }
448} 464}
449 465
466static struct trace_event kmem_trace_alloc = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user,
470};
471
472static struct trace_event kmem_trace_free = {
473 .type = TRACE_KMEM_FREE,
474 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user,
476};
477
450static struct tracer kmem_tracer __read_mostly = { 478static struct tracer kmem_tracer __read_mostly = {
451 .name = "kmemtrace", 479 .name = "kmemtrace",
452 .init = kmem_trace_init, 480 .init = kmem_trace_init,
@@ -463,6 +491,21 @@ void kmemtrace_init(void)
463 491
464static int __init init_kmem_tracer(void) 492static int __init init_kmem_tracer(void)
465{ 493{
466 return register_tracer(&kmem_tracer); 494 if (!register_ftrace_event(&kmem_trace_alloc)) {
495 pr_warning("Warning: could not register kmem events\n");
496 return 1;
497 }
498
499 if (!register_ftrace_event(&kmem_trace_free)) {
500 pr_warning("Warning: could not register kmem events\n");
501 return 1;
502 }
503
504 if (!register_tracer(&kmem_tracer)) {
505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1;
507 }
508
509 return 0;
467} 510}
468device_initcall(init_kmem_tracer); 511device_initcall(init_kmem_tracer);
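Registering kmem_trace_alloc and kmem_trace_free hands per-event formatting to the generic trace-event lookup instead of the switch that used to live in the tracer's print_line callback (the lookup itself is not shown in this diff). A small standalone sketch of that table-of-callbacks pattern; every name here is illustrative rather than the kernel's trace_event API:

#include <stdio.h>

enum { EV_ALLOC = 1, EV_FREE = 2 };

struct demo_event {
        int type;
        void (*print)(unsigned int payload);
};

static void print_alloc(unsigned int payload)
{
        printf("alloc %u bytes\n", payload);
}

static void print_free(unsigned int payload)
{
        printf("free ptr %#x\n", payload);
}

static struct demo_event events[] = {
        { .type = EV_ALLOC, .print = print_alloc },
        { .type = EV_FREE,  .print = print_free  },
};

/* generic layer: find the registered handler for an event type */
static void dispatch(int type, unsigned int payload)
{
        unsigned int i;

        for (i = 0; i < sizeof(events) / sizeof(events[0]); i++) {
                if (events[i].type == type) {
                        events[i].print(payload);
                        return;
                }
        }
        printf("unhandled event %d\n", type);
}

int main(void)
{
        dispatch(EV_ALLOC, 128);
        dispatch(EV_FREE, 0xdead);
        return 0;
}
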
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a330513d96ce..454e74e718cf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -218,17 +218,12 @@ enum {
218 218
219static inline int rb_null_event(struct ring_buffer_event *event) 219static inline int rb_null_event(struct ring_buffer_event *event)
220{ 220{
221 return event->type_len == RINGBUF_TYPE_PADDING 221 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
222 && event->time_delta == 0;
223}
224
225static inline int rb_discarded_event(struct ring_buffer_event *event)
226{
227 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
228} 222}
229 223
230static void rb_event_set_padding(struct ring_buffer_event *event) 224static void rb_event_set_padding(struct ring_buffer_event *event)
231{ 225{
226 /* padding has a NULL time_delta */
232 event->type_len = RINGBUF_TYPE_PADDING; 227 event->type_len = RINGBUF_TYPE_PADDING;
233 event->time_delta = 0; 228 event->time_delta = 0;
234} 229}
@@ -322,6 +317,14 @@ struct buffer_data_page {
322 unsigned char data[]; /* data of buffer page */ 317 unsigned char data[]; /* data of buffer page */
323}; 318};
324 319
320/*
321 * Note, the buffer_page list must be first. The buffer pages
322 * are allocated in cache lines, which means that each buffer
323 * page will be at the beginning of a cache line, and thus
324 * the least significant bits will be zero. We use this to
325 * add flags in the list struct pointers, to make the ring buffer
326 * lockless.
327 */
325struct buffer_page { 328struct buffer_page {
326 struct list_head list; /* list of buffer pages */ 329 struct list_head list; /* list of buffer pages */
327 local_t write; /* index for next write */ 330 local_t write; /* index for next write */
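The comment just added is the central trick of the lockless ring buffer: buffer_page structures are allocated cache-line aligned, so the two low bits of any pointer to one are always zero and can carry list flags. A self-contained sketch of that kind of pointer tagging (illustrative names; the real masking lives in the rb_list_head() and rb_head_page_set() helpers added further down):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define FLAG_MASK       3UL     /* two low bits are free on aligned pointers */
#define FLAG_HEAD       1UL

struct node { struct node *next; int val; };

/* strip any tag bits before dereferencing */
static struct node *untag(struct node *p)
{
        return (struct node *)((uintptr_t)p & ~FLAG_MASK);
}

int main(void)
{
        struct node *n;

        /* cache-line-sized aligned allocation: low pointer bits are zero */
        n = aligned_alloc(64, 64);
        if (!n)
                return 1;
        n->val = 42;

        /* tag the pointer, as the ring buffer tags the head page's link */
        n = (struct node *)((uintptr_t)n | FLAG_HEAD);

        printf("tag=%lu val=%d\n",
               (unsigned long)((uintptr_t)n & FLAG_MASK), untag(n)->val);
        free(untag(n));
        return 0;
}
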
@@ -330,6 +333,21 @@ struct buffer_page {
330 struct buffer_data_page *page; /* Actual data page */ 333 struct buffer_data_page *page; /* Actual data page */
331}; 334};
332 335
336/*
337 * The buffer page counters, write and entries, must be reset
338 * atomically when crossing page boundaries. To synchronize this
339 * update, two counters are inserted into the number. One is
340 * the actual counter for the write position or count on the page.
341 *
342 * The other is a counter of updaters. Before an update happens
343 * the update partition of the counter is incremented. This will
344 * allow the updater to update the counter atomically.
345 *
346 * The counter is 20 bits, and the state data is 12.
347 */
348#define RB_WRITE_MASK 0xfffff
349#define RB_WRITE_INTCNT (1 << 20)
350
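RB_WRITE_MASK and RB_WRITE_INTCNT split one word into a 20-bit write index in the low bits and a 12-bit count of in-flight updaters above it; rb_tail_page_update() further down adds RB_WRITE_INTCNT to announce itself and later uses the full value in a cmpxchg so a reset only succeeds if no interrupt touched the page in between. A short sketch of just the bit layout (plain C, no atomics):

#include <stdio.h>

#define WRITE_MASK      0xfffffUL       /* low 20 bits: write index      */
#define WRITE_INTCNT    (1UL << 20)     /* everything above: updaters    */

int main(void)
{
        unsigned long write = 123;      /* index 123, no updaters yet    */

        write += WRITE_INTCNT;          /* an updater announces itself   */
        write += 5;                     /* and advances the write index  */

        printf("index    = %lu\n", write & WRITE_MASK);
        printf("updaters = %lu\n", write >> 20);
        return 0;
}
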
333static void rb_init_page(struct buffer_data_page *bpage) 351static void rb_init_page(struct buffer_data_page *bpage)
334{ 352{
335 local_set(&bpage->commit, 0); 353 local_set(&bpage->commit, 0);
@@ -403,21 +421,20 @@ int ring_buffer_print_page_header(struct trace_seq *s)
403struct ring_buffer_per_cpu { 421struct ring_buffer_per_cpu {
404 int cpu; 422 int cpu;
405 struct ring_buffer *buffer; 423 struct ring_buffer *buffer;
406 spinlock_t reader_lock; /* serialize readers */ 424 spinlock_t reader_lock; /* serialize readers */
407 raw_spinlock_t lock; 425 raw_spinlock_t lock;
408 struct lock_class_key lock_key; 426 struct lock_class_key lock_key;
409 struct list_head pages; 427 struct list_head *pages;
410 struct buffer_page *head_page; /* read from head */ 428 struct buffer_page *head_page; /* read from head */
411 struct buffer_page *tail_page; /* write to tail */ 429 struct buffer_page *tail_page; /* write to tail */
412 struct buffer_page *commit_page; /* committed pages */ 430 struct buffer_page *commit_page; /* committed pages */
413 struct buffer_page *reader_page; 431 struct buffer_page *reader_page;
414 unsigned long nmi_dropped; 432 local_t commit_overrun;
415 unsigned long commit_overrun; 433 local_t overrun;
416 unsigned long overrun;
417 unsigned long read;
418 local_t entries; 434 local_t entries;
419 local_t committing; 435 local_t committing;
420 local_t commits; 436 local_t commits;
437 unsigned long read;
421 u64 write_stamp; 438 u64 write_stamp;
422 u64 read_stamp; 439 u64 read_stamp;
423 atomic_t record_disabled; 440 atomic_t record_disabled;
@@ -450,14 +467,19 @@ struct ring_buffer_iter {
450}; 467};
451 468
452/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 469/* buffer may be either ring_buffer or ring_buffer_per_cpu */
453#define RB_WARN_ON(buffer, cond) \ 470#define RB_WARN_ON(b, cond) \
454 ({ \ 471 ({ \
455 int _____ret = unlikely(cond); \ 472 int _____ret = unlikely(cond); \
456 if (_____ret) { \ 473 if (_____ret) { \
457 atomic_inc(&buffer->record_disabled); \ 474 if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
458 WARN_ON(1); \ 475 struct ring_buffer_per_cpu *__b = \
459 } \ 476 (void *)b; \
460 _____ret; \ 477 atomic_inc(&__b->buffer->record_disabled); \
478 } else \
479 atomic_inc(&b->record_disabled); \
480 WARN_ON(1); \
481 } \
482 _____ret; \
461 }) 483 })
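RB_WARN_ON now accepts either a struct ring_buffer * or a struct ring_buffer_per_cpu * and picks the right record_disabled counter by checking the argument's type at compile time; __same_type() is, as far as I know, a thin wrapper around GCC's __builtin_types_compatible_p(). A tiny sketch of that mechanism outside the kernel:

#include <stdio.h>

#define same_type(a, b) \
        __builtin_types_compatible_p(__typeof__(a), __typeof__(b))

struct per_cpu { int disabled; };

int main(void)
{
        struct per_cpu pc = { 0 };

        /* evaluated entirely at compile time, so the dead branch folds away */
        if (same_type(pc, struct per_cpu))
                printf("per-cpu path, disabled=%d\n", pc.disabled);
        else
                printf("global path\n");

        printf("same_type(1, 1L) = %d\n", same_type(1, 1L));
        return 0;
}
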
462 484
463/* Up this if you want to test the TIME_EXTENTS and normalization */ 485/* Up this if you want to test the TIME_EXTENTS and normalization */
@@ -489,6 +511,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
489} 511}
490EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); 512EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
491 513
514/*
515 * Making the ring buffer lockless makes things tricky.
516 * Although writes only happen on the CPU that they are on,
517 * and they only need to worry about interrupts. Reads can
518 * happen on any CPU.
519 *
520 * The reader page is always off the ring buffer, but when the
521 * reader finishes with a page, it needs to swap its page with
522 * a new one from the buffer. The reader needs to take from
523 * the head (writes go to the tail). But if a writer is in overwrite
524 * mode and wraps, it must push the head page forward.
525 *
526 * Here lies the problem.
527 *
528 * The reader must be careful to replace only the head page, and
529 * not another one. As described at the top of the file in the
530 * ASCII art, the reader sets its old page to point to the next
531 * page after head. It then sets the page after head to point to
532 * the old reader page. But if the writer moves the head page
533 * during this operation, the reader could end up with the tail.
534 *
535 * We use cmpxchg to help prevent this race. We also do something
536 * special with the page before head. We set the LSB to 1.
537 *
538 * When the writer must push the page forward, it will clear the
539 * bit that points to the head page, move the head, and then set
540 * the bit that points to the new head page.
541 *
542 * We also don't want an interrupt coming in and moving the head
543 * page on another writer. Thus we use the second LSB to catch
544 * that too. Thus:
545 *
546 * head->list->prev->next bit 1 bit 0
547 * ------- -------
548 * Normal page 0 0
549 * Points to head page 0 1
550 * New head page 1 0
551 *
552 * Note we can not trust the prev pointer of the head page, because:
553 *
554 * +----+ +-----+ +-----+
555 * | |------>| T |---X--->| N |
556 * | |<------| | | |
557 * +----+ +-----+ +-----+
558 * ^ ^ |
559 * | +-----+ | |
560 * +----------| R |----------+ |
561 * | |<-----------+
562 * +-----+
563 *
564 * Key: ---X--> HEAD flag set in pointer
565 * T Tail page
566 * R Reader page
567 * N Next page
568 *
569 * (see __rb_reserve_next() to see where this happens)
570 *
571 * What the above shows is that the reader just swapped out
572 * the reader page with a page in the buffer, but before it
573 * could make the new header point back to the new page added
574 * it was preempted by a writer. The writer moved forward onto
575 * the new page added by the reader and is about to move forward
576 * again.
577 *
578 * You can see, it is legitimate for the previous pointer of
579 * the head (or any page) not to point back to itself. But only
580 * temporarily.
581 */
582
583#define RB_PAGE_NORMAL 0UL
584#define RB_PAGE_HEAD 1UL
585#define RB_PAGE_UPDATE 2UL
586
587
588#define RB_FLAG_MASK 3UL
589
590/* PAGE_MOVED is not part of the mask */
591#define RB_PAGE_MOVED 4UL
592
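The table in the comment above (NORMAL, HEAD, UPDATE encoded in the low bits of the next pointer) is advanced with cmpxchg so that a writer and an interrupting writer cannot both believe they moved the head. A minimal userspace sketch of that claim step using GCC's __sync_val_compare_and_swap(); the kernel does the equivalent with cmpxchg() on the tagged list pointer in rb_head_page_set() below:

#include <stdio.h>

#define PAGE_NORMAL     0UL
#define PAGE_HEAD       1UL
#define PAGE_UPDATE     2UL
#define PAGE_FLAG_MASK  3UL

int main(void)
{
        /* stands in for prev->next: an aligned address currently tagged HEAD */
        unsigned long val  = 0x1000;                    /* flags cleared */
        unsigned long next = val | PAGE_HEAD;
        unsigned long old;

        /* claim the head page: HEAD -> UPDATE, but only if nobody beat us */
        old = __sync_val_compare_and_swap(&next, val | PAGE_HEAD,
                                          val | PAGE_UPDATE);

        if ((old & ~PAGE_FLAG_MASK) != val)
                printf("a reader already swapped this page out\n");
        else if ((old & PAGE_FLAG_MASK) == PAGE_HEAD)
                printf("claimed it: state is now %lu (UPDATE)\n",
                       next & PAGE_FLAG_MASK);
        else
                printf("an interrupt got here first, state was %lu\n",
                       old & PAGE_FLAG_MASK);
        return 0;
}
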
593/*
594 * rb_list_head - remove any bit
595 */
596static struct list_head *rb_list_head(struct list_head *list)
597{
598 unsigned long val = (unsigned long)list;
599
600 return (struct list_head *)(val & ~RB_FLAG_MASK);
601}
602
603/*
604 * rb_is_head_page - test if the given page is the head page
605 *
606 * Because the reader may move the head_page pointer, we can
607 * not trust what the head page is (it may be pointing to
608 * the reader page). But if the next page is a header page,
609 * its flags will be non zero.
610 */
611static int inline
612rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
613 struct buffer_page *page, struct list_head *list)
614{
615 unsigned long val;
616
617 val = (unsigned long)list->next;
618
619 if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
620 return RB_PAGE_MOVED;
621
622 return val & RB_FLAG_MASK;
623}
624
625/*
626 * rb_is_reader_page
627 *
628 * The unique thing about the reader page, is that, if the
629 * writer is ever on it, the previous pointer never points
630 * back to the reader page.
631 */
632static int rb_is_reader_page(struct buffer_page *page)
633{
634 struct list_head *list = page->list.prev;
635
636 return rb_list_head(list->next) != &page->list;
637}
638
639/*
640 * rb_set_list_to_head - set a list_head to be pointing to head.
641 */
642static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
643 struct list_head *list)
644{
645 unsigned long *ptr;
646
647 ptr = (unsigned long *)&list->next;
648 *ptr |= RB_PAGE_HEAD;
649 *ptr &= ~RB_PAGE_UPDATE;
650}
651
652/*
653 * rb_head_page_activate - sets up head page
654 */
655static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
656{
657 struct buffer_page *head;
658
659 head = cpu_buffer->head_page;
660 if (!head)
661 return;
662
663 /*
664 * Set the previous list pointer to have the HEAD flag.
665 */
666 rb_set_list_to_head(cpu_buffer, head->list.prev);
667}
668
669static void rb_list_head_clear(struct list_head *list)
670{
671 unsigned long *ptr = (unsigned long *)&list->next;
672
673 *ptr &= ~RB_FLAG_MASK;
674}
675
676/*
677 * rb_head_page_deactivate - clears head page ptr (for free list)
678 */
679static void
680rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
681{
682 struct list_head *hd;
683
684 /* Go through the whole list and clear any pointers found. */
685 rb_list_head_clear(cpu_buffer->pages);
686
687 list_for_each(hd, cpu_buffer->pages)
688 rb_list_head_clear(hd);
689}
690
691static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
692 struct buffer_page *head,
693 struct buffer_page *prev,
694 int old_flag, int new_flag)
695{
696 struct list_head *list;
697 unsigned long val = (unsigned long)&head->list;
698 unsigned long ret;
699
700 list = &prev->list;
701
702 val &= ~RB_FLAG_MASK;
703
704 ret = (unsigned long)cmpxchg(&list->next,
705 val | old_flag, val | new_flag);
706
707 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val)
709 return RB_PAGE_MOVED;
710
711 return ret & RB_FLAG_MASK;
712}
713
714static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
715 struct buffer_page *head,
716 struct buffer_page *prev,
717 int old_flag)
718{
719 return rb_head_page_set(cpu_buffer, head, prev,
720 old_flag, RB_PAGE_UPDATE);
721}
722
723static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
724 struct buffer_page *head,
725 struct buffer_page *prev,
726 int old_flag)
727{
728 return rb_head_page_set(cpu_buffer, head, prev,
729 old_flag, RB_PAGE_HEAD);
730}
731
732static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
733 struct buffer_page *head,
734 struct buffer_page *prev,
735 int old_flag)
736{
737 return rb_head_page_set(cpu_buffer, head, prev,
738 old_flag, RB_PAGE_NORMAL);
739}
740
741static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
742 struct buffer_page **bpage)
743{
744 struct list_head *p = rb_list_head((*bpage)->list.next);
745
746 *bpage = list_entry(p, struct buffer_page, list);
747}
748
749static struct buffer_page *
750rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
751{
752 struct buffer_page *head;
753 struct buffer_page *page;
754 struct list_head *list;
755 int i;
756
757 if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
758 return NULL;
759
760 /* sanity check */
761 list = cpu_buffer->pages;
762 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
763 return NULL;
764
765 page = head = cpu_buffer->head_page;
766 /*
767 * It is possible that the writer moves the header behind
768 * where we started, and we miss in one loop.
769 * A second loop should grab the header, but we'll do
770 * three loops just because I'm paranoid.
771 */
772 for (i = 0; i < 3; i++) {
773 do {
774 if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
775 cpu_buffer->head_page = page;
776 return page;
777 }
778 rb_inc_page(cpu_buffer, &page);
779 } while (page != head);
780 }
781
782 RB_WARN_ON(cpu_buffer, 1);
783
784 return NULL;
785}
786
787static int rb_head_page_replace(struct buffer_page *old,
788 struct buffer_page *new)
789{
790 unsigned long *ptr = (unsigned long *)&old->list.prev->next;
791 unsigned long val;
792 unsigned long ret;
793
794 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD;
796
797 ret = cmpxchg(ptr, val, &new->list);
798
799 return ret == val;
800}
801
802/*
803 * rb_tail_page_update - move the tail page forward
804 *
805 * Returns 1 if moved tail page, 0 if someone else did.
806 */
807static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
808 struct buffer_page *tail_page,
809 struct buffer_page *next_page)
810{
811 struct buffer_page *old_tail;
812 unsigned long old_entries;
813 unsigned long old_write;
814 int ret = 0;
815
816 /*
817 * The tail page now needs to be moved forward.
818 *
819 * We need to reset the tail page, but without messing
820 * with possible erasing of data brought in by interrupts
821 * that have moved the tail page and are currently on it.
822 *
823 * We add a counter to the write field to denote this.
824 */
825 old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
826 old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
827
828 /*
829 * Just make sure we have seen our old_write and synchronize
830 * with any interrupts that come in.
831 */
832 barrier();
833
834 /*
835 * If the tail page is still the same as what we think
836 * it is, then it is up to us to update the tail
837 * pointer.
838 */
839 if (tail_page == cpu_buffer->tail_page) {
840 /* Zero the write counter */
841 unsigned long val = old_write & ~RB_WRITE_MASK;
842 unsigned long eval = old_entries & ~RB_WRITE_MASK;
843
844 /*
845 * This will only succeed if an interrupt did
846 * not come in and change it. In which case, we
847 * do not want to modify it.
848 *
849 * We add (void) to let the compiler know that we do not care
850 * about the return value of these functions. We use the
851 * cmpxchg to only update if an interrupt did not already
852 * do it for us. If the cmpxchg fails, we don't care.
853 */
854 (void)local_cmpxchg(&next_page->write, old_write, val);
855 (void)local_cmpxchg(&next_page->entries, old_entries, eval);
856
857 /*
858 * No need to worry about races with clearing out the commit.
859 * it only can increment when a commit takes place. But that
860 * only happens in the outer most nested commit.
861 */
862 local_set(&next_page->page->commit, 0);
863
864 old_tail = cmpxchg(&cpu_buffer->tail_page,
865 tail_page, next_page);
866
867 if (old_tail == tail_page)
868 ret = 1;
869 }
870
871 return ret;
872}
873
874static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
875 struct buffer_page *bpage)
876{
877 unsigned long val = (unsigned long)bpage;
878
879 if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
880 return 1;
881
882 return 0;
883}
884
885/**
886 * rb_check_list - make sure a pointer to a list has the last bits zero
887 */
888static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
889 struct list_head *list)
890{
891 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
892 return 1;
893 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
894 return 1;
895 return 0;
896}
897
492/** 898/**
493 * check_pages - integrity check of buffer pages 899 * check_pages - integrity check of buffer pages
494 * @cpu_buffer: CPU buffer with pages to test 900 * @cpu_buffer: CPU buffer with pages to test
@@ -498,14 +904,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
498 */ 904 */
499static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 905static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
500{ 906{
501 struct list_head *head = &cpu_buffer->pages; 907 struct list_head *head = cpu_buffer->pages;
502 struct buffer_page *bpage, *tmp; 908 struct buffer_page *bpage, *tmp;
503 909
910 rb_head_page_deactivate(cpu_buffer);
911
504 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 912 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
505 return -1; 913 return -1;
506 if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) 914 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
507 return -1; 915 return -1;
508 916
917 if (rb_check_list(cpu_buffer, head))
918 return -1;
919
509 list_for_each_entry_safe(bpage, tmp, head, list) { 920 list_for_each_entry_safe(bpage, tmp, head, list) {
510 if (RB_WARN_ON(cpu_buffer, 921 if (RB_WARN_ON(cpu_buffer,
511 bpage->list.next->prev != &bpage->list)) 922 bpage->list.next->prev != &bpage->list))
@@ -513,25 +924,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
513 if (RB_WARN_ON(cpu_buffer, 924 if (RB_WARN_ON(cpu_buffer,
514 bpage->list.prev->next != &bpage->list)) 925 bpage->list.prev->next != &bpage->list))
515 return -1; 926 return -1;
927 if (rb_check_list(cpu_buffer, &bpage->list))
928 return -1;
516 } 929 }
517 930
931 rb_head_page_activate(cpu_buffer);
932
518 return 0; 933 return 0;
519} 934}
520 935
521static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 936static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
522 unsigned nr_pages) 937 unsigned nr_pages)
523{ 938{
524 struct list_head *head = &cpu_buffer->pages;
525 struct buffer_page *bpage, *tmp; 939 struct buffer_page *bpage, *tmp;
526 unsigned long addr; 940 unsigned long addr;
527 LIST_HEAD(pages); 941 LIST_HEAD(pages);
528 unsigned i; 942 unsigned i;
529 943
944 WARN_ON(!nr_pages);
945
530 for (i = 0; i < nr_pages; i++) { 946 for (i = 0; i < nr_pages; i++) {
531 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 947 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
532 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 948 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
533 if (!bpage) 949 if (!bpage)
534 goto free_pages; 950 goto free_pages;
951
952 rb_check_bpage(cpu_buffer, bpage);
953
535 list_add(&bpage->list, &pages); 954 list_add(&bpage->list, &pages);
536 955
537 addr = __get_free_page(GFP_KERNEL); 956 addr = __get_free_page(GFP_KERNEL);
@@ -541,7 +960,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
541 rb_init_page(bpage->page); 960 rb_init_page(bpage->page);
542 } 961 }
543 962
544 list_splice(&pages, head); 963 /*
964 * The ring buffer page list is a circular list that does not
965 * start and end with a list head. All page list items point to
966 * other pages.
967 */
968 cpu_buffer->pages = pages.next;
969 list_del(&pages);
545 970
546 rb_check_pages(cpu_buffer); 971 rb_check_pages(cpu_buffer);
547 972
@@ -573,13 +998,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
573 spin_lock_init(&cpu_buffer->reader_lock); 998 spin_lock_init(&cpu_buffer->reader_lock);
574 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 999 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
575 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1000 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
576 INIT_LIST_HEAD(&cpu_buffer->pages);
577 1001
578 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1002 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
579 GFP_KERNEL, cpu_to_node(cpu)); 1003 GFP_KERNEL, cpu_to_node(cpu));
580 if (!bpage) 1004 if (!bpage)
581 goto fail_free_buffer; 1005 goto fail_free_buffer;
582 1006
1007 rb_check_bpage(cpu_buffer, bpage);
1008
583 cpu_buffer->reader_page = bpage; 1009 cpu_buffer->reader_page = bpage;
584 addr = __get_free_page(GFP_KERNEL); 1010 addr = __get_free_page(GFP_KERNEL);
585 if (!addr) 1011 if (!addr)
@@ -594,9 +1020,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
594 goto fail_free_reader; 1020 goto fail_free_reader;
595 1021
596 cpu_buffer->head_page 1022 cpu_buffer->head_page
597 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 1023 = list_entry(cpu_buffer->pages, struct buffer_page, list);
598 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; 1024 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
599 1025
1026 rb_head_page_activate(cpu_buffer);
1027
600 return cpu_buffer; 1028 return cpu_buffer;
601 1029
602 fail_free_reader: 1030 fail_free_reader:
@@ -609,15 +1037,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
609 1037
610static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 1038static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
611{ 1039{
612 struct list_head *head = &cpu_buffer->pages; 1040 struct list_head *head = cpu_buffer->pages;
613 struct buffer_page *bpage, *tmp; 1041 struct buffer_page *bpage, *tmp;
614 1042
615 free_buffer_page(cpu_buffer->reader_page); 1043 free_buffer_page(cpu_buffer->reader_page);
616 1044
617 list_for_each_entry_safe(bpage, tmp, head, list) { 1045 rb_head_page_deactivate(cpu_buffer);
618 list_del_init(&bpage->list); 1046
1047 if (head) {
1048 list_for_each_entry_safe(bpage, tmp, head, list) {
1049 list_del_init(&bpage->list);
1050 free_buffer_page(bpage);
1051 }
1052 bpage = list_entry(head, struct buffer_page, list);
619 free_buffer_page(bpage); 1053 free_buffer_page(bpage);
620 } 1054 }
1055
621 kfree(cpu_buffer); 1056 kfree(cpu_buffer);
622} 1057}
623 1058
@@ -760,15 +1195,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
760 atomic_inc(&cpu_buffer->record_disabled); 1195 atomic_inc(&cpu_buffer->record_disabled);
761 synchronize_sched(); 1196 synchronize_sched();
762 1197
1198 rb_head_page_deactivate(cpu_buffer);
1199
763 for (i = 0; i < nr_pages; i++) { 1200 for (i = 0; i < nr_pages; i++) {
764 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1201 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
765 return; 1202 return;
766 p = cpu_buffer->pages.next; 1203 p = cpu_buffer->pages->next;
767 bpage = list_entry(p, struct buffer_page, list); 1204 bpage = list_entry(p, struct buffer_page, list);
768 list_del_init(&bpage->list); 1205 list_del_init(&bpage->list);
769 free_buffer_page(bpage); 1206 free_buffer_page(bpage);
770 } 1207 }
771 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1208 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
772 return; 1209 return;
773 1210
774 rb_reset_cpu(cpu_buffer); 1211 rb_reset_cpu(cpu_buffer);
@@ -790,15 +1227,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
790 atomic_inc(&cpu_buffer->record_disabled); 1227 atomic_inc(&cpu_buffer->record_disabled);
791 synchronize_sched(); 1228 synchronize_sched();
792 1229
1230 spin_lock_irq(&cpu_buffer->reader_lock);
1231 rb_head_page_deactivate(cpu_buffer);
1232
793 for (i = 0; i < nr_pages; i++) { 1233 for (i = 0; i < nr_pages; i++) {
794 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1234 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
795 return; 1235 return;
796 p = pages->next; 1236 p = pages->next;
797 bpage = list_entry(p, struct buffer_page, list); 1237 bpage = list_entry(p, struct buffer_page, list);
798 list_del_init(&bpage->list); 1238 list_del_init(&bpage->list);
799 list_add_tail(&bpage->list, &cpu_buffer->pages); 1239 list_add_tail(&bpage->list, cpu_buffer->pages);
800 } 1240 }
801 rb_reset_cpu(cpu_buffer); 1241 rb_reset_cpu(cpu_buffer);
1242 spin_unlock_irq(&cpu_buffer->reader_lock);
802 1243
803 rb_check_pages(cpu_buffer); 1244 rb_check_pages(cpu_buffer);
804 1245
@@ -949,21 +1390,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
949} 1390}
950 1391
951static inline struct ring_buffer_event * 1392static inline struct ring_buffer_event *
952rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
953{
954 return __rb_page_index(cpu_buffer->head_page,
955 cpu_buffer->head_page->read);
956}
957
958static inline struct ring_buffer_event *
959rb_iter_head_event(struct ring_buffer_iter *iter) 1393rb_iter_head_event(struct ring_buffer_iter *iter)
960{ 1394{
961 return __rb_page_index(iter->head_page, iter->head); 1395 return __rb_page_index(iter->head_page, iter->head);
962} 1396}
963 1397
964static inline unsigned rb_page_write(struct buffer_page *bpage) 1398static inline unsigned long rb_page_write(struct buffer_page *bpage)
965{ 1399{
966 return local_read(&bpage->write); 1400 return local_read(&bpage->write) & RB_WRITE_MASK;
967} 1401}
968 1402
969static inline unsigned rb_page_commit(struct buffer_page *bpage) 1403static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -971,6 +1405,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
971 return local_read(&bpage->page->commit); 1405 return local_read(&bpage->page->commit);
972} 1406}
973 1407
1408static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1409{
1410 return local_read(&bpage->entries) & RB_WRITE_MASK;
1411}
1412
974/* Size is determined by what has been committed */ 1413
975static inline unsigned rb_page_size(struct buffer_page *bpage) 1414static inline unsigned rb_page_size(struct buffer_page *bpage)
976{ 1415{
@@ -983,22 +1422,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
983 return rb_page_commit(cpu_buffer->commit_page); 1422 return rb_page_commit(cpu_buffer->commit_page);
984} 1423}
985 1424
986static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
987{
988 return rb_page_commit(cpu_buffer->head_page);
989}
990
991static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
992 struct buffer_page **bpage)
993{
994 struct list_head *p = (*bpage)->list.next;
995
996 if (p == &cpu_buffer->pages)
997 p = p->next;
998
999 *bpage = list_entry(p, struct buffer_page, list);
1000}
1001
1002static inline unsigned 1425static inline unsigned
1003rb_event_index(struct ring_buffer_event *event) 1426rb_event_index(struct ring_buffer_event *event)
1004{ 1427{
@@ -1024,6 +1447,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1024static void 1447static void
1025rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1448rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1026{ 1449{
1450 unsigned long max_count;
1451
1027 /* 1452 /*
1028 * We only race with interrupts and NMIs on this CPU. 1453 * We only race with interrupts and NMIs on this CPU.
1029 * If we own the commit event, then we can commit 1454 * If we own the commit event, then we can commit
@@ -1033,9 +1458,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1033 * assign the commit to the tail. 1458 * assign the commit to the tail.
1034 */ 1459 */
1035 again: 1460 again:
1461 max_count = cpu_buffer->buffer->pages * 100;
1462
1036 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1463 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1037 cpu_buffer->commit_page->page->commit = 1464 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1038 cpu_buffer->commit_page->write; 1465 return;
1466 if (RB_WARN_ON(cpu_buffer,
1467 rb_is_reader_page(cpu_buffer->tail_page)))
1468 return;
1469 local_set(&cpu_buffer->commit_page->page->commit,
1470 rb_page_write(cpu_buffer->commit_page));
1039 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 1471 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1040 cpu_buffer->write_stamp = 1472 cpu_buffer->write_stamp =
1041 cpu_buffer->commit_page->page->time_stamp; 1473 cpu_buffer->commit_page->page->time_stamp;
@@ -1044,8 +1476,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1044 } 1476 }
1045 while (rb_commit_index(cpu_buffer) != 1477 while (rb_commit_index(cpu_buffer) !=
1046 rb_page_write(cpu_buffer->commit_page)) { 1478 rb_page_write(cpu_buffer->commit_page)) {
1047 cpu_buffer->commit_page->page->commit = 1479
1048 cpu_buffer->commit_page->write; 1480 local_set(&cpu_buffer->commit_page->page->commit,
1481 rb_page_write(cpu_buffer->commit_page));
1482 RB_WARN_ON(cpu_buffer,
1483 local_read(&cpu_buffer->commit_page->page->commit) &
1484 ~RB_WRITE_MASK);
1049 barrier(); 1485 barrier();
1050 } 1486 }
1051 1487
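
The max_count guard introduced above bounds how long the commit pointer may chase the tail pointer before the code gives up and warns instead of looping forever. A stand-alone sketch of that bounded catch-up pattern, with made-up ex_* values and a 10-page buffer assumed:

#include <stdio.h>

int main(void)
{
        unsigned long commit = 0, tail = 37;
        unsigned long max_count = 10 * 100;   /* assume a 10-page buffer */

        while (commit != tail) {
                if (!--max_count) {
                        fprintf(stderr, "commit never caught up to tail\n");
                        return 1;
                }
                commit++;                      /* stands in for rb_inc_page() */
        }
        printf("commit caught up at %lu\n", commit);
        return 0;
}
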
@@ -1078,7 +1514,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1078 * to the head page instead of next. 1514 * to the head page instead of next.
1079 */ 1515 */
1080 if (iter->head_page == cpu_buffer->reader_page) 1516 if (iter->head_page == cpu_buffer->reader_page)
1081 iter->head_page = cpu_buffer->head_page; 1517 iter->head_page = rb_set_head_page(cpu_buffer);
1082 else 1518 else
1083 rb_inc_page(cpu_buffer, &iter->head_page); 1519 rb_inc_page(cpu_buffer, &iter->head_page);
1084 1520
@@ -1122,6 +1558,163 @@ rb_update_event(struct ring_buffer_event *event,
1122 } 1558 }
1123} 1559}
1124 1560
1561/*
1562 * rb_handle_head_page - writer hit the head page
1563 *
1564 * Returns: +1 to retry page
1565 * 0 to continue
1566 * -1 on error
1567 */
1568static int
1569rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1570 struct buffer_page *tail_page,
1571 struct buffer_page *next_page)
1572{
1573 struct buffer_page *new_head;
1574 int entries;
1575 int type;
1576 int ret;
1577
1578 entries = rb_page_entries(next_page);
1579
1580 /*
1581 * The hard part is here. We need to move the head
1582 * forward, and protect against both readers on
1583 * other CPUs and writers coming in via interrupts.
1584 */
1585 type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
1586 RB_PAGE_HEAD);
1587
1588 /*
1589 * type can be one of four:
1590 * NORMAL - an interrupt already moved it for us
1591 * HEAD - we are the first to get here.
1592 * UPDATE - we are the interrupt interrupting
1593 * a current move.
1594 * MOVED - a reader on another CPU moved the next
1595 * pointer to its reader page. Give up
1596 * and try again.
1597 */
1598
1599 switch (type) {
1600 case RB_PAGE_HEAD:
1601 /*
1602 * We changed the head to UPDATE, thus
1603 * it is our responsibility to update
1604 * the counters.
1605 */
1606 local_add(entries, &cpu_buffer->overrun);
1607
1608 /*
1609 * The entries will be zeroed out when we move the
1610 * tail page.
1611 */
1612
1613 /* still more to do */
1614 break;
1615
1616 case RB_PAGE_UPDATE:
1617 /*
1618 * This is an interrupt that interrupted the
1619 * previous update. Still more to do.
1620 */
1621 break;
1622 case RB_PAGE_NORMAL:
1623 /*
1624 * An interrupt came in before the update
1625 * and processed this for us.
1626 * Nothing left to do.
1627 */
1628 return 1;
1629 case RB_PAGE_MOVED:
1630 /*
1631 * The reader is on another CPU and just did
1632 * a swap with our next_page.
1633 * Try again.
1634 */
1635 return 1;
1636 default:
1637 RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
1638 return -1;
1639 }
1640
1641 /*
1642 * Now that we are here, the old head pointer is
1643 * set to UPDATE. This will keep the reader from
1644 * swapping the head page with the reader page.
1645 * The reader (on another CPU) will spin till
1646 * we are finished.
1647 *
1648 * We just need to protect against interrupts
1649 * doing the job. We will set the next pointer
1650 * to HEAD. After that, we set the old pointer
1651 * to NORMAL, but only if it was HEAD before;
1652 * otherwise we are an interrupt, and only
1653 * want the outermost commit to reset it.
1654 */
1655 new_head = next_page;
1656 rb_inc_page(cpu_buffer, &new_head);
1657
1658 ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
1659 RB_PAGE_NORMAL);
1660
1661 /*
1662 * Valid returns are:
1663 * HEAD - an interrupt came in and already set it.
1664 * NORMAL - One of two things:
1665 * 1) We really set it.
1666 * 2) A bunch of interrupts came in and moved
1667 * the page forward again.
1668 */
1669 switch (ret) {
1670 case RB_PAGE_HEAD:
1671 case RB_PAGE_NORMAL:
1672 /* OK */
1673 break;
1674 default:
1675 RB_WARN_ON(cpu_buffer, 1);
1676 return -1;
1677 }
1678
1679 /*
1680 * It is possible that an interrupt came in,
1681 * set the head up, then more interrupts came in
1682 * and moved it again. When we get back here,
1683 * the page would have been set to NORMAL but we
1684 * just set it back to HEAD.
1685 *
1686 * How do you detect this? Well, if that happened
1687 * the tail page would have moved.
1688 */
1689 if (ret == RB_PAGE_NORMAL) {
1690 /*
1691 * If the tail had moved past next, then we need
1692 * to reset the pointer.
1693 */
1694 if (cpu_buffer->tail_page != tail_page &&
1695 cpu_buffer->tail_page != next_page)
1696 rb_head_page_set_normal(cpu_buffer, new_head,
1697 next_page,
1698 RB_PAGE_HEAD);
1699 }
1700
1701 /*
1702 * If this was the outermost commit (the one that
1703 * changed the original pointer from HEAD to UPDATE),
1704 * then it is up to us to reset it to NORMAL.
1705 */
1706 if (type == RB_PAGE_HEAD) {
1707 ret = rb_head_page_set_normal(cpu_buffer, next_page,
1708 tail_page,
1709 RB_PAGE_UPDATE);
1710 if (RB_WARN_ON(cpu_buffer,
1711 ret != RB_PAGE_UPDATE))
1712 return -1;
1713 }
1714
1715 return 0;
1716}
1717
1125static unsigned rb_calculate_event_length(unsigned length) 1718static unsigned rb_calculate_event_length(unsigned length)
1126{ 1719{
1127 struct ring_buffer_event event; /* Used only for sizeof array */ 1720 struct ring_buffer_event event; /* Used only for sizeof array */
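
rb_handle_head_page() above revolves around tagging the pointer to the head page with a small state (NORMAL/HEAD/UPDATE/MOVED) and moving between states with compare-and-swap, so the writer, a nested interrupt, and a reader on another CPU can all tell who got there first. The sketch below shows only that idea with userspace C11 atomics; the 2-bit encoding and the ex_* names are assumptions, not the kernel's implementation.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

enum { EX_NORMAL = 0, EX_HEAD = 1, EX_UPDATE = 2, EX_FLAG_MASK = 3 };

static _Atomic uintptr_t next_ptr;         /* pointer + 2 flag bits */

static int ex_set_update_if_head(void *page)
{
        uintptr_t old = (uintptr_t)page | EX_HEAD;
        uintptr_t new = (uintptr_t)page | EX_UPDATE;

        /* Returns the state we found: only one caller can see EX_HEAD. */
        if (atomic_compare_exchange_strong(&next_ptr, &old, new))
                return EX_HEAD;
        return (int)(old & EX_FLAG_MASK);
}

int main(void)
{
        static int page __attribute__((aligned(4)));

        atomic_store(&next_ptr, (uintptr_t)&page | EX_HEAD);
        printf("first caller saw state %d\n", ex_set_update_if_head(&page));
        printf("second caller saw state %d\n", ex_set_update_if_head(&page));
        return 0;
}

The point of the design is that losing the race is detectable: the second caller sees UPDATE (or NORMAL/MOVED in the real code) and knows someone else is already doing, or has done, the work.
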
@@ -1185,9 +1778,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1185 event->type_len = RINGBUF_TYPE_PADDING; 1778 event->type_len = RINGBUF_TYPE_PADDING;
1186 /* time delta must be non zero */ 1779 /* time delta must be non zero */
1187 event->time_delta = 1; 1780 event->time_delta = 1;
1188 /* Account for this as an entry */
1189 local_inc(&tail_page->entries);
1190 local_inc(&cpu_buffer->entries);
1191 1781
1192 /* Set write to end of buffer */ 1782 /* Set write to end of buffer */
1193 length = (tail + length) - BUF_PAGE_SIZE; 1783 length = (tail + length) - BUF_PAGE_SIZE;
@@ -1200,96 +1790,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1200 struct buffer_page *commit_page, 1790 struct buffer_page *commit_page,
1201 struct buffer_page *tail_page, u64 *ts) 1791 struct buffer_page *tail_page, u64 *ts)
1202{ 1792{
1203 struct buffer_page *next_page, *head_page, *reader_page;
1204 struct ring_buffer *buffer = cpu_buffer->buffer; 1793 struct ring_buffer *buffer = cpu_buffer->buffer;
1205 bool lock_taken = false; 1794 struct buffer_page *next_page;
1206 unsigned long flags; 1795 int ret;
1207 1796
1208 next_page = tail_page; 1797 next_page = tail_page;
1209 1798
1210 local_irq_save(flags);
1211 /*
1212 * Since the write to the buffer is still not
1213 * fully lockless, we must be careful with NMIs.
1214 * The locks in the writers are taken when a write
1215 * crosses to a new page. The locks protect against
1216 * races with the readers (this will soon be fixed
1217 * with a lockless solution).
1218 *
1219 * Because we can not protect against NMIs, and we
1220 * want to keep traces reentrant, we need to manage
1221 * what happens when we are in an NMI.
1222 *
1223 * NMIs can happen after we take the lock.
1224 * If we are in an NMI, only take the lock
1225 * if it is not already taken. Otherwise
1226 * simply fail.
1227 */
1228 if (unlikely(in_nmi())) {
1229 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1230 cpu_buffer->nmi_dropped++;
1231 goto out_reset;
1232 }
1233 } else
1234 __raw_spin_lock(&cpu_buffer->lock);
1235
1236 lock_taken = true;
1237
1238 rb_inc_page(cpu_buffer, &next_page); 1799 rb_inc_page(cpu_buffer, &next_page);
1239 1800
1240 head_page = cpu_buffer->head_page;
1241 reader_page = cpu_buffer->reader_page;
1242
1243 /* we grabbed the lock before incrementing */
1244 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1245 goto out_reset;
1246
1247 /* 1801 /*
1248 * If for some reason, we had an interrupt storm that made 1802 * If for some reason, we had an interrupt storm that made
1249 * it all the way around the buffer, bail, and warn 1803 * it all the way around the buffer, bail, and warn
1250 * about it. 1804 * about it.
1251 */ 1805 */
1252 if (unlikely(next_page == commit_page)) { 1806 if (unlikely(next_page == commit_page)) {
1253 cpu_buffer->commit_overrun++; 1807 local_inc(&cpu_buffer->commit_overrun);
1254 goto out_reset; 1808 goto out_reset;
1255 } 1809 }
1256 1810
1257 if (next_page == head_page) { 1811 /*
1258 if (!(buffer->flags & RB_FL_OVERWRITE)) 1812 * This is where the fun begins!
1259 goto out_reset; 1813 *
1260 1814 * We are fighting against races between a reader that
1261 /* tail_page has not moved yet? */ 1815 * could be on another CPU trying to swap its reader
1262 if (tail_page == cpu_buffer->tail_page) { 1816 * page with the buffer head.
1263 /* count overflows */ 1817 *
1264 cpu_buffer->overrun += 1818 * We are also fighting against interrupts coming in and
1265 local_read(&head_page->entries); 1819 * moving the head or tail on us as well.
1820 *
1821 * If the next page is the head page then we have filled
1822 * the buffer, unless the commit page is still on the
1823 * reader page.
1824 */
1825 if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
1266 1826
1267 rb_inc_page(cpu_buffer, &head_page); 1827 /*
1268 cpu_buffer->head_page = head_page; 1828 * If the commit is not on the reader page, then
1269 cpu_buffer->head_page->read = 0; 1829 * move the header page.
1830 */
1831 if (!rb_is_reader_page(cpu_buffer->commit_page)) {
1832 /*
1833 * If we are not in overwrite mode,
1834 * this is easy, just stop here.
1835 */
1836 if (!(buffer->flags & RB_FL_OVERWRITE))
1837 goto out_reset;
1838
1839 ret = rb_handle_head_page(cpu_buffer,
1840 tail_page,
1841 next_page);
1842 if (ret < 0)
1843 goto out_reset;
1844 if (ret)
1845 goto out_again;
1846 } else {
1847 /*
1848 * We need to be careful here too. The
1849 * commit page could still be on the reader
1850 * page. We could have a small buffer, and
1851 * have filled up the buffer with events
1852 * from interrupts and such, and wrapped.
1853 *
1854 * Note, if the tail page is also on the
1855 * reader_page, we let it move out.
1856 */
1857 if (unlikely((cpu_buffer->commit_page !=
1858 cpu_buffer->tail_page) &&
1859 (cpu_buffer->commit_page ==
1860 cpu_buffer->reader_page))) {
1861 local_inc(&cpu_buffer->commit_overrun);
1862 goto out_reset;
1863 }
1270 } 1864 }
1271 } 1865 }
1272 1866
1273 /* 1867 ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
1274 * If the tail page is still the same as what we think 1868 if (ret) {
1275 * it is, then it is up to us to update the tail 1869 /*
1276 * pointer. 1870 * Nested commits always have zero deltas, so
1277 */ 1871 * just reread the time stamp
1278 if (tail_page == cpu_buffer->tail_page) { 1872 */
1279 local_set(&next_page->write, 0);
1280 local_set(&next_page->entries, 0);
1281 local_set(&next_page->page->commit, 0);
1282 cpu_buffer->tail_page = next_page;
1283
1284 /* reread the time stamp */
1285 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1873 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1286 cpu_buffer->tail_page->page->time_stamp = *ts; 1874 next_page->page->time_stamp = *ts;
1287 } 1875 }
1288 1876
1289 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1877 out_again:
1290 1878
1291 __raw_spin_unlock(&cpu_buffer->lock); 1879 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1292 local_irq_restore(flags);
1293 1880
1294 /* fail and let the caller try again */ 1881 /* fail and let the caller try again */
1295 return ERR_PTR(-EAGAIN); 1882 return ERR_PTR(-EAGAIN);
@@ -1298,9 +1885,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1298 /* reset write */ 1885 /* reset write */
1299 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1886 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1300 1887
1301 if (likely(lock_taken))
1302 __raw_spin_unlock(&cpu_buffer->lock);
1303 local_irq_restore(flags);
1304 return NULL; 1888 return NULL;
1305} 1889}
1306 1890
@@ -1317,6 +1901,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1317 barrier(); 1901 barrier();
1318 tail_page = cpu_buffer->tail_page; 1902 tail_page = cpu_buffer->tail_page;
1319 write = local_add_return(length, &tail_page->write); 1903 write = local_add_return(length, &tail_page->write);
1904
1905 /* set write to only the index of the write */
1906 write &= RB_WRITE_MASK;
1320 tail = write - length; 1907 tail = write - length;
1321 1908
1322 /* See if we shot past the end of this buffer page */ 1909
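
The new write &= RB_WRITE_MASK step above is what lets the writer reserve space with a single atomic add and still recover a plain page index. A hedged userspace sketch of that reserve step (the mask width and buffer size are assumptions):

#include <stdatomic.h>
#include <stdio.h>

#define EX_WRITE_MASK ((1UL << 20) - 1)
#define EX_BUF_SIZE   4096UL

static _Atomic unsigned long page_write;

int main(void)
{
        unsigned long length = 64;
        unsigned long write, tail;

        write = atomic_fetch_add(&page_write, length) + length;
        write &= EX_WRITE_MASK;            /* keep only the index bits */
        tail = write - length;             /* our event starts here    */

        if (write > EX_BUF_SIZE)
                printf("shot past the end of the page, move the tail page\n");
        else
                printf("reserved [%lu, %lu)\n", tail, write);
        return 0;
}
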
@@ -1361,12 +1948,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1361 bpage = cpu_buffer->tail_page; 1948 bpage = cpu_buffer->tail_page;
1362 1949
1363 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 1950 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1951 unsigned long write_mask =
1952 local_read(&bpage->write) & ~RB_WRITE_MASK;
1364 /* 1953 /*
1365 * This is on the tail page. It is possible that 1954 * This is on the tail page. It is possible that
1366 * a write could come in and move the tail page 1955 * a write could come in and move the tail page
1367 * and write to the next page. That is fine 1956 * and write to the next page. That is fine
1368 * because we just shorten what is on this page. 1957 * because we just shorten what is on this page.
1369 */ 1958 */
1959 old_index += write_mask;
1960 new_index += write_mask;
1370 index = local_cmpxchg(&bpage->write, old_index, new_index); 1961 index = local_cmpxchg(&bpage->write, old_index, new_index);
1371 if (index == old_index) 1962 if (index == old_index)
1372 return 1; 1963 return 1;
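
The discard path above now carries the non-index bits of the write counter into both the expected and the new value of the cmpxchg, so the rollback only succeeds if nothing else wrote in the meantime. An illustrative sketch with C11 atomics; the ex_* names and the mask are assumptions:

#include <stdatomic.h>
#include <stdio.h>

#define EX_WRITE_MASK ((1UL << 20) - 1)

static _Atomic unsigned long page_write;

static int ex_try_to_discard(unsigned long old_index, unsigned long new_index)
{
        unsigned long high = atomic_load(&page_write) & ~EX_WRITE_MASK;
        unsigned long expected = old_index + high;

        /* Roll the write counter back only if we are still the last event. */
        return atomic_compare_exchange_strong(&page_write, &expected,
                                              new_index + high);
}

int main(void)
{
        atomic_store(&page_write, 128);    /* our event occupied [64, 128) */
        printf("discard %s\n",
               ex_try_to_discard(128, 64) ? "worked" : "lost the race");
        return 0;
}
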
@@ -1482,7 +2073,8 @@ static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
1482} 2073}
1483 2074
1484static struct ring_buffer_event * 2075static struct ring_buffer_event *
1485rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 2076rb_reserve_next_event(struct ring_buffer *buffer,
2077 struct ring_buffer_per_cpu *cpu_buffer,
1486 unsigned long length) 2078 unsigned long length)
1487{ 2079{
1488 struct ring_buffer_event *event; 2080 struct ring_buffer_event *event;
@@ -1492,6 +2084,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1492 2084
1493 rb_start_commit(cpu_buffer); 2085 rb_start_commit(cpu_buffer);
1494 2086
2087#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2088 /*
2089 * Due to the ability to swap a cpu buffer from a buffer
2090 * it is possible it was swapped before we committed.
2091 * (committing stops a swap). We check for it here and
2092 * if it happened, we have to fail the write.
2093 */
2094 barrier();
2095 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2096 local_dec(&cpu_buffer->committing);
2097 local_dec(&cpu_buffer->commits);
2098 return NULL;
2099 }
2100#endif
2101
1495 length = rb_calculate_event_length(length); 2102 length = rb_calculate_event_length(length);
1496 again: 2103 again:
1497 /* 2104 /*
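
The ALLOW_SWAP check above announces the commit first and only then re-reads cpu_buffer->buffer; if the back pointer changed, the buffer was swapped out from under us and the write has to fail. A simplified userspace sketch of that ordering (the names are stand-ins, not kernel APIs):

#include <stdatomic.h>
#include <stdio.h>

struct ex_buffer;                           /* opaque for the sketch */

struct ex_cpu_buffer {
        struct ex_buffer *_Atomic buffer;   /* may be swapped by another CPU */
        atomic_int committing;
};

static int ex_reserve(struct ex_cpu_buffer *cpu, struct ex_buffer *buffer)
{
        atomic_fetch_add(&cpu->committing, 1); /* commits now block a swap */

        if (atomic_load(&cpu->buffer) != buffer) {
                atomic_fetch_sub(&cpu->committing, 1);
                return -1;                     /* swapped: fail the write  */
        }
        return 0;                              /* safe to write the event  */
}

int main(void)
{
        struct ex_buffer *a = (struct ex_buffer *)0x1;
        struct ex_cpu_buffer cpu = { .buffer = a };

        printf("reserve: %d\n", ex_reserve(&cpu, a));
        return 0;
}
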
@@ -1652,7 +2259,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1652 if (length > BUF_MAX_DATA_SIZE) 2259 if (length > BUF_MAX_DATA_SIZE)
1653 goto out; 2260 goto out;
1654 2261
1655 event = rb_reserve_next_event(cpu_buffer, length); 2262 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1656 if (!event) 2263 if (!event)
1657 goto out; 2264 goto out;
1658 2265
@@ -1675,18 +2282,23 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1675} 2282}
1676EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2283EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1677 2284
1678static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2285static void
2286rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1679 struct ring_buffer_event *event) 2287 struct ring_buffer_event *event)
1680{ 2288{
1681 local_inc(&cpu_buffer->entries);
1682
1683 /* 2289 /*
1684 * The event first in the commit queue updates the 2290 * The event first in the commit queue updates the
1685 * time stamp. 2291 * time stamp.
1686 */ 2292 */
1687 if (rb_event_is_commit(cpu_buffer, event)) 2293 if (rb_event_is_commit(cpu_buffer, event))
1688 cpu_buffer->write_stamp += event->time_delta; 2294 cpu_buffer->write_stamp += event->time_delta;
2295}
1689 2296
2297static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2298 struct ring_buffer_event *event)
2299{
2300 local_inc(&cpu_buffer->entries);
2301 rb_update_write_stamp(cpu_buffer, event);
1690 rb_end_commit(cpu_buffer); 2302 rb_end_commit(cpu_buffer);
1691} 2303}
1692 2304
@@ -1733,32 +2345,57 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
1733 event->time_delta = 1; 2345 event->time_delta = 1;
1734} 2346}
1735 2347
1736/** 2348/*
1737 * ring_buffer_event_discard - discard any event in the ring buffer 2349 * Decrement the entries to the page that an event is on.
1738 * @event: the event to discard 2350 * The event does not even need to exist, only the pointer
1739 * 2351 * to the page it is on. This may only be called before the commit
1740 * Sometimes a event that is in the ring buffer needs to be ignored. 2352 * takes place.
1741 * This function lets the user discard an event in the ring buffer
1742 * and then that event will not be read later.
1743 *
1744 * Note, it is up to the user to be careful with this, and protect
1745 * against races. If the user discards an event that has been consumed
1746 * it is possible that it could corrupt the ring buffer.
1747 */ 2353 */
1748void ring_buffer_event_discard(struct ring_buffer_event *event) 2354static inline void
2355rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
2356 struct ring_buffer_event *event)
1749{ 2357{
1750 rb_event_discard(event); 2358 unsigned long addr = (unsigned long)event;
2359 struct buffer_page *bpage = cpu_buffer->commit_page;
2360 struct buffer_page *start;
2361
2362 addr &= PAGE_MASK;
2363
2364 /* Do the likely case first */
2365 if (likely(bpage->page == (void *)addr)) {
2366 local_dec(&bpage->entries);
2367 return;
2368 }
2369
2370 /*
2371 * Because the commit page may be on the reader page we
2372 * start with the next page and check the end loop there.
2373 */
2374 rb_inc_page(cpu_buffer, &bpage);
2375 start = bpage;
2376 do {
2377 if (bpage->page == (void *)addr) {
2378 local_dec(&bpage->entries);
2379 return;
2380 }
2381 rb_inc_page(cpu_buffer, &bpage);
2382 } while (bpage != start);
2383
2384 /* commit not part of this buffer?? */
2385 RB_WARN_ON(cpu_buffer, 1);
1751} 2386}
1752EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
1753 2387
1754/** 2388/**
1755 * ring_buffer_commit_discard - discard an event that has not been committed 2389 * ring_buffer_commit_discard - discard an event that has not been committed
1756 * @buffer: the ring buffer 2390 * @buffer: the ring buffer
1757 * @event: non committed event to discard 2391 * @event: non committed event to discard
1758 * 2392 *
1759 * This is similar to ring_buffer_event_discard but must only be 2393 * Sometimes an event that is in the ring buffer needs to be ignored.
1760 * performed on an event that has not been committed yet. The difference 2394 * This function lets the user discard an event in the ring buffer
1761 * is that this will also try to free the event from the ring buffer 2395 * and then that event will not be read later.
2396 *
2397 * This function only works if it is called before the item has been
2398 * committed. It will try to free the event from the ring buffer
1762 * if another event has not been added behind it. 2399 * if another event has not been added behind it.
1763 * 2400 *
1764 * If another event has been added behind it, it will set the event 2401 * If another event has been added behind it, it will set the event
@@ -1786,14 +2423,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1786 */ 2423 */
1787 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); 2424 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1788 2425
2426 rb_decrement_entry(cpu_buffer, event);
1789 if (rb_try_to_discard(cpu_buffer, event)) 2427 if (rb_try_to_discard(cpu_buffer, event))
1790 goto out; 2428 goto out;
1791 2429
1792 /* 2430 /*
1793 * The commit is still visible by the reader, so we 2431 * The commit is still visible by the reader, so we
1794 * must increment entries. 2432 * must still update the timestamp.
1795 */ 2433 */
1796 local_inc(&cpu_buffer->entries); 2434 rb_update_write_stamp(cpu_buffer, event);
1797 out: 2435 out:
1798 rb_end_commit(cpu_buffer); 2436 rb_end_commit(cpu_buffer);
1799 2437
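
rb_decrement_entry() above finds the page an event lives on by masking the event address down to a page boundary and walking the page ring, trying the commit page first as the likely case. A small sketch of that address-to-page mapping, with a plain array standing in for the ring (all ex_* names and sizes assumed):

#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SIZE 4096UL
#define EX_PAGES     4

static char pages[EX_PAGES][EX_PAGE_SIZE] __attribute__((aligned(EX_PAGE_SIZE)));
static int entries[EX_PAGES];

static void ex_decrement_entry(void *event, int commit_page)
{
        uintptr_t addr = (uintptr_t)event & ~(EX_PAGE_SIZE - 1);
        int i, page;

        for (i = 0; i < EX_PAGES; i++) {
                page = (commit_page + i) % EX_PAGES;  /* likely case first */
                if ((uintptr_t)pages[page] == addr) {
                        entries[page]--;
                        return;
                }
        }
        fprintf(stderr, "event not part of this buffer??\n");
}

int main(void)
{
        entries[2] = 5;
        ex_decrement_entry(&pages[2][100], 1);        /* event lives on page 2 */
        printf("page 2 now has %d entries\n", entries[2]);
        return 0;
}
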
@@ -1854,7 +2492,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1854 if (length > BUF_MAX_DATA_SIZE) 2492 if (length > BUF_MAX_DATA_SIZE)
1855 goto out; 2493 goto out;
1856 2494
1857 event = rb_reserve_next_event(cpu_buffer, length); 2495 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1858 if (!event) 2496 if (!event)
1859 goto out; 2497 goto out;
1860 2498
@@ -1875,9 +2513,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
1875static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 2513static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1876{ 2514{
1877 struct buffer_page *reader = cpu_buffer->reader_page; 2515 struct buffer_page *reader = cpu_buffer->reader_page;
1878 struct buffer_page *head = cpu_buffer->head_page; 2516 struct buffer_page *head = rb_set_head_page(cpu_buffer);
1879 struct buffer_page *commit = cpu_buffer->commit_page; 2517 struct buffer_page *commit = cpu_buffer->commit_page;
1880 2518
2519 /* In case of error, head will be NULL */
2520 if (unlikely(!head))
2521 return 1;
2522
1881 return reader->read == rb_page_commit(reader) && 2523 return reader->read == rb_page_commit(reader) &&
1882 (commit == reader || 2524 (commit == reader ||
1883 (commit == head && 2525 (commit == head &&
@@ -1968,7 +2610,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1968 return 0; 2610 return 0;
1969 2611
1970 cpu_buffer = buffer->buffers[cpu]; 2612 cpu_buffer = buffer->buffers[cpu];
1971 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) 2613 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
1972 - cpu_buffer->read; 2614 - cpu_buffer->read;
1973 2615
1974 return ret; 2616 return ret;
@@ -1989,33 +2631,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1989 return 0; 2631 return 0;
1990 2632
1991 cpu_buffer = buffer->buffers[cpu]; 2633 cpu_buffer = buffer->buffers[cpu];
1992 ret = cpu_buffer->overrun; 2634 ret = local_read(&cpu_buffer->overrun);
1993 2635
1994 return ret; 2636 return ret;
1995} 2637}
1996EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2638EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1997 2639
1998/** 2640/**
1999 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
2000 * @buffer: The ring buffer
2001 * @cpu: The per CPU buffer to get the number of overruns from
2002 */
2003unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
2004{
2005 struct ring_buffer_per_cpu *cpu_buffer;
2006 unsigned long ret;
2007
2008 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2009 return 0;
2010
2011 cpu_buffer = buffer->buffers[cpu];
2012 ret = cpu_buffer->nmi_dropped;
2013
2014 return ret;
2015}
2016EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
2017
2018/**
2019 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 2641 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
2020 * @buffer: The ring buffer 2642 * @buffer: The ring buffer
2021 * @cpu: The per CPU buffer to get the number of overruns from 2643 * @cpu: The per CPU buffer to get the number of overruns from
@@ -2030,7 +2652,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
2030 return 0; 2652 return 0;
2031 2653
2032 cpu_buffer = buffer->buffers[cpu]; 2654 cpu_buffer = buffer->buffers[cpu];
2033 ret = cpu_buffer->commit_overrun; 2655 ret = local_read(&cpu_buffer->commit_overrun);
2034 2656
2035 return ret; 2657 return ret;
2036} 2658}
@@ -2053,7 +2675,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2053 for_each_buffer_cpu(buffer, cpu) { 2675 for_each_buffer_cpu(buffer, cpu) {
2054 cpu_buffer = buffer->buffers[cpu]; 2676 cpu_buffer = buffer->buffers[cpu];
2055 entries += (local_read(&cpu_buffer->entries) - 2677 entries += (local_read(&cpu_buffer->entries) -
2056 cpu_buffer->overrun) - cpu_buffer->read; 2678 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2057 } 2679 }
2058 2680
2059 return entries; 2681 return entries;
@@ -2076,7 +2698,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
2076 /* if you care about this being correct, lock the buffer */ 2698 /* if you care about this being correct, lock the buffer */
2077 for_each_buffer_cpu(buffer, cpu) { 2699 for_each_buffer_cpu(buffer, cpu) {
2078 cpu_buffer = buffer->buffers[cpu]; 2700 cpu_buffer = buffer->buffers[cpu];
2079 overruns += cpu_buffer->overrun; 2701 overruns += local_read(&cpu_buffer->overrun);
2080 } 2702 }
2081 2703
2082 return overruns; 2704 return overruns;
@@ -2089,8 +2711,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2089 2711
2090 /* Iterator usage is expected to have record disabled */ 2712 /* Iterator usage is expected to have record disabled */
2091 if (list_empty(&cpu_buffer->reader_page->list)) { 2713 if (list_empty(&cpu_buffer->reader_page->list)) {
2092 iter->head_page = cpu_buffer->head_page; 2714 iter->head_page = rb_set_head_page(cpu_buffer);
2093 iter->head = cpu_buffer->head_page->read; 2715 if (unlikely(!iter->head_page))
2716 return;
2717 iter->head = iter->head_page->read;
2094 } else { 2718 } else {
2095 iter->head_page = cpu_buffer->reader_page; 2719 iter->head_page = cpu_buffer->reader_page;
2096 iter->head = cpu_buffer->reader_page->read; 2720 iter->head = cpu_buffer->reader_page->read;
@@ -2207,6 +2831,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2207 struct buffer_page *reader = NULL; 2831 struct buffer_page *reader = NULL;
2208 unsigned long flags; 2832 unsigned long flags;
2209 int nr_loops = 0; 2833 int nr_loops = 0;
2834 int ret;
2210 2835
2211 local_irq_save(flags); 2836 local_irq_save(flags);
2212 __raw_spin_lock(&cpu_buffer->lock); 2837 __raw_spin_lock(&cpu_buffer->lock);
@@ -2240,30 +2865,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2240 goto out; 2865 goto out;
2241 2866
2242 /* 2867 /*
2243 * Splice the empty reader page into the list around the head.
2244 * Reset the reader page to size zero. 2868 * Reset the reader page to size zero.
2245 */ 2869 */
2870 local_set(&cpu_buffer->reader_page->write, 0);
2871 local_set(&cpu_buffer->reader_page->entries, 0);
2872 local_set(&cpu_buffer->reader_page->page->commit, 0);
2246 2873
2247 reader = cpu_buffer->head_page; 2874 spin:
2875 /*
2876 * Splice the empty reader page into the list around the head.
2877 */
2878 reader = rb_set_head_page(cpu_buffer);
2248 cpu_buffer->reader_page->list.next = reader->list.next; 2879 cpu_buffer->reader_page->list.next = reader->list.next;
2249 cpu_buffer->reader_page->list.prev = reader->list.prev; 2880 cpu_buffer->reader_page->list.prev = reader->list.prev;
2250 2881
2251 local_set(&cpu_buffer->reader_page->write, 0); 2882 /*
2252 local_set(&cpu_buffer->reader_page->entries, 0); 2883 * cpu_buffer->pages just needs to point to the buffer, it
2253 local_set(&cpu_buffer->reader_page->page->commit, 0); 2884 * has no specific buffer page to point to. Let's move it out
2885 * of our way so we don't accidentally swap it.
2886 */
2887 cpu_buffer->pages = reader->list.prev;
2254 2888
2255 /* Make the reader page now replace the head */ 2889 /* The reader page will be pointing to the new head */
2256 reader->list.prev->next = &cpu_buffer->reader_page->list; 2890 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2257 reader->list.next->prev = &cpu_buffer->reader_page->list;
2258 2891
2259 /* 2892 /*
2260 * If the tail is on the reader, then we must set the head 2893 * Here's the tricky part.
2261 * to the inserted page, otherwise we set it one before. 2894 *
2895 * We need to move the pointer past the header page.
2896 * But we can only do that if a writer is not currently
2897 * moving it. The page before the header page has the
2898 * flag bit '1' set if it is pointing to the page we want,
2899 * but if the writer is in the process of moving it
2900 * then it will be '2' or already moved '0'.
2262 */ 2901 */
2263 cpu_buffer->head_page = cpu_buffer->reader_page;
2264 2902
2265 if (cpu_buffer->commit_page != reader) 2903 ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
2266 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2904
2905 /*
2906 * If we did not convert it, then we must try again.
2907 */
2908 if (!ret)
2909 goto spin;
2910
2911 /*
2912 * Yeah! We succeeded in replacing the page.
2913 *
2914 * Now make the new head point back to the reader page.
2915 */
2916 reader->list.next->prev = &cpu_buffer->reader_page->list;
2917 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2267 2918
2268 /* Finally update the reader page to the new head */ 2919 /* Finally update the reader page to the new head */
2269 cpu_buffer->reader_page = reader; 2920 cpu_buffer->reader_page = reader;
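
The spin: loop above is the reader side of the same protocol: pick whatever currently looks like the head page, try to swap the spare reader page in with one atomic replace, and retry if a writer moved the head in the meantime. Reduced to its skeleton, with indices instead of tagged list pointers and assumed ex_* names:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int head_page;              /* advanced by writers */

static int ex_get_reader_page(int reader_page)
{
        int reader;

        do {
                reader = atomic_load(&head_page);  /* like rb_set_head_page()     */
                /* like rb_head_page_replace(): succeeds only if head unchanged   */
        } while (!atomic_compare_exchange_strong(&head_page, &reader,
                                                 reader_page));

        return reader;                     /* old head becomes the new reader page */
}

int main(void)
{
        atomic_store(&head_page, 3);
        printf("reader now owns page %d\n", ex_get_reader_page(7));
        printf("head page is now %d\n", atomic_load(&head_page));
        return 0;
}
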
@@ -2292,8 +2943,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2292 2943
2293 event = rb_reader_event(cpu_buffer); 2944 event = rb_reader_event(cpu_buffer);
2294 2945
2295 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX 2946 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
2296 || rb_discarded_event(event))
2297 cpu_buffer->read++; 2947 cpu_buffer->read++;
2298 2948
2299 rb_update_read_stamp(cpu_buffer, event); 2949 rb_update_read_stamp(cpu_buffer, event);
@@ -2525,10 +3175,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2525 spin_unlock(&cpu_buffer->reader_lock); 3175 spin_unlock(&cpu_buffer->reader_lock);
2526 local_irq_restore(flags); 3176 local_irq_restore(flags);
2527 3177
2528 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3178 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2529 cpu_relax();
2530 goto again; 3179 goto again;
2531 }
2532 3180
2533 return event; 3181 return event;
2534} 3182}
@@ -2553,10 +3201,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2553 event = rb_iter_peek(iter, ts); 3201 event = rb_iter_peek(iter, ts);
2554 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3202 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2555 3203
2556 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3204 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2557 cpu_relax();
2558 goto again; 3205 goto again;
2559 }
2560 3206
2561 return event; 3207 return event;
2562} 3208}
@@ -2602,10 +3248,8 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2602 out: 3248 out:
2603 preempt_enable(); 3249 preempt_enable();
2604 3250
2605 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3251 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2606 cpu_relax();
2607 goto again; 3252 goto again;
2608 }
2609 3253
2610 return event; 3254 return event;
2611} 3255}
@@ -2685,21 +3329,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2685 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3329 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2686 unsigned long flags; 3330 unsigned long flags;
2687 3331
2688 again:
2689 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3332 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3333 again:
2690 event = rb_iter_peek(iter, ts); 3334 event = rb_iter_peek(iter, ts);
2691 if (!event) 3335 if (!event)
2692 goto out; 3336 goto out;
2693 3337
3338 if (event->type_len == RINGBUF_TYPE_PADDING)
3339 goto again;
3340
2694 rb_advance_iter(iter); 3341 rb_advance_iter(iter);
2695 out: 3342 out:
2696 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3343 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2697 3344
2698 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2699 cpu_relax();
2700 goto again;
2701 }
2702
2703 return event; 3345 return event;
2704} 3346}
2705EXPORT_SYMBOL_GPL(ring_buffer_read); 3347EXPORT_SYMBOL_GPL(ring_buffer_read);
@@ -2717,8 +3359,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
2717static void 3359static void
2718rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 3360rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2719{ 3361{
3362 rb_head_page_deactivate(cpu_buffer);
3363
2720 cpu_buffer->head_page 3364 cpu_buffer->head_page
2721 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 3365 = list_entry(cpu_buffer->pages, struct buffer_page, list);
2722 local_set(&cpu_buffer->head_page->write, 0); 3366 local_set(&cpu_buffer->head_page->write, 0);
2723 local_set(&cpu_buffer->head_page->entries, 0); 3367 local_set(&cpu_buffer->head_page->entries, 0);
2724 local_set(&cpu_buffer->head_page->page->commit, 0); 3368 local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2734,16 +3378,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2734 local_set(&cpu_buffer->reader_page->page->commit, 0); 3378 local_set(&cpu_buffer->reader_page->page->commit, 0);
2735 cpu_buffer->reader_page->read = 0; 3379 cpu_buffer->reader_page->read = 0;
2736 3380
2737 cpu_buffer->nmi_dropped = 0; 3381 local_set(&cpu_buffer->commit_overrun, 0);
2738 cpu_buffer->commit_overrun = 0; 3382 local_set(&cpu_buffer->overrun, 0);
2739 cpu_buffer->overrun = 0;
2740 cpu_buffer->read = 0;
2741 local_set(&cpu_buffer->entries, 0); 3383 local_set(&cpu_buffer->entries, 0);
2742 local_set(&cpu_buffer->committing, 0); 3384 local_set(&cpu_buffer->committing, 0);
2743 local_set(&cpu_buffer->commits, 0); 3385 local_set(&cpu_buffer->commits, 0);
3386 cpu_buffer->read = 0;
2744 3387
2745 cpu_buffer->write_stamp = 0; 3388 cpu_buffer->write_stamp = 0;
2746 cpu_buffer->read_stamp = 0; 3389 cpu_buffer->read_stamp = 0;
3390
3391 rb_head_page_activate(cpu_buffer);
2747} 3392}
2748 3393
2749/** 3394/**
@@ -2763,12 +3408,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2763 3408
2764 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3409 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2765 3410
3411 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3412 goto out;
3413
2766 __raw_spin_lock(&cpu_buffer->lock); 3414 __raw_spin_lock(&cpu_buffer->lock);
2767 3415
2768 rb_reset_cpu(cpu_buffer); 3416 rb_reset_cpu(cpu_buffer);
2769 3417
2770 __raw_spin_unlock(&cpu_buffer->lock); 3418 __raw_spin_unlock(&cpu_buffer->lock);
2771 3419
3420 out:
2772 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3421 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2773 3422
2774 atomic_dec(&cpu_buffer->record_disabled); 3423 atomic_dec(&cpu_buffer->record_disabled);
@@ -2851,6 +3500,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2851} 3500}
2852EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); 3501EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2853 3502
3503#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2854/** 3504/**
2855 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 3505 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
2856 * @buffer_a: One buffer to swap with 3506 * @buffer_a: One buffer to swap with
@@ -2905,20 +3555,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2905 atomic_inc(&cpu_buffer_a->record_disabled); 3555 atomic_inc(&cpu_buffer_a->record_disabled);
2906 atomic_inc(&cpu_buffer_b->record_disabled); 3556 atomic_inc(&cpu_buffer_b->record_disabled);
2907 3557
3558 ret = -EBUSY;
3559 if (local_read(&cpu_buffer_a->committing))
3560 goto out_dec;
3561 if (local_read(&cpu_buffer_b->committing))
3562 goto out_dec;
3563
2908 buffer_a->buffers[cpu] = cpu_buffer_b; 3564 buffer_a->buffers[cpu] = cpu_buffer_b;
2909 buffer_b->buffers[cpu] = cpu_buffer_a; 3565 buffer_b->buffers[cpu] = cpu_buffer_a;
2910 3566
2911 cpu_buffer_b->buffer = buffer_a; 3567 cpu_buffer_b->buffer = buffer_a;
2912 cpu_buffer_a->buffer = buffer_b; 3568 cpu_buffer_a->buffer = buffer_b;
2913 3569
3570 ret = 0;
3571
3572out_dec:
2914 atomic_dec(&cpu_buffer_a->record_disabled); 3573 atomic_dec(&cpu_buffer_a->record_disabled);
2915 atomic_dec(&cpu_buffer_b->record_disabled); 3574 atomic_dec(&cpu_buffer_b->record_disabled);
2916
2917 ret = 0;
2918out: 3575out:
2919 return ret; 3576 return ret;
2920} 3577}
2921EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 3578EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3579#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
2922 3580
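
ring_buffer_swap_cpu() now refuses to swap while either per-CPU buffer has a commit in flight and reports -EBUSY, which the tracer copes with further down in trace.c. The gist, as a userspace sketch with assumed ex_* names:

#include <stdatomic.h>
#include <stdio.h>

#define EX_EBUSY 16

struct ex_cpu_buffer {
        atomic_int committing;
        int id;
};

static int ex_swap(struct ex_cpu_buffer **a, struct ex_cpu_buffer **b)
{
        struct ex_cpu_buffer *tmp;

        if (atomic_load(&(*a)->committing) || atomic_load(&(*b)->committing))
                return -EX_EBUSY;          /* a writer is mid-commit       */

        tmp = *a;                          /* nothing in flight: swap them */
        *a = *b;
        *b = tmp;
        return 0;
}

int main(void)
{
        struct ex_cpu_buffer x = { .id = 1 }, y = { .id = 2 };
        struct ex_cpu_buffer *pa = &x, *pb = &y;
        int ret = ex_swap(&pa, &pb);

        printf("swap returned %d, a is now buffer %d\n", ret, pa->id);
        return 0;
}
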
2923/** 3581/**
2924 * ring_buffer_alloc_read_page - allocate a page to read from buffer 3582 * ring_buffer_alloc_read_page - allocate a page to read from buffer
@@ -3091,7 +3749,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3091 read = 0; 3749 read = 0;
3092 } else { 3750 } else {
3093 /* update the entry counter */ 3751 /* update the entry counter */
3094 cpu_buffer->read += local_read(&reader->entries); 3752 cpu_buffer->read += rb_page_entries(reader);
3095 3753
3096 /* swap the pages */ 3754 /* swap the pages */
3097 rb_init_page(bpage); 3755 rb_init_page(bpage);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8c358395d338..5c75deeefe30 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -43,14 +43,11 @@
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45 45
46unsigned long __read_mostly tracing_max_latency;
47unsigned long __read_mostly tracing_thresh;
48
49/* 46/*
50 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
51 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
52 */ 49 */
53static int ring_buffer_expanded; 50int ring_buffer_expanded;
54 51
55/* 52/*
56 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -64,7 +61,7 @@ static bool __read_mostly tracing_selftest_running;
64/* 61/*
65 * If a tracer is running, we do not want to run SELFTEST. 62 * If a tracer is running, we do not want to run SELFTEST.
66 */ 63 */
67static bool __read_mostly tracing_selftest_disabled; 64bool __read_mostly tracing_selftest_disabled;
68 65
69/* For tracers that don't implement custom flags */ 66/* For tracers that don't implement custom flags */
70static struct tracer_opt dummy_tracer_opt[] = { 67static struct tracer_opt dummy_tracer_opt[] = {
@@ -89,7 +86,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
89 */ 86 */
90static int tracing_disabled = 1; 87static int tracing_disabled = 1;
91 88
92static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
93 90
94static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
95{ 92{
@@ -172,10 +169,11 @@ static struct trace_array global_trace;
172 169
173static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 170static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
174 171
175int filter_current_check_discard(struct ftrace_event_call *call, void *rec, 172int filter_current_check_discard(struct ring_buffer *buffer,
173 struct ftrace_event_call *call, void *rec,
176 struct ring_buffer_event *event) 174 struct ring_buffer_event *event)
177{ 175{
178 return filter_check_discard(call, rec, global_trace.buffer, event); 176 return filter_check_discard(call, rec, buffer, event);
179} 177}
180EXPORT_SYMBOL_GPL(filter_current_check_discard); 178EXPORT_SYMBOL_GPL(filter_current_check_discard);
181 179
@@ -266,6 +264,9 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
266 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 264 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
267 TRACE_ITER_GRAPH_TIME; 265 TRACE_ITER_GRAPH_TIME;
268 266
267static int trace_stop_count;
268static DEFINE_SPINLOCK(tracing_start_lock);
269
269/** 270/**
270 * trace_wake_up - wake up tasks waiting for trace input 271 * trace_wake_up - wake up tasks waiting for trace input
271 * 272 *
@@ -323,50 +324,20 @@ static const char *trace_options[] = {
323 "printk-msg-only", 324 "printk-msg-only",
324 "context-info", 325 "context-info",
325 "latency-format", 326 "latency-format",
326 "global-clock",
327 "sleep-time", 327 "sleep-time",
328 "graph-time", 328 "graph-time",
329 NULL 329 NULL
330}; 330};
331 331
332/* 332static struct {
333 * ftrace_max_lock is used to protect the swapping of buffers 333 u64 (*func)(void);
334 * when taking a max snapshot. The buffers themselves are 334 const char *name;
335 * protected by per_cpu spinlocks. But the action of the swap 335} trace_clocks[] = {
336 * needs its own lock. 336 { trace_clock_local, "local" },
337 * 337 { trace_clock_global, "global" },
338 * This is defined as a raw_spinlock_t in order to help 338};
339 * with performance when lockdep debugging is enabled.
340 */
341static raw_spinlock_t ftrace_max_lock =
342 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
343
344/*
345 * Copy the new maximum trace into the separate maximum-trace
346 * structure. (this way the maximum trace is permanently saved,
347 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
348 */
349static void
350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
351{
352 struct trace_array_cpu *data = tr->data[cpu];
353
354 max_tr.cpu = cpu;
355 max_tr.time_start = data->preempt_timestamp;
356 339
357 data = max_tr.data[cpu]; 340int trace_clock_id;
358 data->saved_latency = tracing_max_latency;
359
360 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
361 data->pid = tsk->pid;
362 data->uid = task_uid(tsk);
363 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
364 data->policy = tsk->policy;
365 data->rt_priority = tsk->rt_priority;
366
367 /* record this tasks comm */
368 tracing_record_cmdline(tsk);
369}
370 341
371ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 342ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
372{ 343{
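
The trace_clocks[] table added earlier in this hunk pairs each clock callback with a name and is selected by the single integer trace_clock_id. A self-contained sketch of the same table-of-clocks pattern; the ex_clock_* functions are stand-ins, not the kernel's trace_clock_local()/trace_clock_global():

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t ex_clock_local(void)
{
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static uint64_t ex_clock_global(void)
{
        struct timespec ts;
        clock_gettime(CLOCK_REALTIME, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static struct {
        uint64_t (*func)(void);
        const char *name;
} ex_clocks[] = {
        { ex_clock_local,  "local"  },
        { ex_clock_global, "global" },
};

static int ex_clock_id;                    /* plays the role of trace_clock_id */

int main(void)
{
        printf("%s clock: %llu\n", ex_clocks[ex_clock_id].name,
               (unsigned long long)ex_clocks[ex_clock_id].func());
        return 0;
}

Switching clocks is then just a matter of storing a different index, which is what the new trace_clock_id variable is for.
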
@@ -411,6 +382,56 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
411 return cnt; 382 return cnt;
412} 383}
413 384
385/*
386 * ftrace_max_lock is used to protect the swapping of buffers
387 * when taking a max snapshot. The buffers themselves are
388 * protected by per_cpu spinlocks. But the action of the swap
389 * needs its own lock.
390 *
391 * This is defined as a raw_spinlock_t in order to help
392 * with performance when lockdep debugging is enabled.
393 *
394 * It is also used in other places outside the update_max_tr
395 * so it needs to be defined outside of the
396 * CONFIG_TRACER_MAX_TRACE.
397 */
398static raw_spinlock_t ftrace_max_lock =
399 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
400
401#ifdef CONFIG_TRACER_MAX_TRACE
402unsigned long __read_mostly tracing_max_latency;
403unsigned long __read_mostly tracing_thresh;
404
405/*
406 * Copy the new maximum trace into the separate maximum-trace
407 * structure. (this way the maximum trace is permanently saved,
408 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
409 */
410static void
411__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
412{
413 struct trace_array_cpu *data = tr->data[cpu];
414 struct trace_array_cpu *max_data = tr->data[cpu];
415
416 max_tr.cpu = cpu;
417 max_tr.time_start = data->preempt_timestamp;
418
419 max_data = max_tr.data[cpu];
420 max_data->saved_latency = tracing_max_latency;
421 max_data->critical_start = data->critical_start;
422 max_data->critical_end = data->critical_end;
423
424 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
425 max_data->pid = tsk->pid;
426 max_data->uid = task_uid(tsk);
427 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
428 max_data->policy = tsk->policy;
429 max_data->rt_priority = tsk->rt_priority;
430
431 /* record this task's comm */
432 tracing_record_cmdline(tsk);
433}
434
414/** 435/**
415 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 436 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
416 * @tr: tracer 437 * @tr: tracer
@@ -425,16 +446,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
425{ 446{
426 struct ring_buffer *buf = tr->buffer; 447 struct ring_buffer *buf = tr->buffer;
427 448
449 if (trace_stop_count)
450 return;
451
428 WARN_ON_ONCE(!irqs_disabled()); 452 WARN_ON_ONCE(!irqs_disabled());
429 __raw_spin_lock(&ftrace_max_lock); 453 __raw_spin_lock(&ftrace_max_lock);
430 454
431 tr->buffer = max_tr.buffer; 455 tr->buffer = max_tr.buffer;
432 max_tr.buffer = buf; 456 max_tr.buffer = buf;
433 457
434 ftrace_disable_cpu();
435 ring_buffer_reset(tr->buffer);
436 ftrace_enable_cpu();
437
438 __update_max_tr(tr, tsk, cpu); 458 __update_max_tr(tr, tsk, cpu);
439 __raw_spin_unlock(&ftrace_max_lock); 459 __raw_spin_unlock(&ftrace_max_lock);
440} 460}
@@ -452,21 +472,35 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
452{ 472{
453 int ret; 473 int ret;
454 474
475 if (trace_stop_count)
476 return;
477
455 WARN_ON_ONCE(!irqs_disabled()); 478 WARN_ON_ONCE(!irqs_disabled());
456 __raw_spin_lock(&ftrace_max_lock); 479 __raw_spin_lock(&ftrace_max_lock);
457 480
458 ftrace_disable_cpu(); 481 ftrace_disable_cpu();
459 482
460 ring_buffer_reset(max_tr.buffer);
461 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 483 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
462 484
485 if (ret == -EBUSY) {
486 /*
487 * We failed to swap the buffer due to a commit taking
488 * place on this CPU. We fail to record, but we reset
489 * the max trace buffer (no one writes directly to it)
490 * and flag that it failed.
491 */
492 trace_array_printk(&max_tr, _THIS_IP_,
493 "Failed to swap buffers due to commit in progress\n");
494 }
495
463 ftrace_enable_cpu(); 496 ftrace_enable_cpu();
464 497
465 WARN_ON_ONCE(ret && ret != -EAGAIN); 498 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
466 499
467 __update_max_tr(tr, tsk, cpu); 500 __update_max_tr(tr, tsk, cpu);
468 __raw_spin_unlock(&ftrace_max_lock); 501 __raw_spin_unlock(&ftrace_max_lock);
469} 502}
503#endif /* CONFIG_TRACER_MAX_TRACE */
470 504
471/** 505/**
472 * register_tracer - register a tracer with the ftrace system. 506 * register_tracer - register a tracer with the ftrace system.
@@ -523,7 +557,6 @@ __acquires(kernel_lock)
523 if (type->selftest && !tracing_selftest_disabled) { 557 if (type->selftest && !tracing_selftest_disabled) {
524 struct tracer *saved_tracer = current_trace; 558 struct tracer *saved_tracer = current_trace;
525 struct trace_array *tr = &global_trace; 559 struct trace_array *tr = &global_trace;
526 int i;
527 560
528 /* 561 /*
529 * Run a selftest on this tracer. 562 * Run a selftest on this tracer.
@@ -532,8 +565,7 @@ __acquires(kernel_lock)
532 * internal tracing to verify that everything is in order. 565 * internal tracing to verify that everything is in order.
533 * If we fail, we do not register this tracer. 566 * If we fail, we do not register this tracer.
534 */ 567 */
535 for_each_tracing_cpu(i) 568 tracing_reset_online_cpus(tr);
536 tracing_reset(tr, i);
537 569
538 current_trace = type; 570 current_trace = type;
539 /* the test is responsible for initializing and enabling */ 571 /* the test is responsible for initializing and enabling */
@@ -546,8 +578,7 @@ __acquires(kernel_lock)
546 goto out; 578 goto out;
547 } 579 }
548 /* Only reset on passing, to avoid touching corrupted buffers */ 580 /* Only reset on passing, to avoid touching corrupted buffers */
549 for_each_tracing_cpu(i) 581 tracing_reset_online_cpus(tr);
550 tracing_reset(tr, i);
551 582
552 printk(KERN_CONT "PASSED\n"); 583 printk(KERN_CONT "PASSED\n");
553 } 584 }
@@ -622,21 +653,42 @@ void unregister_tracer(struct tracer *type)
622 mutex_unlock(&trace_types_lock); 653 mutex_unlock(&trace_types_lock);
623} 654}
624 655
625void tracing_reset(struct trace_array *tr, int cpu) 656static void __tracing_reset(struct trace_array *tr, int cpu)
626{ 657{
627 ftrace_disable_cpu(); 658 ftrace_disable_cpu();
628 ring_buffer_reset_cpu(tr->buffer, cpu); 659 ring_buffer_reset_cpu(tr->buffer, cpu);
629 ftrace_enable_cpu(); 660 ftrace_enable_cpu();
630} 661}
631 662
663void tracing_reset(struct trace_array *tr, int cpu)
664{
665 struct ring_buffer *buffer = tr->buffer;
666
667 ring_buffer_record_disable(buffer);
668
669 /* Make sure all commits have finished */
670 synchronize_sched();
671 __tracing_reset(tr, cpu);
672
673 ring_buffer_record_enable(buffer);
674}
675
632void tracing_reset_online_cpus(struct trace_array *tr) 676void tracing_reset_online_cpus(struct trace_array *tr)
633{ 677{
678 struct ring_buffer *buffer = tr->buffer;
634 int cpu; 679 int cpu;
635 680
681 ring_buffer_record_disable(buffer);
682
683 /* Make sure all commits have finished */
684 synchronize_sched();
685
636 tr->time_start = ftrace_now(tr->cpu); 686 tr->time_start = ftrace_now(tr->cpu);
637 687
638 for_each_online_cpu(cpu) 688 for_each_online_cpu(cpu)
639 tracing_reset(tr, cpu); 689 __tracing_reset(tr, cpu);
690
691 ring_buffer_record_enable(buffer);
640} 692}
641 693
642void tracing_reset_current(int cpu) 694void tracing_reset_current(int cpu)
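
tracing_reset() and tracing_reset_online_cpus() above follow one ordering: stop new writes, wait for commits already in flight to finish (synchronize_sched() in the kernel), only then clear the buffers, and finally re-enable recording. A single-threaded userspace sketch of that ordering, with an in-flight counter standing in for synchronize_sched():

#include <stdatomic.h>
#include <stdio.h>

static atomic_int record_disabled;
static atomic_int in_flight;
static int buffer_len;

static void ex_write(void)
{
        if (atomic_load(&record_disabled))
                return;                    /* new writers see the disable  */
        atomic_fetch_add(&in_flight, 1);
        buffer_len++;                      /* the actual write             */
        atomic_fetch_sub(&in_flight, 1);
}

static void ex_reset(void)
{
        atomic_fetch_add(&record_disabled, 1);
        while (atomic_load(&in_flight))    /* "make sure all commits have  */
                ;                          /*  finished"                   */
        buffer_len = 0;
        atomic_fetch_sub(&record_disabled, 1);
}

int main(void)
{
        ex_write();
        ex_write();
        ex_reset();
        printf("buffer length after reset: %d\n", buffer_len);
        return 0;
}
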
@@ -667,9 +719,6 @@ static void trace_init_cmdlines(void)
667 cmdline_idx = 0; 719 cmdline_idx = 0;
668} 720}
669 721
670static int trace_stop_count;
671static DEFINE_SPINLOCK(tracing_start_lock);
672
673/** 722/**
674 * ftrace_off_permanent - disable all ftrace code permanently 723 * ftrace_off_permanent - disable all ftrace code permanently
675 * 724 *
@@ -850,14 +899,15 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
850} 899}
851EXPORT_SYMBOL_GPL(tracing_generic_entry_update); 900EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
852 901
853struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 902struct ring_buffer_event *
854 int type, 903trace_buffer_lock_reserve(struct ring_buffer *buffer,
855 unsigned long len, 904 int type,
856 unsigned long flags, int pc) 905 unsigned long len,
906 unsigned long flags, int pc)
857{ 907{
858 struct ring_buffer_event *event; 908 struct ring_buffer_event *event;
859 909
860 event = ring_buffer_lock_reserve(tr->buffer, len); 910 event = ring_buffer_lock_reserve(buffer, len);
861 if (event != NULL) { 911 if (event != NULL) {
862 struct trace_entry *ent = ring_buffer_event_data(event); 912 struct trace_entry *ent = ring_buffer_event_data(event);
863 913
@@ -867,58 +917,60 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
867 917
868 return event; 918 return event;
869} 919}
870static void ftrace_trace_stack(struct trace_array *tr,
871 unsigned long flags, int skip, int pc);
872static void ftrace_trace_userstack(struct trace_array *tr,
873 unsigned long flags, int pc);
874 920
875static inline void __trace_buffer_unlock_commit(struct trace_array *tr, 921static inline void
876 struct ring_buffer_event *event, 922__trace_buffer_unlock_commit(struct ring_buffer *buffer,
877 unsigned long flags, int pc, 923 struct ring_buffer_event *event,
878 int wake) 924 unsigned long flags, int pc,
925 int wake)
879{ 926{
880 ring_buffer_unlock_commit(tr->buffer, event); 927 ring_buffer_unlock_commit(buffer, event);
881 928
882 ftrace_trace_stack(tr, flags, 6, pc); 929 ftrace_trace_stack(buffer, flags, 6, pc);
883 ftrace_trace_userstack(tr, flags, pc); 930 ftrace_trace_userstack(buffer, flags, pc);
884 931
885 if (wake) 932 if (wake)
886 trace_wake_up(); 933 trace_wake_up();
887} 934}
888 935
889void trace_buffer_unlock_commit(struct trace_array *tr, 936void trace_buffer_unlock_commit(struct ring_buffer *buffer,
890 struct ring_buffer_event *event, 937 struct ring_buffer_event *event,
891 unsigned long flags, int pc) 938 unsigned long flags, int pc)
892{ 939{
893 __trace_buffer_unlock_commit(tr, event, flags, pc, 1); 940 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
894} 941}
895 942
896struct ring_buffer_event * 943struct ring_buffer_event *
897trace_current_buffer_lock_reserve(int type, unsigned long len, 944trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
945 int type, unsigned long len,
898 unsigned long flags, int pc) 946 unsigned long flags, int pc)
899{ 947{
900 return trace_buffer_lock_reserve(&global_trace, 948 *current_rb = global_trace.buffer;
949 return trace_buffer_lock_reserve(*current_rb,
901 type, len, flags, pc); 950 type, len, flags, pc);
902} 951}
903EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); 952EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
904 953
905void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 954void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
955 struct ring_buffer_event *event,
906 unsigned long flags, int pc) 956 unsigned long flags, int pc)
907{ 957{
908 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); 958 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
909} 959}
910EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 960EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
911 961
912void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 962void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
913 unsigned long flags, int pc) 963 struct ring_buffer_event *event,
964 unsigned long flags, int pc)
914{ 965{
915 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); 966 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
916} 967}
917EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 968EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
918 969
919void trace_current_buffer_discard_commit(struct ring_buffer_event *event) 970void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
971 struct ring_buffer_event *event)
920{ 972{
921 ring_buffer_discard_commit(global_trace.buffer, event); 973 ring_buffer_discard_commit(buffer, event);
922} 974}
923EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); 975EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
924 976
@@ -928,6 +980,7 @@ trace_function(struct trace_array *tr,
928 int pc) 980 int pc)
929{ 981{
930 struct ftrace_event_call *call = &event_function; 982 struct ftrace_event_call *call = &event_function;
983 struct ring_buffer *buffer = tr->buffer;
931 struct ring_buffer_event *event; 984 struct ring_buffer_event *event;
932 struct ftrace_entry *entry; 985 struct ftrace_entry *entry;
933 986
@@ -935,7 +988,7 @@ trace_function(struct trace_array *tr,
935 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 988 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
936 return; 989 return;
937 990
938 event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), 991 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
939 flags, pc); 992 flags, pc);
940 if (!event) 993 if (!event)
941 return; 994 return;
@@ -943,58 +996,10 @@ trace_function(struct trace_array *tr,
943 entry->ip = ip; 996 entry->ip = ip;
944 entry->parent_ip = parent_ip; 997 entry->parent_ip = parent_ip;
945 998
946 if (!filter_check_discard(call, entry, tr->buffer, event)) 999 if (!filter_check_discard(call, entry, buffer, event))
947 ring_buffer_unlock_commit(tr->buffer, event); 1000 ring_buffer_unlock_commit(buffer, event);
948}
949
950#ifdef CONFIG_FUNCTION_GRAPH_TRACER
951static int __trace_graph_entry(struct trace_array *tr,
952 struct ftrace_graph_ent *trace,
953 unsigned long flags,
954 int pc)
955{
956 struct ftrace_event_call *call = &event_funcgraph_entry;
957 struct ring_buffer_event *event;
958 struct ftrace_graph_ent_entry *entry;
959
960 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
961 return 0;
962
963 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
964 sizeof(*entry), flags, pc);
965 if (!event)
966 return 0;
967 entry = ring_buffer_event_data(event);
968 entry->graph_ent = *trace;
969 if (!filter_current_check_discard(call, entry, event))
970 ring_buffer_unlock_commit(global_trace.buffer, event);
971
972 return 1;
973} 1001}
974 1002
975static void __trace_graph_return(struct trace_array *tr,
976 struct ftrace_graph_ret *trace,
977 unsigned long flags,
978 int pc)
979{
980 struct ftrace_event_call *call = &event_funcgraph_exit;
981 struct ring_buffer_event *event;
982 struct ftrace_graph_ret_entry *entry;
983
984 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
985 return;
986
987 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
988 sizeof(*entry), flags, pc);
989 if (!event)
990 return;
991 entry = ring_buffer_event_data(event);
992 entry->ret = *trace;
993 if (!filter_current_check_discard(call, entry, event))
994 ring_buffer_unlock_commit(global_trace.buffer, event);
995}
996#endif
997
998void 1003void
999ftrace(struct trace_array *tr, struct trace_array_cpu *data, 1004ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1000 unsigned long ip, unsigned long parent_ip, unsigned long flags, 1005 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -1004,17 +1009,17 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1004 trace_function(tr, ip, parent_ip, flags, pc); 1009 trace_function(tr, ip, parent_ip, flags, pc);
1005} 1010}
1006 1011
1007static void __ftrace_trace_stack(struct trace_array *tr, 1012#ifdef CONFIG_STACKTRACE
1013static void __ftrace_trace_stack(struct ring_buffer *buffer,
1008 unsigned long flags, 1014 unsigned long flags,
1009 int skip, int pc) 1015 int skip, int pc)
1010{ 1016{
1011#ifdef CONFIG_STACKTRACE
1012 struct ftrace_event_call *call = &event_kernel_stack; 1017 struct ftrace_event_call *call = &event_kernel_stack;
1013 struct ring_buffer_event *event; 1018 struct ring_buffer_event *event;
1014 struct stack_entry *entry; 1019 struct stack_entry *entry;
1015 struct stack_trace trace; 1020 struct stack_trace trace;
1016 1021
1017 event = trace_buffer_lock_reserve(tr, TRACE_STACK, 1022 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1018 sizeof(*entry), flags, pc); 1023 sizeof(*entry), flags, pc);
1019 if (!event) 1024 if (!event)
1020 return; 1025 return;
@@ -1027,32 +1032,28 @@ static void __ftrace_trace_stack(struct trace_array *tr,
1027 trace.entries = entry->caller; 1032 trace.entries = entry->caller;
1028 1033
1029 save_stack_trace(&trace); 1034 save_stack_trace(&trace);
1030 if (!filter_check_discard(call, entry, tr->buffer, event)) 1035 if (!filter_check_discard(call, entry, buffer, event))
1031 ring_buffer_unlock_commit(tr->buffer, event); 1036 ring_buffer_unlock_commit(buffer, event);
1032#endif
1033} 1037}
1034 1038
1035static void ftrace_trace_stack(struct trace_array *tr, 1039void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1036 unsigned long flags, 1040 int skip, int pc)
1037 int skip, int pc)
1038{ 1041{
1039 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1042 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1040 return; 1043 return;
1041 1044
1042 __ftrace_trace_stack(tr, flags, skip, pc); 1045 __ftrace_trace_stack(buffer, flags, skip, pc);
1043} 1046}
1044 1047
1045void __trace_stack(struct trace_array *tr, 1048void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1046 unsigned long flags, 1049 int pc)
1047 int skip, int pc)
1048{ 1050{
1049 __ftrace_trace_stack(tr, flags, skip, pc); 1051 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1050} 1052}
1051 1053
1052static void ftrace_trace_userstack(struct trace_array *tr, 1054void
1053 unsigned long flags, int pc) 1055ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1054{ 1056{
1055#ifdef CONFIG_STACKTRACE
1056 struct ftrace_event_call *call = &event_user_stack; 1057 struct ftrace_event_call *call = &event_user_stack;
1057 struct ring_buffer_event *event; 1058 struct ring_buffer_event *event;
1058 struct userstack_entry *entry; 1059 struct userstack_entry *entry;
@@ -1061,7 +1062,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1061 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1062 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1062 return; 1063 return;
1063 1064
1064 event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, 1065 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1065 sizeof(*entry), flags, pc); 1066 sizeof(*entry), flags, pc);
1066 if (!event) 1067 if (!event)
1067 return; 1068 return;
@@ -1075,9 +1076,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1075 trace.entries = entry->caller; 1076 trace.entries = entry->caller;
1076 1077
1077 save_stack_trace_user(&trace); 1078 save_stack_trace_user(&trace);
1078 if (!filter_check_discard(call, entry, tr->buffer, event)) 1079 if (!filter_check_discard(call, entry, buffer, event))
1079 ring_buffer_unlock_commit(tr->buffer, event); 1080 ring_buffer_unlock_commit(buffer, event);
1080#endif
1081} 1081}
1082 1082
1083#ifdef UNUSED 1083#ifdef UNUSED
@@ -1087,6 +1087,8 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1087} 1087}
1088#endif /* UNUSED */ 1088#endif /* UNUSED */
1089 1089
1090#endif /* CONFIG_STACKTRACE */
1091
1090static void 1092static void
1091ftrace_trace_special(void *__tr, 1093ftrace_trace_special(void *__tr,
1092 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1094 unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -1094,9 +1096,10 @@ ftrace_trace_special(void *__tr,
1094{ 1096{
1095 struct ring_buffer_event *event; 1097 struct ring_buffer_event *event;
1096 struct trace_array *tr = __tr; 1098 struct trace_array *tr = __tr;
1099 struct ring_buffer *buffer = tr->buffer;
1097 struct special_entry *entry; 1100 struct special_entry *entry;
1098 1101
1099 event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, 1102 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1100 sizeof(*entry), 0, pc); 1103 sizeof(*entry), 0, pc);
1101 if (!event) 1104 if (!event)
1102 return; 1105 return;
@@ -1104,7 +1107,7 @@ ftrace_trace_special(void *__tr,
1104 entry->arg1 = arg1; 1107 entry->arg1 = arg1;
1105 entry->arg2 = arg2; 1108 entry->arg2 = arg2;
1106 entry->arg3 = arg3; 1109 entry->arg3 = arg3;
1107 trace_buffer_unlock_commit(tr, event, 0, pc); 1110 trace_buffer_unlock_commit(buffer, event, 0, pc);
1108} 1111}
1109 1112
1110void 1113void
@@ -1115,62 +1118,6 @@ __trace_special(void *__tr, void *__data,
1115} 1118}
1116 1119
1117void 1120void
1118tracing_sched_switch_trace(struct trace_array *tr,
1119 struct task_struct *prev,
1120 struct task_struct *next,
1121 unsigned long flags, int pc)
1122{
1123 struct ftrace_event_call *call = &event_context_switch;
1124 struct ring_buffer_event *event;
1125 struct ctx_switch_entry *entry;
1126
1127 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
1128 sizeof(*entry), flags, pc);
1129 if (!event)
1130 return;
1131 entry = ring_buffer_event_data(event);
1132 entry->prev_pid = prev->pid;
1133 entry->prev_prio = prev->prio;
1134 entry->prev_state = prev->state;
1135 entry->next_pid = next->pid;
1136 entry->next_prio = next->prio;
1137 entry->next_state = next->state;
1138 entry->next_cpu = task_cpu(next);
1139
1140 if (!filter_check_discard(call, entry, tr->buffer, event))
1141 trace_buffer_unlock_commit(tr, event, flags, pc);
1142}
1143
1144void
1145tracing_sched_wakeup_trace(struct trace_array *tr,
1146 struct task_struct *wakee,
1147 struct task_struct *curr,
1148 unsigned long flags, int pc)
1149{
1150 struct ftrace_event_call *call = &event_wakeup;
1151 struct ring_buffer_event *event;
1152 struct ctx_switch_entry *entry;
1153
1154 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
1155 sizeof(*entry), flags, pc);
1156 if (!event)
1157 return;
1158 entry = ring_buffer_event_data(event);
1159 entry->prev_pid = curr->pid;
1160 entry->prev_prio = curr->prio;
1161 entry->prev_state = curr->state;
1162 entry->next_pid = wakee->pid;
1163 entry->next_prio = wakee->prio;
1164 entry->next_state = wakee->state;
1165 entry->next_cpu = task_cpu(wakee);
1166
1167 if (!filter_check_discard(call, entry, tr->buffer, event))
1168 ring_buffer_unlock_commit(tr->buffer, event);
1169 ftrace_trace_stack(tr, flags, 6, pc);
1170 ftrace_trace_userstack(tr, flags, pc);
1171}
1172
1173void
1174ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) 1121ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1175{ 1122{
1176 struct trace_array *tr = &global_trace; 1123 struct trace_array *tr = &global_trace;
@@ -1194,68 +1141,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1194 local_irq_restore(flags); 1141 local_irq_restore(flags);
1195} 1142}
1196 1143
1197#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1198int trace_graph_entry(struct ftrace_graph_ent *trace)
1199{
1200 struct trace_array *tr = &global_trace;
1201 struct trace_array_cpu *data;
1202 unsigned long flags;
1203 long disabled;
1204 int ret;
1205 int cpu;
1206 int pc;
1207
1208 if (!ftrace_trace_task(current))
1209 return 0;
1210
1211 if (!ftrace_graph_addr(trace->func))
1212 return 0;
1213
1214 local_irq_save(flags);
1215 cpu = raw_smp_processor_id();
1216 data = tr->data[cpu];
1217 disabled = atomic_inc_return(&data->disabled);
1218 if (likely(disabled == 1)) {
1219 pc = preempt_count();
1220 ret = __trace_graph_entry(tr, trace, flags, pc);
1221 } else {
1222 ret = 0;
1223 }
1224 /* Only do the atomic if it is not already set */
1225 if (!test_tsk_trace_graph(current))
1226 set_tsk_trace_graph(current);
1227
1228 atomic_dec(&data->disabled);
1229 local_irq_restore(flags);
1230
1231 return ret;
1232}
1233
1234void trace_graph_return(struct ftrace_graph_ret *trace)
1235{
1236 struct trace_array *tr = &global_trace;
1237 struct trace_array_cpu *data;
1238 unsigned long flags;
1239 long disabled;
1240 int cpu;
1241 int pc;
1242
1243 local_irq_save(flags);
1244 cpu = raw_smp_processor_id();
1245 data = tr->data[cpu];
1246 disabled = atomic_inc_return(&data->disabled);
1247 if (likely(disabled == 1)) {
1248 pc = preempt_count();
1249 __trace_graph_return(tr, trace, flags, pc);
1250 }
1251 if (!trace->depth)
1252 clear_tsk_trace_graph(current);
1253 atomic_dec(&data->disabled);
1254 local_irq_restore(flags);
1255}
1256#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1257
1258
1259/** 1144/**
1260 * trace_vbprintk - write binary msg to tracing buffer 1145 * trace_vbprintk - write binary msg to tracing buffer
1261 * 1146 *
@@ -1268,6 +1153,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1268 1153
1269 struct ftrace_event_call *call = &event_bprint; 1154 struct ftrace_event_call *call = &event_bprint;
1270 struct ring_buffer_event *event; 1155 struct ring_buffer_event *event;
1156 struct ring_buffer *buffer;
1271 struct trace_array *tr = &global_trace; 1157 struct trace_array *tr = &global_trace;
1272 struct trace_array_cpu *data; 1158 struct trace_array_cpu *data;
1273 struct bprint_entry *entry; 1159 struct bprint_entry *entry;
@@ -1300,7 +1186,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1300 goto out_unlock; 1186 goto out_unlock;
1301 1187
1302 size = sizeof(*entry) + sizeof(u32) * len; 1188 size = sizeof(*entry) + sizeof(u32) * len;
1303 event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); 1189 buffer = tr->buffer;
1190 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1191 flags, pc);
1304 if (!event) 1192 if (!event)
1305 goto out_unlock; 1193 goto out_unlock;
1306 entry = ring_buffer_event_data(event); 1194 entry = ring_buffer_event_data(event);
@@ -1308,8 +1196,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1308 entry->fmt = fmt; 1196 entry->fmt = fmt;
1309 1197
1310 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1198 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1311 if (!filter_check_discard(call, entry, tr->buffer, event)) 1199 if (!filter_check_discard(call, entry, buffer, event))
1312 ring_buffer_unlock_commit(tr->buffer, event); 1200 ring_buffer_unlock_commit(buffer, event);
1313 1201
1314out_unlock: 1202out_unlock:
1315 __raw_spin_unlock(&trace_buf_lock); 1203 __raw_spin_unlock(&trace_buf_lock);
@@ -1324,14 +1212,30 @@ out:
1324} 1212}
1325EXPORT_SYMBOL_GPL(trace_vbprintk); 1213EXPORT_SYMBOL_GPL(trace_vbprintk);
1326 1214
1327int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1215int trace_array_printk(struct trace_array *tr,
1216 unsigned long ip, const char *fmt, ...)
1217{
1218 int ret;
1219 va_list ap;
1220
1221 if (!(trace_flags & TRACE_ITER_PRINTK))
1222 return 0;
1223
1224 va_start(ap, fmt);
1225 ret = trace_array_vprintk(tr, ip, fmt, ap);
1226 va_end(ap);
1227 return ret;
1228}
1229
1230int trace_array_vprintk(struct trace_array *tr,
1231 unsigned long ip, const char *fmt, va_list args)
1328{ 1232{
1329 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1233 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1330 static char trace_buf[TRACE_BUF_SIZE]; 1234 static char trace_buf[TRACE_BUF_SIZE];
1331 1235
1332 struct ftrace_event_call *call = &event_print; 1236 struct ftrace_event_call *call = &event_print;
1333 struct ring_buffer_event *event; 1237 struct ring_buffer_event *event;
1334 struct trace_array *tr = &global_trace; 1238 struct ring_buffer *buffer;
1335 struct trace_array_cpu *data; 1239 struct trace_array_cpu *data;
1336 int cpu, len = 0, size, pc; 1240 int cpu, len = 0, size, pc;
1337 struct print_entry *entry; 1241 struct print_entry *entry;
@@ -1359,7 +1263,9 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1359 trace_buf[len] = 0; 1263 trace_buf[len] = 0;
1360 1264
1361 size = sizeof(*entry) + len + 1; 1265 size = sizeof(*entry) + len + 1;
1362 event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); 1266 buffer = tr->buffer;
1267 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1268 irq_flags, pc);
1363 if (!event) 1269 if (!event)
1364 goto out_unlock; 1270 goto out_unlock;
1365 entry = ring_buffer_event_data(event); 1271 entry = ring_buffer_event_data(event);
@@ -1367,8 +1273,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1367 1273
1368 memcpy(&entry->buf, trace_buf, len); 1274 memcpy(&entry->buf, trace_buf, len);
1369 entry->buf[len] = 0; 1275 entry->buf[len] = 0;
1370 if (!filter_check_discard(call, entry, tr->buffer, event)) 1276 if (!filter_check_discard(call, entry, buffer, event))
1371 ring_buffer_unlock_commit(tr->buffer, event); 1277 ring_buffer_unlock_commit(buffer, event);
1372 1278
1373 out_unlock: 1279 out_unlock:
1374 __raw_spin_unlock(&trace_buf_lock); 1280 __raw_spin_unlock(&trace_buf_lock);
@@ -1380,6 +1286,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1380 1286
1381 return len; 1287 return len;
1382} 1288}
1289
1290int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1291{
1292 return trace_array_printk(&global_trace, ip, fmt, args);
1293}
1383EXPORT_SYMBOL_GPL(trace_vprintk); 1294EXPORT_SYMBOL_GPL(trace_vprintk);
1384 1295
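The new trace_array_printk()/trace_array_vprintk() pair lets a tracer print into the buffer of an arbitrary trace_array instead of only into global_trace. A rough usage sketch, assuming a tracer that has stashed its trace_array pointer at init time (the tracer and variable names here are invented):

        /* Sketch: my_tracer_array is a hypothetical pointer saved by the
         * tracer's init callback; trace_array_printk() is the helper added
         * above and honors the TRACE_ITER_PRINTK flag internally. */
        static struct trace_array *my_tracer_array;

        static void my_tracer_note(int cpu, unsigned long latency)
        {
                if (!my_tracer_array)
                        return;

                trace_array_printk(my_tracer_array, _THIS_IP_,
                                   "cpu %d: latency %lu us\n", cpu, latency);
        }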
1385enum trace_file_type { 1296enum trace_file_type {
@@ -1519,6 +1430,37 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1519 return ent; 1430 return ent;
1520} 1431}
1521 1432
1433static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1434{
1435 struct trace_array *tr = iter->tr;
1436 struct ring_buffer_event *event;
1437 struct ring_buffer_iter *buf_iter;
1438 unsigned long entries = 0;
1439 u64 ts;
1440
1441 tr->data[cpu]->skipped_entries = 0;
1442
1443 if (!iter->buffer_iter[cpu])
1444 return;
1445
1446 buf_iter = iter->buffer_iter[cpu];
1447 ring_buffer_iter_reset(buf_iter);
1448
1449 /*
1450 * We could have the case with the max latency tracers
1451 * that a reset never took place on a cpu. This is evident
1452 * by the timestamp being before the start of the buffer.
1453 */
1454 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1455 if (ts >= iter->tr->time_start)
1456 break;
1457 entries++;
1458 ring_buffer_read(buf_iter, NULL);
1459 }
1460
1461 tr->data[cpu]->skipped_entries = entries;
1462}
1463
1522/* 1464/*
1523 * No necessary locking here. The worst thing which can 1465 * No necessary locking here. The worst thing which can
1524 * happen is losing events consumed at the same time 1466

@@ -1557,10 +1499,9 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1557 1499
1558 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1500 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1559 for_each_tracing_cpu(cpu) 1501 for_each_tracing_cpu(cpu)
1560 ring_buffer_iter_reset(iter->buffer_iter[cpu]); 1502 tracing_iter_reset(iter, cpu);
1561 } else 1503 } else
1562 ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); 1504 tracing_iter_reset(iter, cpu_file);
1563
1564 1505
1565 ftrace_enable_cpu(); 1506 ftrace_enable_cpu();
1566 1507
@@ -1609,16 +1550,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1609 struct trace_array *tr = iter->tr; 1550 struct trace_array *tr = iter->tr;
1610 struct trace_array_cpu *data = tr->data[tr->cpu]; 1551 struct trace_array_cpu *data = tr->data[tr->cpu];
1611 struct tracer *type = current_trace; 1552 struct tracer *type = current_trace;
1612 unsigned long total; 1553 unsigned long entries = 0;
1613 unsigned long entries; 1554 unsigned long total = 0;
1555 unsigned long count;
1614 const char *name = "preemption"; 1556 const char *name = "preemption";
1557 int cpu;
1615 1558
1616 if (type) 1559 if (type)
1617 name = type->name; 1560 name = type->name;
1618 1561
1619 entries = ring_buffer_entries(iter->tr->buffer); 1562
1620 total = entries + 1563 for_each_tracing_cpu(cpu) {
1621 ring_buffer_overruns(iter->tr->buffer); 1564 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1565 /*
1566 * If this buffer has skipped entries, then we hold all
1567 * entries for the trace and we need to ignore the
1568 * ones before the time stamp.
1569 */
1570 if (tr->data[cpu]->skipped_entries) {
1571 count -= tr->data[cpu]->skipped_entries;
1572 /* total is the same as the entries */
1573 total += count;
1574 } else
1575 total += count +
1576 ring_buffer_overrun_cpu(tr->buffer, cpu);
1577 entries += count;
1578 }
1622 1579
1623 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1580 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1624 name, UTS_RELEASE); 1581 name, UTS_RELEASE);
@@ -1660,7 +1617,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1660 seq_puts(m, "\n# => ended at: "); 1617 seq_puts(m, "\n# => ended at: ");
1661 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); 1618 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1662 trace_print_seq(m, &iter->seq); 1619 trace_print_seq(m, &iter->seq);
1663 seq_puts(m, "#\n"); 1620 seq_puts(m, "\n#\n");
1664 } 1621 }
1665 1622
1666 seq_puts(m, "#\n"); 1623 seq_puts(m, "#\n");
@@ -1679,6 +1636,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1679 if (cpumask_test_cpu(iter->cpu, iter->started)) 1636 if (cpumask_test_cpu(iter->cpu, iter->started))
1680 return; 1637 return;
1681 1638
1639 if (iter->tr->data[iter->cpu]->skipped_entries)
1640 return;
1641
1682 cpumask_set_cpu(iter->cpu, iter->started); 1642 cpumask_set_cpu(iter->cpu, iter->started);
1683 1643
1684 /* Don't print started cpu buffer for the first entry of the trace */ 1644 /* Don't print started cpu buffer for the first entry of the trace */
@@ -1941,19 +1901,23 @@ __tracing_open(struct inode *inode, struct file *file)
1941 if (ring_buffer_overruns(iter->tr->buffer)) 1901 if (ring_buffer_overruns(iter->tr->buffer))
1942 iter->iter_flags |= TRACE_FILE_ANNOTATE; 1902 iter->iter_flags |= TRACE_FILE_ANNOTATE;
1943 1903
1904 /* stop the trace while dumping */
1905 tracing_stop();
1906
1944 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 1907 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
1945 for_each_tracing_cpu(cpu) { 1908 for_each_tracing_cpu(cpu) {
1946 1909
1947 iter->buffer_iter[cpu] = 1910 iter->buffer_iter[cpu] =
1948 ring_buffer_read_start(iter->tr->buffer, cpu); 1911 ring_buffer_read_start(iter->tr->buffer, cpu);
1912 tracing_iter_reset(iter, cpu);
1949 } 1913 }
1950 } else { 1914 } else {
1951 cpu = iter->cpu_file; 1915 cpu = iter->cpu_file;
1952 iter->buffer_iter[cpu] = 1916 iter->buffer_iter[cpu] =
1953 ring_buffer_read_start(iter->tr->buffer, cpu); 1917 ring_buffer_read_start(iter->tr->buffer, cpu);
1918 tracing_iter_reset(iter, cpu);
1954 } 1919 }
1955 1920
1956 /* TODO stop tracer */
1957 ret = seq_open(file, &tracer_seq_ops); 1921 ret = seq_open(file, &tracer_seq_ops);
1958 if (ret < 0) { 1922 if (ret < 0) {
1959 fail_ret = ERR_PTR(ret); 1923 fail_ret = ERR_PTR(ret);
@@ -1963,9 +1927,6 @@ __tracing_open(struct inode *inode, struct file *file)
1963 m = file->private_data; 1927 m = file->private_data;
1964 m->private = iter; 1928 m->private = iter;
1965 1929
1966 /* stop the trace while dumping */
1967 tracing_stop();
1968
1969 mutex_unlock(&trace_types_lock); 1930 mutex_unlock(&trace_types_lock);
1970 1931
1971 return iter; 1932 return iter;
@@ -1976,6 +1937,7 @@ __tracing_open(struct inode *inode, struct file *file)
1976 ring_buffer_read_finish(iter->buffer_iter[cpu]); 1937 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1977 } 1938 }
1978 free_cpumask_var(iter->started); 1939 free_cpumask_var(iter->started);
1940 tracing_start();
1979 fail: 1941 fail:
1980 mutex_unlock(&trace_types_lock); 1942 mutex_unlock(&trace_types_lock);
1981 kfree(iter->trace); 1943 kfree(iter->trace);
@@ -2257,8 +2219,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2257 len += 3; /* "no" and newline */ 2219 len += 3; /* "no" and newline */
2258 } 2220 }
2259 2221
2260 /* +2 for \n and \0 */ 2222 /* +1 for \0 */
2261 buf = kmalloc(len + 2, GFP_KERNEL); 2223 buf = kmalloc(len + 1, GFP_KERNEL);
2262 if (!buf) { 2224 if (!buf) {
2263 mutex_unlock(&trace_types_lock); 2225 mutex_unlock(&trace_types_lock);
2264 return -ENOMEM; 2226 return -ENOMEM;
@@ -2281,7 +2243,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2281 } 2243 }
2282 mutex_unlock(&trace_types_lock); 2244 mutex_unlock(&trace_types_lock);
2283 2245
2284 WARN_ON(r >= len + 2); 2246 WARN_ON(r >= len + 1);
2285 2247
2286 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2248 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2287 2249
@@ -2292,23 +2254,23 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2292/* Try to assign a tracer specific option */ 2254/* Try to assign a tracer specific option */
2293static int set_tracer_option(struct tracer *trace, char *cmp, int neg) 2255static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2294{ 2256{
2295 struct tracer_flags *trace_flags = trace->flags; 2257 struct tracer_flags *tracer_flags = trace->flags;
2296 struct tracer_opt *opts = NULL; 2258 struct tracer_opt *opts = NULL;
2297 int ret = 0, i = 0; 2259 int ret = 0, i = 0;
2298 int len; 2260 int len;
2299 2261
2300 for (i = 0; trace_flags->opts[i].name; i++) { 2262 for (i = 0; tracer_flags->opts[i].name; i++) {
2301 opts = &trace_flags->opts[i]; 2263 opts = &tracer_flags->opts[i];
2302 len = strlen(opts->name); 2264 len = strlen(opts->name);
2303 2265
2304 if (strncmp(cmp, opts->name, len) == 0) { 2266 if (strncmp(cmp, opts->name, len) == 0) {
2305 ret = trace->set_flag(trace_flags->val, 2267 ret = trace->set_flag(tracer_flags->val,
2306 opts->bit, !neg); 2268 opts->bit, !neg);
2307 break; 2269 break;
2308 } 2270 }
2309 } 2271 }
2310 /* Not found */ 2272 /* Not found */
2311 if (!trace_flags->opts[i].name) 2273 if (!tracer_flags->opts[i].name)
2312 return -EINVAL; 2274 return -EINVAL;
2313 2275
2314 /* Refused to handle */ 2276 /* Refused to handle */
@@ -2316,9 +2278,9 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2316 return ret; 2278 return ret;
2317 2279
2318 if (neg) 2280 if (neg)
2319 trace_flags->val &= ~opts->bit; 2281 tracer_flags->val &= ~opts->bit;
2320 else 2282 else
2321 trace_flags->val |= opts->bit; 2283 tracer_flags->val |= opts->bit;
2322 2284
2323 return 0; 2285 return 0;
2324} 2286}
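set_tracer_option() above walks the tracer's private option table; the rename from trace_flags to tracer_flags simply stops the local variable from shadowing the global trace_flags bitmask. For context, the table it walks has roughly the following shape, wired up through the tracer's ->flags and ->set_flag members. This is a hedged sketch: the option name and bit value are invented, only the struct/field usage follows the code above.

        /* Sketch of a per-tracer option table consumed by set_tracer_option().
         * Field accesses (.name, .bit, .val, .opts, ->set_flag) match the hunk
         * above; the concrete option is illustrative only. */
        #define MY_OPT_VERBOSE 0x1

        static struct tracer_opt my_opts[] = {
                { .name = "verbose", .bit = MY_OPT_VERBOSE },
                { }     /* terminator: the loop stops at a NULL name */
        };

        static struct tracer_flags my_flags = {
                .val  = 0,
                .opts = my_opts,
        };

        static int my_set_flag(u32 old_flags, u32 bit, int set)
        {
                /* called by set_tracer_option() before it updates ->val;
                 * returning non-zero refuses the option */
                return 0;
        }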
@@ -2333,22 +2295,6 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2333 trace_flags |= mask; 2295 trace_flags |= mask;
2334 else 2296 else
2335 trace_flags &= ~mask; 2297 trace_flags &= ~mask;
2336
2337 if (mask == TRACE_ITER_GLOBAL_CLK) {
2338 u64 (*func)(void);
2339
2340 if (enabled)
2341 func = trace_clock_global;
2342 else
2343 func = trace_clock_local;
2344
2345 mutex_lock(&trace_types_lock);
2346 ring_buffer_set_clock(global_trace.buffer, func);
2347
2348 if (max_tr.buffer)
2349 ring_buffer_set_clock(max_tr.buffer, func);
2350 mutex_unlock(&trace_types_lock);
2351 }
2352} 2298}
2353 2299
2354static ssize_t 2300static ssize_t
@@ -3316,6 +3262,62 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3316 return cnt; 3262 return cnt;
3317} 3263}
3318 3264
3265static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf,
3266 size_t cnt, loff_t *ppos)
3267{
3268 char buf[64];
3269 int bufiter = 0;
3270 int i;
3271
3272 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3273 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter,
3274 "%s%s%s%s", i ? " " : "",
3275 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3276 i == trace_clock_id ? "]" : "");
3277 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n");
3278
3279 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter);
3280}
3281
3282static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3283 size_t cnt, loff_t *fpos)
3284{
3285 char buf[64];
3286 const char *clockstr;
3287 int i;
3288
3289 if (cnt >= sizeof(buf))
3290 return -EINVAL;
3291
3292 if (copy_from_user(&buf, ubuf, cnt))
3293 return -EFAULT;
3294
3295 buf[cnt] = 0;
3296
3297 clockstr = strstrip(buf);
3298
3299 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
3300 if (strcmp(trace_clocks[i].name, clockstr) == 0)
3301 break;
3302 }
3303 if (i == ARRAY_SIZE(trace_clocks))
3304 return -EINVAL;
3305
3306 trace_clock_id = i;
3307
3308 mutex_lock(&trace_types_lock);
3309
3310 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
3311 if (max_tr.buffer)
3312 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
3313
3314 mutex_unlock(&trace_types_lock);
3315
3316 *fpos += cnt;
3317
3318 return cnt;
3319}
3320
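The trace_clock file added here replaces the old global-clock flag: reading it lists the available clocks with the active one in brackets, and writing a name switches both the live buffer and max_tr through ring_buffer_set_clock(), exactly as the write handler above shows. The backing table is not part of this hunk; from the accessors above it is presumably something along these lines (the entries and names are assumptions, only the .func/.name usage is implied by the code):

        /* Hedged sketch of the trace_clocks[] table iterated by
         * tracing_clock_read()/tracing_clock_write(). */
        static struct {
                u64 (*func)(void);
                const char *name;
        } trace_clocks[] = {
                { trace_clock_local,    "local"  },     /* per-cpu, cheapest */
                { trace_clock_global,   "global" },     /* ordered across cpus */
        };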
3319static const struct file_operations tracing_max_lat_fops = { 3321static const struct file_operations tracing_max_lat_fops = {
3320 .open = tracing_open_generic, 3322 .open = tracing_open_generic,
3321 .read = tracing_max_lat_read, 3323 .read = tracing_max_lat_read,
@@ -3353,6 +3355,12 @@ static const struct file_operations tracing_mark_fops = {
3353 .write = tracing_mark_write, 3355 .write = tracing_mark_write,
3354}; 3356};
3355 3357
3358static const struct file_operations trace_clock_fops = {
3359 .open = tracing_open_generic,
3360 .read = tracing_clock_read,
3361 .write = tracing_clock_write,
3362};
3363
3356struct ftrace_buffer_info { 3364struct ftrace_buffer_info {
3357 struct trace_array *tr; 3365 struct trace_array *tr;
3358 void *spare; 3366 void *spare;
@@ -3633,9 +3641,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3633 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 3641 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3634 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 3642 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3635 3643
3636 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3637 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3638
3639 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 3644 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3640 3645
3641 kfree(s); 3646 kfree(s);
@@ -4066,11 +4071,13 @@ static __init int tracer_init_debugfs(void)
4066 trace_create_file("current_tracer", 0644, d_tracer, 4071 trace_create_file("current_tracer", 0644, d_tracer,
4067 &global_trace, &set_tracer_fops); 4072 &global_trace, &set_tracer_fops);
4068 4073
4074#ifdef CONFIG_TRACER_MAX_TRACE
4069 trace_create_file("tracing_max_latency", 0644, d_tracer, 4075 trace_create_file("tracing_max_latency", 0644, d_tracer,
4070 &tracing_max_latency, &tracing_max_lat_fops); 4076 &tracing_max_latency, &tracing_max_lat_fops);
4071 4077
4072 trace_create_file("tracing_thresh", 0644, d_tracer, 4078 trace_create_file("tracing_thresh", 0644, d_tracer,
4073 &tracing_thresh, &tracing_max_lat_fops); 4079 &tracing_thresh, &tracing_max_lat_fops);
4080#endif
4074 4081
4075 trace_create_file("README", 0444, d_tracer, 4082 trace_create_file("README", 0444, d_tracer,
4076 NULL, &tracing_readme_fops); 4083 NULL, &tracing_readme_fops);
@@ -4087,6 +4094,9 @@ static __init int tracer_init_debugfs(void)
4087 trace_create_file("saved_cmdlines", 0444, d_tracer, 4094 trace_create_file("saved_cmdlines", 0444, d_tracer,
4088 NULL, &tracing_saved_cmdlines_fops); 4095 NULL, &tracing_saved_cmdlines_fops);
4089 4096
4097 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4098 &trace_clock_fops);
4099
4090#ifdef CONFIG_DYNAMIC_FTRACE 4100#ifdef CONFIG_DYNAMIC_FTRACE
4091 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4101 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4092 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4102 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -4265,7 +4275,6 @@ void ftrace_dump(void)
4265 4275
4266__init static int tracer_alloc_buffers(void) 4276__init static int tracer_alloc_buffers(void)
4267{ 4277{
4268 struct trace_array_cpu *data;
4269 int ring_buf_size; 4278 int ring_buf_size;
4270 int i; 4279 int i;
4271 int ret = -ENOMEM; 4280 int ret = -ENOMEM;
@@ -4315,7 +4324,7 @@ __init static int tracer_alloc_buffers(void)
4315 4324
4316 /* Allocate the first page for all buffers */ 4325 /* Allocate the first page for all buffers */
4317 for_each_tracing_cpu(i) { 4326 for_each_tracing_cpu(i) {
4318 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4327 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4319 max_tr.data[i] = &per_cpu(max_data, i); 4328 max_tr.data[i] = &per_cpu(max_data, i);
4320 } 4329 }
4321 4330
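The stack-trace helpers follow the same conversion: ftrace_trace_stack() and ftrace_trace_userstack() now take the ring buffer the surrounding event was written to, and they gain no-op stubs when CONFIG_STACKTRACE is off, as the trace.h hunk below shows. A sketch of how a caller pairs them with an event commit, following the pattern of the removed sched_wakeup code earlier in this file (the wrapper name is illustrative; the skip depth of 6 copies the old call site):

        /* Sketch: buffer/event/flags/pc come from the usual reserve path
         * shown earlier in this diff. */
        static void example_commit_with_stacks(struct ring_buffer *buffer,
                                               struct ring_buffer_event *event,
                                               unsigned long flags, int pc)
        {
                ring_buffer_unlock_commit(buffer, event);

                /* both check their trace_flags bits internally */
                ftrace_trace_stack(buffer, flags, 6, pc);
                ftrace_trace_userstack(buffer, flags, pc);
        }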
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8b9f4f6e9559..fa1dccb579d5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,8 +34,6 @@ enum trace_type {
34 TRACE_GRAPH_ENT, 34 TRACE_GRAPH_ENT,
35 TRACE_USER_STACK, 35 TRACE_USER_STACK,
36 TRACE_HW_BRANCHES, 36 TRACE_HW_BRANCHES,
37 TRACE_SYSCALL_ENTER,
38 TRACE_SYSCALL_EXIT,
39 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
40 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
41 TRACE_POWER, 39 TRACE_POWER,
@@ -236,9 +234,6 @@ struct trace_array_cpu {
236 atomic_t disabled; 234 atomic_t disabled;
237 void *buffer_page; /* ring buffer spare */ 235 void *buffer_page; /* ring buffer spare */
238 236
239 /* these fields get copied into max-trace: */
240 unsigned long trace_idx;
241 unsigned long overrun;
242 unsigned long saved_latency; 237 unsigned long saved_latency;
243 unsigned long critical_start; 238 unsigned long critical_start;
244 unsigned long critical_end; 239 unsigned long critical_end;
@@ -246,6 +241,7 @@ struct trace_array_cpu {
246 unsigned long nice; 241 unsigned long nice;
247 unsigned long policy; 242 unsigned long policy;
248 unsigned long rt_priority; 243 unsigned long rt_priority;
244 unsigned long skipped_entries;
249 cycle_t preempt_timestamp; 245 cycle_t preempt_timestamp;
250 pid_t pid; 246 pid_t pid;
251 uid_t uid; 247 uid_t uid;
@@ -319,10 +315,6 @@ extern void __ftrace_bad_type(void);
319 TRACE_KMEM_ALLOC); \ 315 TRACE_KMEM_ALLOC); \
320 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 316 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
321 TRACE_KMEM_FREE); \ 317 TRACE_KMEM_FREE); \
322 IF_ASSIGN(var, ent, struct syscall_trace_enter, \
323 TRACE_SYSCALL_ENTER); \
324 IF_ASSIGN(var, ent, struct syscall_trace_exit, \
325 TRACE_SYSCALL_EXIT); \
326 __ftrace_bad_type(); \ 318 __ftrace_bad_type(); \
327 } while (0) 319 } while (0)
328 320
@@ -423,12 +415,13 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
423 415
424struct ring_buffer_event; 416struct ring_buffer_event;
425 417
426struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 418struct ring_buffer_event *
427 int type, 419trace_buffer_lock_reserve(struct ring_buffer *buffer,
428 unsigned long len, 420 int type,
429 unsigned long flags, 421 unsigned long len,
430 int pc); 422 unsigned long flags,
431void trace_buffer_unlock_commit(struct trace_array *tr, 423 int pc);
424void trace_buffer_unlock_commit(struct ring_buffer *buffer,
432 struct ring_buffer_event *event, 425 struct ring_buffer_event *event,
433 unsigned long flags, int pc); 426 unsigned long flags, int pc);
434 427
@@ -467,6 +460,7 @@ void trace_function(struct trace_array *tr,
467 460
468void trace_graph_return(struct ftrace_graph_ret *trace); 461void trace_graph_return(struct ftrace_graph_ret *trace);
469int trace_graph_entry(struct ftrace_graph_ent *trace); 462int trace_graph_entry(struct ftrace_graph_ent *trace);
463void set_graph_array(struct trace_array *tr);
470 464
471void tracing_start_cmdline_record(void); 465void tracing_start_cmdline_record(void);
472void tracing_stop_cmdline_record(void); 466void tracing_stop_cmdline_record(void);
@@ -478,16 +472,40 @@ void unregister_tracer(struct tracer *type);
478 472
479extern unsigned long nsecs_to_usecs(unsigned long nsecs); 473extern unsigned long nsecs_to_usecs(unsigned long nsecs);
480 474
475#ifdef CONFIG_TRACER_MAX_TRACE
481extern unsigned long tracing_max_latency; 476extern unsigned long tracing_max_latency;
482extern unsigned long tracing_thresh; 477extern unsigned long tracing_thresh;
483 478
484void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 479void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
485void update_max_tr_single(struct trace_array *tr, 480void update_max_tr_single(struct trace_array *tr,
486 struct task_struct *tsk, int cpu); 481 struct task_struct *tsk, int cpu);
482#endif /* CONFIG_TRACER_MAX_TRACE */
487 483
488void __trace_stack(struct trace_array *tr, 484#ifdef CONFIG_STACKTRACE
489 unsigned long flags, 485void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
490 int skip, int pc); 486 int skip, int pc);
487
488void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
489 int pc);
490
491void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
492 int pc);
493#else
494static inline void ftrace_trace_stack(struct trace_array *tr,
495 unsigned long flags, int skip, int pc)
496{
497}
498
499static inline void ftrace_trace_userstack(struct trace_array *tr,
500 unsigned long flags, int pc)
501{
502}
503
504static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
505 int skip, int pc)
506{
507}
508#endif /* CONFIG_STACKTRACE */
491 509
492extern cycle_t ftrace_now(int cpu); 510extern cycle_t ftrace_now(int cpu);
493 511
@@ -513,6 +531,10 @@ extern unsigned long ftrace_update_tot_cnt;
513extern int DYN_FTRACE_TEST_NAME(void); 531extern int DYN_FTRACE_TEST_NAME(void);
514#endif 532#endif
515 533
534extern int ring_buffer_expanded;
535extern bool tracing_selftest_disabled;
536DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
537
516#ifdef CONFIG_FTRACE_STARTUP_TEST 538#ifdef CONFIG_FTRACE_STARTUP_TEST
517extern int trace_selftest_startup_function(struct tracer *trace, 539extern int trace_selftest_startup_function(struct tracer *trace,
518 struct trace_array *tr); 540 struct trace_array *tr);
@@ -544,9 +566,16 @@ extern int
544trace_vbprintk(unsigned long ip, const char *fmt, va_list args); 566trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
545extern int 567extern int
546trace_vprintk(unsigned long ip, const char *fmt, va_list args); 568trace_vprintk(unsigned long ip, const char *fmt, va_list args);
569extern int
570trace_array_vprintk(struct trace_array *tr,
571 unsigned long ip, const char *fmt, va_list args);
572int trace_array_printk(struct trace_array *tr,
573 unsigned long ip, const char *fmt, ...);
547 574
548extern unsigned long trace_flags; 575extern unsigned long trace_flags;
549 576
577extern int trace_clock_id;
578
550/* Standard output formatting function used for function return traces */ 579/* Standard output formatting function used for function return traces */
551#ifdef CONFIG_FUNCTION_GRAPH_TRACER 580#ifdef CONFIG_FUNCTION_GRAPH_TRACER
552extern enum print_line_t print_graph_function(struct trace_iterator *iter); 581extern enum print_line_t print_graph_function(struct trace_iterator *iter);
@@ -635,9 +664,8 @@ enum trace_iterator_flags {
635 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 664 TRACE_ITER_PRINTK_MSGONLY = 0x10000,
636 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 665 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
637 TRACE_ITER_LATENCY_FMT = 0x40000, 666 TRACE_ITER_LATENCY_FMT = 0x40000,
638 TRACE_ITER_GLOBAL_CLK = 0x80000, 667 TRACE_ITER_SLEEP_TIME = 0x80000,
639 TRACE_ITER_SLEEP_TIME = 0x100000, 668 TRACE_ITER_GRAPH_TIME = 0x100000,
640 TRACE_ITER_GRAPH_TIME = 0x200000,
641}; 669};
642 670
643/* 671/*
@@ -734,6 +762,7 @@ struct ftrace_event_field {
734 struct list_head link; 762 struct list_head link;
735 char *name; 763 char *name;
736 char *type; 764 char *type;
765 int filter_type;
737 int offset; 766 int offset;
738 int size; 767 int size;
739 int is_signed; 768 int is_signed;
@@ -743,13 +772,15 @@ struct event_filter {
743 int n_preds; 772 int n_preds;
744 struct filter_pred **preds; 773 struct filter_pred **preds;
745 char *filter_string; 774 char *filter_string;
775 bool no_reset;
746}; 776};
747 777
748struct event_subsystem { 778struct event_subsystem {
749 struct list_head list; 779 struct list_head list;
750 const char *name; 780 const char *name;
751 struct dentry *entry; 781 struct dentry *entry;
752 void *filter; 782 struct event_filter *filter;
783 int nr_events;
753}; 784};
754 785
755struct filter_pred; 786struct filter_pred;
@@ -777,6 +808,7 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system,
777 char *filter_string); 808 char *filter_string);
778extern void print_subsystem_event_filter(struct event_subsystem *system, 809extern void print_subsystem_event_filter(struct event_subsystem *system,
779 struct trace_seq *s); 810 struct trace_seq *s);
811extern int filter_assign_type(const char *type);
780 812
781static inline int 813static inline int
782filter_check_discard(struct ftrace_event_call *call, void *rec, 814filter_check_discard(struct ftrace_event_call *call, void *rec,
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index a29ef23ffb47..19bfc75d467e 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -41,14 +41,12 @@ void disable_boot_trace(void)
41 41
42static int boot_trace_init(struct trace_array *tr) 42static int boot_trace_init(struct trace_array *tr)
43{ 43{
44 int cpu;
45 boot_trace = tr; 44 boot_trace = tr;
46 45
47 if (!tr) 46 if (!tr)
48 return 0; 47 return 0;
49 48
50 for_each_cpu(cpu, cpu_possible_mask) 49 tracing_reset_online_cpus(tr);
51 tracing_reset(tr, cpu);
52 50
53 tracing_sched_switch_assign_trace(tr); 51 tracing_sched_switch_assign_trace(tr);
54 return 0; 52 return 0;
@@ -132,6 +130,7 @@ struct tracer boot_tracer __read_mostly =
132void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
133{ 131{
134 struct ring_buffer_event *event; 132 struct ring_buffer_event *event;
133 struct ring_buffer *buffer;
135 struct trace_boot_call *entry; 134 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace; 135 struct trace_array *tr = boot_trace;
137 136
@@ -144,13 +143,14 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
144 sprint_symbol(bt->func, (unsigned long)fn); 143 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable(); 144 preempt_disable();
146 145
147 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, 146 buffer = tr->buffer;
147 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
148 sizeof(*entry), 0, 0); 148 sizeof(*entry), 0, 0);
149 if (!event) 149 if (!event)
150 goto out; 150 goto out;
151 entry = ring_buffer_event_data(event); 151 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 152 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(tr, event, 0, 0); 153 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 154 out:
155 preempt_enable(); 155 preempt_enable();
156} 156}
@@ -158,6 +158,7 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 159{
160 struct ring_buffer_event *event; 160 struct ring_buffer_event *event;
161 struct ring_buffer *buffer;
161 struct trace_boot_ret *entry; 162 struct trace_boot_ret *entry;
162 struct trace_array *tr = boot_trace; 163 struct trace_array *tr = boot_trace;
163 164
@@ -167,13 +168,14 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
167 sprint_symbol(bt->func, (unsigned long)fn); 168 sprint_symbol(bt->func, (unsigned long)fn);
168 preempt_disable(); 169 preempt_disable();
169 170
170 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, 171 buffer = tr->buffer;
172 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
171 sizeof(*entry), 0, 0); 173 sizeof(*entry), 0, 0);
172 if (!event) 174 if (!event)
173 goto out; 175 goto out;
174 entry = ring_buffer_event_data(event); 176 entry = ring_buffer_event_data(event);
175 entry->boot_ret = *bt; 177 entry->boot_ret = *bt;
176 trace_buffer_unlock_commit(tr, event, 0, 0); 178 trace_buffer_unlock_commit(buffer, event, 0, 0);
177 out: 179 out:
178 preempt_enable(); 180 preempt_enable();
179} 181}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e75276a49cf5..78b1ed230177 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -17,6 +17,8 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19 19
20#include <asm/setup.h>
21
20#include "trace_output.h" 22#include "trace_output.h"
21 23
22#define TRACE_SYSTEM "TRACE_SYSTEM" 24#define TRACE_SYSTEM "TRACE_SYSTEM"
@@ -25,8 +27,9 @@ DEFINE_MUTEX(event_mutex);
25 27
26LIST_HEAD(ftrace_events); 28LIST_HEAD(ftrace_events);
27 29
28int trace_define_field(struct ftrace_event_call *call, char *type, 30int trace_define_field(struct ftrace_event_call *call, const char *type,
29 char *name, int offset, int size, int is_signed) 31 const char *name, int offset, int size, int is_signed,
32 int filter_type)
30{ 33{
31 struct ftrace_event_field *field; 34 struct ftrace_event_field *field;
32 35
@@ -42,9 +45,15 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
42 if (!field->type) 45 if (!field->type)
43 goto err; 46 goto err;
44 47
48 if (filter_type == FILTER_OTHER)
49 field->filter_type = filter_assign_type(type);
50 else
51 field->filter_type = filter_type;
52
45 field->offset = offset; 53 field->offset = offset;
46 field->size = size; 54 field->size = size;
47 field->is_signed = is_signed; 55 field->is_signed = is_signed;
56
48 list_add(&field->link, &call->fields); 57 list_add(&field->link, &call->fields);
49 58
50 return 0; 59 return 0;
@@ -60,6 +69,29 @@ err:
60} 69}
61EXPORT_SYMBOL_GPL(trace_define_field); 70EXPORT_SYMBOL_GPL(trace_define_field);
62 71
72#define __common_field(type, item) \
73 ret = trace_define_field(call, #type, "common_" #item, \
74 offsetof(typeof(ent), item), \
75 sizeof(ent.item), \
76 is_signed_type(type), FILTER_OTHER); \
77 if (ret) \
78 return ret;
79
80int trace_define_common_fields(struct ftrace_event_call *call)
81{
82 int ret;
83 struct trace_entry ent;
84
85 __common_field(unsigned short, type);
86 __common_field(unsigned char, flags);
87 __common_field(unsigned char, preempt_count);
88 __common_field(int, pid);
89 __common_field(int, tgid);
90
91 return ret;
92}
93EXPORT_SYMBOL_GPL(trace_define_common_fields);
94
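With trace_define_field() growing a filter_type argument and trace_define_common_fields() registering the shared trace_entry header, an event's define_fields() callback (invoked further down as call->define_fields(call)) typically ends up looking like the following hedged sketch. The event structure and its field are invented; the helper calls and FILTER_OTHER follow the code above.

        /* Sketch of a define_fields() callback under the new prototype.
         * FILTER_OTHER lets filter_assign_type() pick the predicate type. */
        struct my_event_entry {
                struct trace_entry      ent;    /* common header fields */
                unsigned long           ip;
        };

        static int my_event_define_fields(struct ftrace_event_call *call)
        {
                struct my_event_entry field;
                int ret;

                ret = trace_define_common_fields(call);
                if (ret)
                        return ret;

                return trace_define_field(call, "unsigned long", "ip",
                                          offsetof(typeof(field), ip),
                                          sizeof(field.ip), 0 /* unsigned */,
                                          FILTER_OTHER);
        }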
63#ifdef CONFIG_MODULES 95#ifdef CONFIG_MODULES
64 96
65static void trace_destroy_fields(struct ftrace_event_call *call) 97static void trace_destroy_fields(struct ftrace_event_call *call)
@@ -84,14 +116,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
84 if (call->enabled) { 116 if (call->enabled) {
85 call->enabled = 0; 117 call->enabled = 0;
86 tracing_stop_cmdline_record(); 118 tracing_stop_cmdline_record();
87 call->unregfunc(); 119 call->unregfunc(call->data);
88 } 120 }
89 break; 121 break;
90 case 1: 122 case 1:
91 if (!call->enabled) { 123 if (!call->enabled) {
92 call->enabled = 1; 124 call->enabled = 1;
93 tracing_start_cmdline_record(); 125 tracing_start_cmdline_record();
94 call->regfunc(); 126 call->regfunc(call->data);
95 } 127 }
96 break; 128 break;
97 } 129 }
@@ -574,7 +606,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
574 trace_seq_printf(s, "format:\n"); 606 trace_seq_printf(s, "format:\n");
575 trace_write_header(s); 607 trace_write_header(s);
576 608
577 r = call->show_format(s); 609 r = call->show_format(call, s);
578 if (!r) { 610 if (!r) {
579 /* 611 /*
580 * ug! The format output is bigger than a PAGE!! 612 * ug! The format output is bigger than a PAGE!!
@@ -849,8 +881,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
849 881
850 /* First see if we did not already create this dir */ 882 /* First see if we did not already create this dir */
851 list_for_each_entry(system, &event_subsystems, list) { 883 list_for_each_entry(system, &event_subsystems, list) {
852 if (strcmp(system->name, name) == 0) 884 if (strcmp(system->name, name) == 0) {
885 system->nr_events++;
853 return system->entry; 886 return system->entry;
887 }
854 } 888 }
855 889
856 /* need to create new entry */ 890 /* need to create new entry */
@@ -869,6 +903,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
869 return d_events; 903 return d_events;
870 } 904 }
871 905
906 system->nr_events = 1;
872 system->name = kstrdup(name, GFP_KERNEL); 907 system->name = kstrdup(name, GFP_KERNEL);
873 if (!system->name) { 908 if (!system->name) {
874 debugfs_remove(system->entry); 909 debugfs_remove(system->entry);
@@ -920,15 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
920 if (strcmp(call->system, TRACE_SYSTEM) != 0) 955 if (strcmp(call->system, TRACE_SYSTEM) != 0)
921 d_events = event_subsystem_dir(call->system, d_events); 956 d_events = event_subsystem_dir(call->system, d_events);
922 957
923 if (call->raw_init) {
924 ret = call->raw_init();
925 if (ret < 0) {
926 pr_warning("Could not initialize trace point"
927 " events/%s\n", call->name);
928 return ret;
929 }
930 }
931
932 call->dir = debugfs_create_dir(call->name, d_events); 958 call->dir = debugfs_create_dir(call->name, d_events);
933 if (!call->dir) { 959 if (!call->dir) {
934 pr_warning("Could not create debugfs " 960 pr_warning("Could not create debugfs "
@@ -945,7 +971,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
945 id); 971 id);
946 972
947 if (call->define_fields) { 973 if (call->define_fields) {
948 ret = call->define_fields(); 974 ret = call->define_fields(call);
949 if (ret < 0) { 975 if (ret < 0) {
950 pr_warning("Could not initialize trace point" 976 pr_warning("Could not initialize trace point"
951 " events/%s\n", call->name); 977 " events/%s\n", call->name);
@@ -987,6 +1013,32 @@ struct ftrace_module_file_ops {
987 struct file_operations filter; 1013 struct file_operations filter;
988}; 1014};
989 1015
1016static void remove_subsystem_dir(const char *name)
1017{
1018 struct event_subsystem *system;
1019
1020 if (strcmp(name, TRACE_SYSTEM) == 0)
1021 return;
1022
1023 list_for_each_entry(system, &event_subsystems, list) {
1024 if (strcmp(system->name, name) == 0) {
1025 if (!--system->nr_events) {
1026 struct event_filter *filter = system->filter;
1027
1028 debugfs_remove_recursive(system->entry);
1029 list_del(&system->list);
1030 if (filter) {
1031 kfree(filter->filter_string);
1032 kfree(filter);
1033 }
1034 kfree(system->name);
1035 kfree(system);
1036 }
1037 break;
1038 }
1039 }
1040}
1041
990static struct ftrace_module_file_ops * 1042static struct ftrace_module_file_ops *
991trace_create_file_ops(struct module *mod) 1043trace_create_file_ops(struct module *mod)
992{ 1044{
@@ -1027,6 +1079,7 @@ static void trace_module_add_events(struct module *mod)
1027 struct ftrace_module_file_ops *file_ops = NULL; 1079 struct ftrace_module_file_ops *file_ops = NULL;
1028 struct ftrace_event_call *call, *start, *end; 1080 struct ftrace_event_call *call, *start, *end;
1029 struct dentry *d_events; 1081 struct dentry *d_events;
1082 int ret;
1030 1083
1031 start = mod->trace_events; 1084 start = mod->trace_events;
1032 end = mod->trace_events + mod->num_trace_events; 1085 end = mod->trace_events + mod->num_trace_events;
@@ -1042,7 +1095,15 @@ static void trace_module_add_events(struct module *mod)
1042 /* The linker may leave blanks */ 1095 /* The linker may leave blanks */
1043 if (!call->name) 1096 if (!call->name)
1044 continue; 1097 continue;
1045 1098 if (call->raw_init) {
1099 ret = call->raw_init();
1100 if (ret < 0) {
1101 if (ret != -ENOSYS)
1102 pr_warning("Could not initialize trace "
1103 "point events/%s\n", call->name);
1104 continue;
1105 }
1106 }
1046 /* 1107 /*
1047 * This module has events, create file ops for this module 1108 * This module has events, create file ops for this module
1048 * if not already done. 1109 * if not already done.
@@ -1077,6 +1138,7 @@ static void trace_module_remove_events(struct module *mod)
1077 list_del(&call->list); 1138 list_del(&call->list);
1078 trace_destroy_fields(call); 1139 trace_destroy_fields(call);
1079 destroy_preds(call); 1140 destroy_preds(call);
1141 remove_subsystem_dir(call->system);
1080 } 1142 }
1081 } 1143 }
1082 1144
@@ -1133,6 +1195,18 @@ struct notifier_block trace_module_nb = {
1133extern struct ftrace_event_call __start_ftrace_events[]; 1195extern struct ftrace_event_call __start_ftrace_events[];
1134extern struct ftrace_event_call __stop_ftrace_events[]; 1196extern struct ftrace_event_call __stop_ftrace_events[];
1135 1197
1198static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1199
1200static __init int setup_trace_event(char *str)
1201{
1202 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1203 ring_buffer_expanded = 1;
1204 tracing_selftest_disabled = 1;
1205
1206 return 1;
1207}
1208__setup("trace_event=", setup_trace_event);
1209
1136static __init int event_trace_init(void) 1210static __init int event_trace_init(void)
1137{ 1211{
1138 struct ftrace_event_call *call; 1212 struct ftrace_event_call *call;
@@ -1140,6 +1214,8 @@ static __init int event_trace_init(void)
1140 struct dentry *entry; 1214 struct dentry *entry;
1141 struct dentry *d_events; 1215 struct dentry *d_events;
1142 int ret; 1216 int ret;
1217 char *buf = bootup_event_buf;
1218 char *token;
1143 1219
1144 d_tracer = tracing_init_dentry(); 1220 d_tracer = tracing_init_dentry();
1145 if (!d_tracer) 1221 if (!d_tracer)
@@ -1179,12 +1255,34 @@ static __init int event_trace_init(void)
1179 /* The linker may leave blanks */ 1255 /* The linker may leave blanks */
1180 if (!call->name) 1256 if (!call->name)
1181 continue; 1257 continue;
1258 if (call->raw_init) {
1259 ret = call->raw_init();
1260 if (ret < 0) {
1261 if (ret != -ENOSYS)
1262 pr_warning("Could not initialize trace "
1263 "point events/%s\n", call->name);
1264 continue;
1265 }
1266 }
1182 list_add(&call->list, &ftrace_events); 1267 list_add(&call->list, &ftrace_events);
1183 event_create_dir(call, d_events, &ftrace_event_id_fops, 1268 event_create_dir(call, d_events, &ftrace_event_id_fops,
1184 &ftrace_enable_fops, &ftrace_event_filter_fops, 1269 &ftrace_enable_fops, &ftrace_event_filter_fops,
1185 &ftrace_event_format_fops); 1270 &ftrace_event_format_fops);
1186 } 1271 }
1187 1272
1273 while (true) {
1274 token = strsep(&buf, ",");
1275
1276 if (!token)
1277 break;
1278 if (!*token)
1279 continue;
1280
1281 ret = ftrace_set_clr_event(token, 1);
1282 if (ret)
1283 pr_warning("Failed to enable trace event: %s\n", token);
1284 }
1285
1188 ret = register_module_notifier(&trace_module_nb); 1286 ret = register_module_notifier(&trace_module_nb);
1189 if (ret) 1287 if (ret)
1190 pr_warning("Failed to register trace events module notifier\n"); 1288 pr_warning("Failed to register trace events module notifier\n");
@@ -1340,6 +1438,7 @@ static void
1340function_test_events_call(unsigned long ip, unsigned long parent_ip) 1438function_test_events_call(unsigned long ip, unsigned long parent_ip)
1341{ 1439{
1342 struct ring_buffer_event *event; 1440 struct ring_buffer_event *event;
1441 struct ring_buffer *buffer;
1343 struct ftrace_entry *entry; 1442 struct ftrace_entry *entry;
1344 unsigned long flags; 1443 unsigned long flags;
1345 long disabled; 1444 long disabled;
@@ -1357,7 +1456,8 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1357 1456
1358 local_save_flags(flags); 1457 local_save_flags(flags);
1359 1458
1360 event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), 1459 event = trace_current_buffer_lock_reserve(&buffer,
1460 TRACE_FN, sizeof(*entry),
1361 flags, pc); 1461 flags, pc);
1362 if (!event) 1462 if (!event)
1363 goto out; 1463 goto out;
@@ -1365,7 +1465,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1365 entry->ip = ip; 1465 entry->ip = ip;
1366 entry->parent_ip = parent_ip; 1466 entry->parent_ip = parent_ip;
1367 1467
1368 trace_nowake_buffer_unlock_commit(event, flags, pc); 1468 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1369 1469
1370 out: 1470 out:
1371 atomic_dec(&per_cpu(test_event_disable, cpu)); 1471 atomic_dec(&per_cpu(test_event_disable, cpu));
@@ -1392,10 +1492,10 @@ static __init void event_trace_self_test_with_function(void)
1392 1492
1393static __init int event_trace_self_tests_init(void) 1493static __init int event_trace_self_tests_init(void)
1394{ 1494{
1395 1495 if (!tracing_selftest_disabled) {
1396 event_trace_self_tests(); 1496 event_trace_self_tests();
1397 1497 event_trace_self_test_with_function();
1398 event_trace_self_test_with_function(); 1498 }
1399 1499
1400 return 0; 1500 return 0;
1401} 1501}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f32dc9d1ea7b..93660fbbf629 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -163,6 +163,20 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
163 return match; 163 return match;
164} 164}
165 165
166/* Filter predicate for char * pointers */
167static int filter_pred_pchar(struct filter_pred *pred, void *event,
168 int val1, int val2)
169{
170 char **addr = (char **)(event + pred->offset);
171 int cmp, match;
172
173 cmp = strncmp(*addr, pred->str_val, pred->str_len);
174
175 match = (!cmp) ^ pred->not;
176
177 return match;
178}
179
166/* 180/*
167 * Filter predicate for dynamic sized arrays of characters. 181 * Filter predicate for dynamic sized arrays of characters.
168 * These are implemented through a list of strings at the end 182 * These are implemented through a list of strings at the end
@@ -176,11 +190,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
176static int filter_pred_strloc(struct filter_pred *pred, void *event, 190static int filter_pred_strloc(struct filter_pred *pred, void *event,
177 int val1, int val2) 191 int val1, int val2)
178{ 192{
179 unsigned short str_loc = *(unsigned short *)(event + pred->offset); 193 u32 str_item = *(u32 *)(event + pred->offset);
194 int str_loc = str_item & 0xffff;
195 int str_len = str_item >> 16;
180 char *addr = (char *)(event + str_loc); 196 char *addr = (char *)(event + str_loc);
181 int cmp, match; 197 int cmp, match;
182 198
183 cmp = strncmp(addr, pred->str_val, pred->str_len); 199 cmp = strncmp(addr, pred->str_val, str_len);
184 200
185 match = (!cmp) ^ pred->not; 201 match = (!cmp) ^ pred->not;
186 202
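The strloc predicate above reflects a change in how dynamic string fields are laid out in the record: the 16-bit offset of the string (relative to the start of the entry) is now packed together with a 16-bit length into a single u32, offset in the low half and length in the high half. A tiny sketch of that convention as the predicate decodes it (the helper names are illustrative only):

        /* Packing used for dynamic string fields, as decoded by
         * filter_pred_strloc() above. */
        static inline u32 example_pack_str_loc(unsigned int offset, unsigned int len)
        {
                return (len << 16) | (offset & 0xffff);
        }

        static inline void example_unpack_str_loc(u32 item, int *offset, int *len)
        {
                *offset = item & 0xffff;
                *len = item >> 16;
        }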
@@ -293,7 +309,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
293 struct event_filter *filter = call->filter; 309 struct event_filter *filter = call->filter;
294 310
295 mutex_lock(&event_mutex); 311 mutex_lock(&event_mutex);
296 if (filter->filter_string) 312 if (filter && filter->filter_string)
297 trace_seq_printf(s, "%s\n", filter->filter_string); 313 trace_seq_printf(s, "%s\n", filter->filter_string);
298 else 314 else
299 trace_seq_printf(s, "none\n"); 315 trace_seq_printf(s, "none\n");
@@ -306,7 +322,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
306 struct event_filter *filter = system->filter; 322 struct event_filter *filter = system->filter;
307 323
308 mutex_lock(&event_mutex); 324 mutex_lock(&event_mutex);
309 if (filter->filter_string) 325 if (filter && filter->filter_string)
310 trace_seq_printf(s, "%s\n", filter->filter_string); 326 trace_seq_printf(s, "%s\n", filter->filter_string);
311 else 327 else
312 trace_seq_printf(s, "none\n"); 328 trace_seq_printf(s, "none\n");
@@ -374,6 +390,9 @@ void destroy_preds(struct ftrace_event_call *call)
374 struct event_filter *filter = call->filter; 390 struct event_filter *filter = call->filter;
375 int i; 391 int i;
376 392
393 if (!filter)
394 return;
395
377 for (i = 0; i < MAX_FILTER_PRED; i++) { 396 for (i = 0; i < MAX_FILTER_PRED; i++) {
378 if (filter->preds[i]) 397 if (filter->preds[i])
379 filter_free_pred(filter->preds[i]); 398 filter_free_pred(filter->preds[i]);
@@ -384,17 +403,19 @@ void destroy_preds(struct ftrace_event_call *call)
384 call->filter = NULL; 403 call->filter = NULL;
385} 404}
386 405
387int init_preds(struct ftrace_event_call *call) 406static int init_preds(struct ftrace_event_call *call)
388{ 407{
389 struct event_filter *filter; 408 struct event_filter *filter;
390 struct filter_pred *pred; 409 struct filter_pred *pred;
391 int i; 410 int i;
392 411
412 if (call->filter)
413 return 0;
414
393 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); 415 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
394 if (!call->filter) 416 if (!call->filter)
395 return -ENOMEM; 417 return -ENOMEM;
396 418
397 call->filter_active = 0;
398 filter->n_preds = 0; 419 filter->n_preds = 0;
399 420
400 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); 421 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
@@ -416,30 +437,55 @@ oom:
416 437
417 return -ENOMEM; 438 return -ENOMEM;
418} 439}
419EXPORT_SYMBOL_GPL(init_preds);
420 440
421static void filter_free_subsystem_preds(struct event_subsystem *system) 441static int init_subsystem_preds(struct event_subsystem *system)
422{ 442{
423 struct event_filter *filter = system->filter;
424 struct ftrace_event_call *call; 443 struct ftrace_event_call *call;
425 int i; 444 int err;
426 445
427 if (filter->n_preds) { 446 list_for_each_entry(call, &ftrace_events, list) {
428 for (i = 0; i < filter->n_preds; i++) 447 if (!call->define_fields)
429 filter_free_pred(filter->preds[i]); 448 continue;
430 kfree(filter->preds); 449
431 filter->preds = NULL; 450 if (strcmp(call->system, system->name) != 0)
432 filter->n_preds = 0; 451 continue;
452
453 err = init_preds(call);
454 if (err)
455 return err;
433 } 456 }
434 457
458 return 0;
459}
460
461enum {
462 FILTER_DISABLE_ALL,
463 FILTER_INIT_NO_RESET,
464 FILTER_SKIP_NO_RESET,
465};
466
467static void filter_free_subsystem_preds(struct event_subsystem *system,
468 int flag)
469{
470 struct ftrace_event_call *call;
471
435 list_for_each_entry(call, &ftrace_events, list) { 472 list_for_each_entry(call, &ftrace_events, list) {
436 if (!call->define_fields) 473 if (!call->define_fields)
437 continue; 474 continue;
438 475
439 if (!strcmp(call->system, system->name)) { 476 if (strcmp(call->system, system->name) != 0)
440 filter_disable_preds(call); 477 continue;
441 remove_filter_string(call->filter); 478
479 if (flag == FILTER_INIT_NO_RESET) {
480 call->filter->no_reset = false;
481 continue;
442 } 482 }
483
484 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
485 continue;
486
487 filter_disable_preds(call);
488 remove_filter_string(call->filter);
443 } 489 }
444} 490}
445 491
@@ -468,12 +514,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
468 return 0; 514 return 0;
469} 515}
470 516
471enum { 517int filter_assign_type(const char *type)
472 FILTER_STATIC_STRING = 1,
473 FILTER_DYN_STRING
474};
475
476static int is_string_field(const char *type)
477{ 518{
478 if (strstr(type, "__data_loc") && strstr(type, "char")) 519 if (strstr(type, "__data_loc") && strstr(type, "char"))
479 return FILTER_DYN_STRING; 520 return FILTER_DYN_STRING;
@@ -481,12 +522,19 @@ static int is_string_field(const char *type)
481 if (strchr(type, '[') && strstr(type, "char")) 522 if (strchr(type, '[') && strstr(type, "char"))
482 return FILTER_STATIC_STRING; 523 return FILTER_STATIC_STRING;
483 524
484 return 0; 525 return FILTER_OTHER;
526}
527
528static bool is_string_field(struct ftrace_event_field *field)
529{
530 return field->filter_type == FILTER_DYN_STRING ||
531 field->filter_type == FILTER_STATIC_STRING ||
532 field->filter_type == FILTER_PTR_STRING;
485} 533}
486 534
487static int is_legal_op(struct ftrace_event_field *field, int op) 535static int is_legal_op(struct ftrace_event_field *field, int op)
488{ 536{
489 if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) 537 if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
490 return 0; 538 return 0;
491 539
492 return 1; 540 return 1;
@@ -537,22 +585,24 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
537 585
538static int filter_add_pred(struct filter_parse_state *ps, 586static int filter_add_pred(struct filter_parse_state *ps,
539 struct ftrace_event_call *call, 587 struct ftrace_event_call *call,
540 struct filter_pred *pred) 588 struct filter_pred *pred,
589 bool dry_run)
541{ 590{
542 struct ftrace_event_field *field; 591 struct ftrace_event_field *field;
543 filter_pred_fn_t fn; 592 filter_pred_fn_t fn;
544 unsigned long long val; 593 unsigned long long val;
545 int string_type;
546 int ret; 594 int ret;
547 595
548 pred->fn = filter_pred_none; 596 pred->fn = filter_pred_none;
549 597
550 if (pred->op == OP_AND) { 598 if (pred->op == OP_AND) {
551 pred->pop_n = 2; 599 pred->pop_n = 2;
552 return filter_add_pred_fn(ps, call, pred, filter_pred_and); 600 fn = filter_pred_and;
601 goto add_pred_fn;
553 } else if (pred->op == OP_OR) { 602 } else if (pred->op == OP_OR) {
554 pred->pop_n = 2; 603 pred->pop_n = 2;
555 return filter_add_pred_fn(ps, call, pred, filter_pred_or); 604 fn = filter_pred_or;
605 goto add_pred_fn;
556 } 606 }
557 607
558 field = find_event_field(call, pred->field_name); 608 field = find_event_field(call, pred->field_name);
@@ -568,16 +618,17 @@ static int filter_add_pred(struct filter_parse_state *ps,
568 return -EINVAL; 618 return -EINVAL;
569 } 619 }
570 620
571 string_type = is_string_field(field->type); 621 if (is_string_field(field)) {
572 if (string_type) { 622 pred->str_len = field->size;
573 if (string_type == FILTER_STATIC_STRING) 623
624 if (field->filter_type == FILTER_STATIC_STRING)
574 fn = filter_pred_string; 625 fn = filter_pred_string;
575 else 626 else if (field->filter_type == FILTER_DYN_STRING)
576 fn = filter_pred_strloc; 627 fn = filter_pred_strloc;
577 pred->str_len = field->size; 628 else {
578 if (pred->op == OP_NE) 629 fn = filter_pred_pchar;
579 pred->not = 1; 630 pred->str_len = strlen(pred->str_val);
580 return filter_add_pred_fn(ps, call, pred, fn); 631 }
581 } else { 632 } else {
582 if (field->is_signed) 633 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val); 634 ret = strict_strtoll(pred->str_val, 0, &val);
@@ -588,41 +639,33 @@ static int filter_add_pred(struct filter_parse_state *ps,
588 return -EINVAL; 639 return -EINVAL;
589 } 640 }
590 pred->val = val; 641 pred->val = val;
591 }
592 642
593 fn = select_comparison_fn(pred->op, field->size, field->is_signed); 643 fn = select_comparison_fn(pred->op, field->size,
594 if (!fn) { 644 field->is_signed);
595 parse_error(ps, FILT_ERR_INVALID_OP, 0); 645 if (!fn) {
596 return -EINVAL; 646 parse_error(ps, FILT_ERR_INVALID_OP, 0);
647 return -EINVAL;
648 }
597 } 649 }
598 650
599 if (pred->op == OP_NE) 651 if (pred->op == OP_NE)
600 pred->not = 1; 652 pred->not = 1;
601 653
602 return filter_add_pred_fn(ps, call, pred, fn); 654add_pred_fn:
655 if (!dry_run)
656 return filter_add_pred_fn(ps, call, pred, fn);
657 return 0;
603} 658}
604 659
605static int filter_add_subsystem_pred(struct filter_parse_state *ps, 660static int filter_add_subsystem_pred(struct filter_parse_state *ps,
606 struct event_subsystem *system, 661 struct event_subsystem *system,
607 struct filter_pred *pred, 662 struct filter_pred *pred,
608 char *filter_string) 663 char *filter_string,
664 bool dry_run)
609{ 665{
610 struct event_filter *filter = system->filter;
611 struct ftrace_event_call *call; 666 struct ftrace_event_call *call;
612 int err = 0; 667 int err = 0;
613 668 bool fail = true;
614 if (!filter->preds) {
615 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
616 GFP_KERNEL);
617
618 if (!filter->preds)
619 return -ENOMEM;
620 }
621
622 if (filter->n_preds == MAX_FILTER_PRED) {
623 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
624 return -ENOSPC;
625 }
626 669
627 list_for_each_entry(call, &ftrace_events, list) { 670 list_for_each_entry(call, &ftrace_events, list) {
628 671
@@ -632,19 +675,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
632 if (strcmp(call->system, system->name)) 675 if (strcmp(call->system, system->name))
633 continue; 676 continue;
634 677
635 err = filter_add_pred(ps, call, pred); 678 if (call->filter->no_reset)
636 if (err) { 679 continue;
637 filter_free_subsystem_preds(system); 680
638 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 681 err = filter_add_pred(ps, call, pred, dry_run);
639 goto out; 682 if (err)
640 } 683 call->filter->no_reset = true;
641 replace_filter_string(call->filter, filter_string); 684 else
685 fail = false;
686
687 if (!dry_run)
688 replace_filter_string(call->filter, filter_string);
642 } 689 }
643 690
644 filter->preds[filter->n_preds] = pred; 691 if (fail) {
645 filter->n_preds++; 692 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
646out: 693 return err;
647 return err; 694 }
695 return 0;
648} 696}
649 697
650static void parse_init(struct filter_parse_state *ps, 698static void parse_init(struct filter_parse_state *ps,
@@ -1003,12 +1051,14 @@ static int check_preds(struct filter_parse_state *ps)
1003static int replace_preds(struct event_subsystem *system, 1051static int replace_preds(struct event_subsystem *system,
1004 struct ftrace_event_call *call, 1052 struct ftrace_event_call *call,
1005 struct filter_parse_state *ps, 1053 struct filter_parse_state *ps,
1006 char *filter_string) 1054 char *filter_string,
1055 bool dry_run)
1007{ 1056{
1008 char *operand1 = NULL, *operand2 = NULL; 1057 char *operand1 = NULL, *operand2 = NULL;
1009 struct filter_pred *pred; 1058 struct filter_pred *pred;
1010 struct postfix_elt *elt; 1059 struct postfix_elt *elt;
1011 int err; 1060 int err;
1061 int n_preds = 0;
1012 1062
1013 err = check_preds(ps); 1063 err = check_preds(ps);
1014 if (err) 1064 if (err)
@@ -1027,24 +1077,14 @@ static int replace_preds(struct event_subsystem *system,
1027 continue; 1077 continue;
1028 } 1078 }
1029 1079
1080 if (n_preds++ == MAX_FILTER_PRED) {
1081 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1082 return -ENOSPC;
1083 }
1084
1030 if (elt->op == OP_AND || elt->op == OP_OR) { 1085 if (elt->op == OP_AND || elt->op == OP_OR) {
1031 pred = create_logical_pred(elt->op); 1086 pred = create_logical_pred(elt->op);
1032 if (!pred) 1087 goto add_pred;
1033 return -ENOMEM;
1034 if (call) {
1035 err = filter_add_pred(ps, call, pred);
1036 filter_free_pred(pred);
1037 } else {
1038 err = filter_add_subsystem_pred(ps, system,
1039 pred, filter_string);
1040 if (err)
1041 filter_free_pred(pred);
1042 }
1043 if (err)
1044 return err;
1045
1046 operand1 = operand2 = NULL;
1047 continue;
1048 } 1088 }
1049 1089
1050 if (!operand1 || !operand2) { 1090 if (!operand1 || !operand2) {
@@ -1053,17 +1093,15 @@ static int replace_preds(struct event_subsystem *system,
1053 } 1093 }
1054 1094
1055 pred = create_pred(elt->op, operand1, operand2); 1095 pred = create_pred(elt->op, operand1, operand2);
1096add_pred:
1056 if (!pred) 1097 if (!pred)
1057 return -ENOMEM; 1098 return -ENOMEM;
1058 if (call) { 1099 if (call)
1059 err = filter_add_pred(ps, call, pred); 1100 err = filter_add_pred(ps, call, pred, false);
1060 filter_free_pred(pred); 1101 else
1061 } else {
1062 err = filter_add_subsystem_pred(ps, system, pred, 1102 err = filter_add_subsystem_pred(ps, system, pred,
1063 filter_string); 1103 filter_string, dry_run);
1064 if (err) 1104 filter_free_pred(pred);
1065 filter_free_pred(pred);
1066 }
1067 if (err) 1105 if (err)
1068 return err; 1106 return err;
1069 1107
@@ -1081,6 +1119,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1081 1119
1082 mutex_lock(&event_mutex); 1120 mutex_lock(&event_mutex);
1083 1121
1122 err = init_preds(call);
1123 if (err)
1124 goto out_unlock;
1125
1084 if (!strcmp(strstrip(filter_string), "0")) { 1126 if (!strcmp(strstrip(filter_string), "0")) {
1085 filter_disable_preds(call); 1127 filter_disable_preds(call);
1086 remove_filter_string(call->filter); 1128 remove_filter_string(call->filter);
@@ -1103,7 +1145,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1103 goto out; 1145 goto out;
1104 } 1146 }
1105 1147
1106 err = replace_preds(NULL, call, ps, filter_string); 1148 err = replace_preds(NULL, call, ps, filter_string, false);
1107 if (err) 1149 if (err)
1108 append_filter_err(ps, call->filter); 1150 append_filter_err(ps, call->filter);
1109 1151
@@ -1126,8 +1168,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1126 1168
1127 mutex_lock(&event_mutex); 1169 mutex_lock(&event_mutex);
1128 1170
1171 err = init_subsystem_preds(system);
1172 if (err)
1173 goto out_unlock;
1174
1129 if (!strcmp(strstrip(filter_string), "0")) { 1175 if (!strcmp(strstrip(filter_string), "0")) {
1130 filter_free_subsystem_preds(system); 1176 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
1131 remove_filter_string(system->filter); 1177 remove_filter_string(system->filter);
1132 mutex_unlock(&event_mutex); 1178 mutex_unlock(&event_mutex);
1133 return 0; 1179 return 0;
@@ -1138,7 +1184,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1138 if (!ps) 1184 if (!ps)
1139 goto out_unlock; 1185 goto out_unlock;
1140 1186
1141 filter_free_subsystem_preds(system);
1142 replace_filter_string(system->filter, filter_string); 1187 replace_filter_string(system->filter, filter_string);
1143 1188
1144 parse_init(ps, filter_ops, filter_string); 1189 parse_init(ps, filter_ops, filter_string);
@@ -1148,9 +1193,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1148 goto out; 1193 goto out;
1149 } 1194 }
1150 1195
1151 err = replace_preds(system, NULL, ps, filter_string); 1196 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
1152 if (err) 1197
 1198 /* try to see which events the filter can be applied to */
1199 err = replace_preds(system, NULL, ps, filter_string, true);
1200 if (err) {
1153 append_filter_err(ps, system->filter); 1201 append_filter_err(ps, system->filter);
1202 goto out;
1203 }
1204
1205 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
1206
1207 /* really apply the filter to the events */
1208 err = replace_preds(system, NULL, ps, filter_string, false);
1209 if (err) {
1210 append_filter_err(ps, system->filter);
 1211 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
1212 }
1154 1213
1155out: 1214out:
1156 filter_opstack_clear(ps); 1215 filter_opstack_clear(ps);
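A standalone sketch (not kernel code) of the two-pass pattern apply_subsystem_event_filter() now uses above: validate the whole request with a dry run first, and only mutate state once validation has passed. All names below are hypothetical.

#include <stdio.h>
#include <string.h>

static int apply_filter(char *state, const char *filter, int dry_run)
{
	if (strchr(filter, '!'))		/* pretend '!' is an invalid token */
		return -1;
	if (!dry_run)
		strcpy(state, filter);		/* really apply */
	return 0;
}

int main(void)
{
	char state[32] = "none";

	if (apply_filter(state, "pid == 1", 1) == 0)	/* pass 1: dry run */
		apply_filter(state, "pid == 1", 0);	/* pass 2: apply */
	printf("filter: %s\n", state);
	return 0;
}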
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d06cf898dc86..df1bf6e48bb9 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -60,7 +60,8 @@ extern void __bad_type_size(void);
60#undef TRACE_EVENT_FORMAT 60#undef TRACE_EVENT_FORMAT
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
62static int \ 62static int \
63ftrace_format_##call(struct trace_seq *s) \ 63ftrace_format_##call(struct ftrace_event_call *unused, \
64 struct trace_seq *s) \
64{ \ 65{ \
65 struct args field; \ 66 struct args field; \
66 int ret; \ 67 int ret; \
@@ -76,7 +77,8 @@ ftrace_format_##call(struct trace_seq *s) \
76#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 77#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
77 tpfmt) \ 78 tpfmt) \
78static int \ 79static int \
79ftrace_format_##call(struct trace_seq *s) \ 80ftrace_format_##call(struct ftrace_event_call *unused, \
81 struct trace_seq *s) \
80{ \ 82{ \
81 struct args field; \ 83 struct args field; \
82 int ret; \ 84 int ret; \
@@ -117,7 +119,7 @@ ftrace_format_##call(struct trace_seq *s) \
117 119
118#undef TRACE_EVENT_FORMAT 120#undef TRACE_EVENT_FORMAT
119#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 121#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
120int ftrace_define_fields_##call(void); \ 122int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \
121static int ftrace_raw_init_event_##call(void); \ 123static int ftrace_raw_init_event_##call(void); \
122 \ 124 \
123struct ftrace_event_call __used \ 125struct ftrace_event_call __used \
@@ -133,7 +135,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
133static int ftrace_raw_init_event_##call(void) \ 135static int ftrace_raw_init_event_##call(void) \
134{ \ 136{ \
135 INIT_LIST_HEAD(&event_##call.fields); \ 137 INIT_LIST_HEAD(&event_##call.fields); \
136 init_preds(&event_##call); \
137 return 0; \ 138 return 0; \
138} \ 139} \
139 140
@@ -156,7 +157,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
156#define TRACE_FIELD(type, item, assign) \ 157#define TRACE_FIELD(type, item, assign) \
157 ret = trace_define_field(event_call, #type, #item, \ 158 ret = trace_define_field(event_call, #type, #item, \
158 offsetof(typeof(field), item), \ 159 offsetof(typeof(field), item), \
159 sizeof(field.item), is_signed_type(type)); \ 160 sizeof(field.item), \
161 is_signed_type(type), FILTER_OTHER); \
160 if (ret) \ 162 if (ret) \
161 return ret; 163 return ret;
162 164
@@ -164,7 +166,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
164#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ 166#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 167 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
166 offsetof(typeof(field), item), \ 168 offsetof(typeof(field), item), \
167 sizeof(field.item), 0); \ 169 sizeof(field.item), 0, FILTER_OTHER); \
168 if (ret) \ 170 if (ret) \
169 return ret; 171 return ret;
170 172
@@ -172,7 +174,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
172#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 174#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
173 ret = trace_define_field(event_call, #type, #item, \ 175 ret = trace_define_field(event_call, #type, #item, \
174 offsetof(typeof(field), item), \ 176 offsetof(typeof(field), item), \
175 sizeof(field.item), is_signed); \ 177 sizeof(field.item), is_signed, \
178 FILTER_OTHER); \
176 if (ret) \ 179 if (ret) \
177 return ret; 180 return ret;
178 181
@@ -182,17 +185,14 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
182#undef TRACE_EVENT_FORMAT 185#undef TRACE_EVENT_FORMAT
183#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 186#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
184int \ 187int \
185ftrace_define_fields_##call(void) \ 188ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
186{ \ 189{ \
187 struct ftrace_event_call *event_call = &event_##call; \
188 struct args field; \ 190 struct args field; \
189 int ret; \ 191 int ret; \
190 \ 192 \
191 __common_field(unsigned char, type, 0); \ 193 ret = trace_define_common_fields(event_call); \
192 __common_field(unsigned char, flags, 0); \ 194 if (ret) \
193 __common_field(unsigned char, preempt_count, 0); \ 195 return ret; \
194 __common_field(int, pid, 1); \
195 __common_field(int, tgid, 1); \
196 \ 196 \
197 tstruct; \ 197 tstruct; \
198 \ 198 \
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 75ef000613c3..5b01b94518fc 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -288,11 +288,9 @@ static int
288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
289 struct ftrace_probe_ops *ops, void *data) 289 struct ftrace_probe_ops *ops, void *data)
290{ 290{
291 char str[KSYM_SYMBOL_LEN];
292 long count = (long)data; 291 long count = (long)data;
293 292
294 kallsyms_lookup(ip, NULL, NULL, NULL, str); 293 seq_printf(m, "%pf:", (void *)ip);
295 seq_printf(m, "%s:", str);
296 294
297 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
298 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 420ec3487579..b3749a2c3132 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = {
52 .opts = trace_opts 52 .opts = trace_opts
53}; 53};
54 54
55/* pid on the last trace processed */ 55static struct trace_array *graph_array;
56 56
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
@@ -166,10 +166,123 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
166 return ret; 166 return ret;
167} 167}
168 168
169static int __trace_graph_entry(struct trace_array *tr,
170 struct ftrace_graph_ent *trace,
171 unsigned long flags,
172 int pc)
173{
174 struct ftrace_event_call *call = &event_funcgraph_entry;
175 struct ring_buffer_event *event;
176 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry;
178
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
180 return 0;
181
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
183 sizeof(*entry), flags, pc);
184 if (!event)
185 return 0;
186 entry = ring_buffer_event_data(event);
187 entry->graph_ent = *trace;
188 if (!filter_current_check_discard(buffer, call, entry, event))
189 ring_buffer_unlock_commit(buffer, event);
190
191 return 1;
192}
193
194int trace_graph_entry(struct ftrace_graph_ent *trace)
195{
196 struct trace_array *tr = graph_array;
197 struct trace_array_cpu *data;
198 unsigned long flags;
199 long disabled;
200 int ret;
201 int cpu;
202 int pc;
203
204 if (unlikely(!tr))
205 return 0;
206
207 if (!ftrace_trace_task(current))
208 return 0;
209
210 if (!ftrace_graph_addr(trace->func))
211 return 0;
212
213 local_irq_save(flags);
214 cpu = raw_smp_processor_id();
215 data = tr->data[cpu];
216 disabled = atomic_inc_return(&data->disabled);
217 if (likely(disabled == 1)) {
218 pc = preempt_count();
219 ret = __trace_graph_entry(tr, trace, flags, pc);
220 } else {
221 ret = 0;
222 }
223 /* Only do the atomic if it is not already set */
224 if (!test_tsk_trace_graph(current))
225 set_tsk_trace_graph(current);
226
227 atomic_dec(&data->disabled);
228 local_irq_restore(flags);
229
230 return ret;
231}
232
233static void __trace_graph_return(struct trace_array *tr,
234 struct ftrace_graph_ret *trace,
235 unsigned long flags,
236 int pc)
237{
238 struct ftrace_event_call *call = &event_funcgraph_exit;
239 struct ring_buffer_event *event;
240 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry;
242
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
244 return;
245
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
247 sizeof(*entry), flags, pc);
248 if (!event)
249 return;
250 entry = ring_buffer_event_data(event);
251 entry->ret = *trace;
252 if (!filter_current_check_discard(buffer, call, entry, event))
253 ring_buffer_unlock_commit(buffer, event);
254}
255
256void trace_graph_return(struct ftrace_graph_ret *trace)
257{
258 struct trace_array *tr = graph_array;
259 struct trace_array_cpu *data;
260 unsigned long flags;
261 long disabled;
262 int cpu;
263 int pc;
264
265 local_irq_save(flags);
266 cpu = raw_smp_processor_id();
267 data = tr->data[cpu];
268 disabled = atomic_inc_return(&data->disabled);
269 if (likely(disabled == 1)) {
270 pc = preempt_count();
271 __trace_graph_return(tr, trace, flags, pc);
272 }
273 if (!trace->depth)
274 clear_tsk_trace_graph(current);
275 atomic_dec(&data->disabled);
276 local_irq_restore(flags);
277}
278
169static int graph_trace_init(struct trace_array *tr) 279static int graph_trace_init(struct trace_array *tr)
170{ 280{
171 int ret = register_ftrace_graph(&trace_graph_return, 281 int ret;
172 &trace_graph_entry); 282
283 graph_array = tr;
284 ret = register_ftrace_graph(&trace_graph_return,
285 &trace_graph_entry);
173 if (ret) 286 if (ret)
174 return ret; 287 return ret;
175 tracing_start_cmdline_record(); 288 tracing_start_cmdline_record();
@@ -177,49 +290,30 @@ static int graph_trace_init(struct trace_array *tr)
177 return 0; 290 return 0;
178} 291}
179 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296}
297
180static void graph_trace_reset(struct trace_array *tr) 298static void graph_trace_reset(struct trace_array *tr)
181{ 299{
182 tracing_stop_cmdline_record(); 300 tracing_stop_cmdline_record();
183 unregister_ftrace_graph(); 301 unregister_ftrace_graph();
184} 302}
185 303
186static inline int log10_cpu(int nb) 304static int max_bytes_for_cpu;
187{
188 if (nb / 100)
189 return 3;
190 if (nb / 10)
191 return 2;
192 return 1;
193}
194 305
195static enum print_line_t 306static enum print_line_t
196print_graph_cpu(struct trace_seq *s, int cpu) 307print_graph_cpu(struct trace_seq *s, int cpu)
197{ 308{
198 int i;
199 int ret; 309 int ret;
200 int log10_this = log10_cpu(cpu);
201 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
202
203 310
204 /* 311 /*
205 * Start with a space character - to make it stand out 312 * Start with a space character - to make it stand out
206 * to the right a bit when trace output is pasted into 313 * to the right a bit when trace output is pasted into
207 * email: 314 * email:
208 */ 315 */
209 ret = trace_seq_printf(s, " "); 316 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
210
211 /*
212 * Tricky - we space the CPU field according to the max
213 * number of online CPUs. On a 2-cpu system it would take
214 * a maximum of 1 digit - on a 128 cpu system it would
215 * take up to 3 digits:
216 */
217 for (i = 0; i < log10_all - log10_this; i++) {
218 ret = trace_seq_printf(s, " ");
219 if (!ret)
220 return TRACE_TYPE_PARTIAL_LINE;
221 }
222 ret = trace_seq_printf(s, "%d) ", cpu);
223 if (!ret) 317 if (!ret)
224 return TRACE_TYPE_PARTIAL_LINE; 318 return TRACE_TYPE_PARTIAL_LINE;
225 319
@@ -565,11 +659,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
565 return TRACE_TYPE_PARTIAL_LINE; 659 return TRACE_TYPE_PARTIAL_LINE;
566 } 660 }
567 661
568 ret = seq_print_ip_sym(s, call->func, 0); 662 ret = trace_seq_printf(s, "%pf();\n", (void *)call->func);
569 if (!ret)
570 return TRACE_TYPE_PARTIAL_LINE;
571
572 ret = trace_seq_printf(s, "();\n");
573 if (!ret) 663 if (!ret)
574 return TRACE_TYPE_PARTIAL_LINE; 664 return TRACE_TYPE_PARTIAL_LINE;
575 665
@@ -612,11 +702,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
612 return TRACE_TYPE_PARTIAL_LINE; 702 return TRACE_TYPE_PARTIAL_LINE;
613 } 703 }
614 704
615 ret = seq_print_ip_sym(s, call->func, 0); 705 ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func);
616 if (!ret)
617 return TRACE_TYPE_PARTIAL_LINE;
618
619 ret = trace_seq_printf(s, "() {\n");
620 if (!ret) 706 if (!ret)
621 return TRACE_TYPE_PARTIAL_LINE; 707 return TRACE_TYPE_PARTIAL_LINE;
622 708
@@ -934,6 +1020,8 @@ static struct tracer graph_trace __read_mostly = {
934 1020
935static __init int init_graph_trace(void) 1021static __init int init_graph_trace(void)
936{ 1022{
1023 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1024
937 return register_tracer(&graph_trace); 1025 return register_tracer(&graph_trace);
938} 1026}
939 1027
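A standalone sketch (not kernel code) of the width trick init_graph_trace() adopts above: snprintf(NULL, 0, ...) returns the number of bytes the formatted output would need, and "%*d" then right-pads every CPU number to that fixed width, replacing the old log10_cpu() padding loop. The value of nr_cpu_ids below is a made-up example.

#include <stdio.h>

int main(void)
{
	int nr_cpu_ids = 128;				/* hypothetical value */
	int max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);

	for (int cpu = 0; cpu < nr_cpu_ids; cpu += 63)
		printf(" %*d) ...\n", max_bytes_for_cpu, cpu);
	return 0;
}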
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b923d13e2fad..5555b75a0d12 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -178,7 +178,6 @@ out_unlock:
178out: 178out:
179 data->critical_sequence = max_sequence; 179 data->critical_sequence = max_sequence;
180 data->preempt_timestamp = ftrace_now(cpu); 180 data->preempt_timestamp = ftrace_now(cpu);
181 tracing_reset(tr, cpu);
182 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 181 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
183} 182}
184 183
@@ -208,7 +207,6 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
208 data->critical_sequence = max_sequence; 207 data->critical_sequence = max_sequence;
209 data->preempt_timestamp = ftrace_now(cpu); 208 data->preempt_timestamp = ftrace_now(cpu);
210 data->critical_start = parent_ip ? : ip; 209 data->critical_start = parent_ip ? : ip;
211 tracing_reset(tr, cpu);
212 210
213 local_save_flags(flags); 211 local_save_flags(flags);
214 212
@@ -379,6 +377,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
379 irqsoff_trace = tr; 377 irqsoff_trace = tr;
380 /* make sure that the tracer is visible */ 378 /* make sure that the tracer is visible */
381 smp_wmb(); 379 smp_wmb();
380 tracing_reset_online_cpus(tr);
382 start_irqsoff_tracer(tr); 381 start_irqsoff_tracer(tr);
383} 382}
384 383
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index d53b45ed0806..c4c9bbda53d3 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,11 +307,12 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ring_buffer *buffer = tr->buffer;
310 struct ring_buffer_event *event; 311 struct ring_buffer_event *event;
311 struct trace_mmiotrace_rw *entry; 312 struct trace_mmiotrace_rw *entry;
312 int pc = preempt_count(); 313 int pc = preempt_count();
313 314
314 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, 315 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
315 sizeof(*entry), 0, pc); 316 sizeof(*entry), 0, pc);
316 if (!event) { 317 if (!event) {
317 atomic_inc(&dropped_count); 318 atomic_inc(&dropped_count);
@@ -319,7 +320,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
319 } 320 }
320 entry = ring_buffer_event_data(event); 321 entry = ring_buffer_event_data(event);
321 entry->rw = *rw; 322 entry->rw = *rw;
322 trace_buffer_unlock_commit(tr, event, 0, pc); 323 trace_buffer_unlock_commit(buffer, event, 0, pc);
323} 324}
324 325
325void mmio_trace_rw(struct mmiotrace_rw *rw) 326void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -333,11 +334,12 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
333 struct trace_array_cpu *data, 334 struct trace_array_cpu *data,
334 struct mmiotrace_map *map) 335 struct mmiotrace_map *map)
335{ 336{
337 struct ring_buffer *buffer = tr->buffer;
336 struct ring_buffer_event *event; 338 struct ring_buffer_event *event;
337 struct trace_mmiotrace_map *entry; 339 struct trace_mmiotrace_map *entry;
338 int pc = preempt_count(); 340 int pc = preempt_count();
339 341
340 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, 342 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
341 sizeof(*entry), 0, pc); 343 sizeof(*entry), 0, pc);
342 if (!event) { 344 if (!event) {
343 atomic_inc(&dropped_count); 345 atomic_inc(&dropped_count);
@@ -345,7 +347,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
345 } 347 }
346 entry = ring_buffer_event_data(event); 348 entry = ring_buffer_event_data(event);
347 entry->map = *map; 349 entry->map = *map;
348 trace_buffer_unlock_commit(tr, event, 0, pc); 350 trace_buffer_unlock_commit(buffer, event, 0, pc);
349} 351}
350 352
351void mmio_trace_mapping(struct mmiotrace_map *map) 353void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 8a30d9874cd4..fe1a00f1445a 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -38,6 +38,7 @@ static void probe_power_end(struct power_trace *it)
38{ 38{
39 struct ftrace_event_call *call = &event_power; 39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event; 40 struct ring_buffer_event *event;
41 struct ring_buffer *buffer;
41 struct trace_power *entry; 42 struct trace_power *entry;
42 struct trace_array_cpu *data; 43 struct trace_array_cpu *data;
43 struct trace_array *tr = power_trace; 44 struct trace_array *tr = power_trace;
@@ -45,18 +46,20 @@ static void probe_power_end(struct power_trace *it)
45 if (!trace_power_enabled) 46 if (!trace_power_enabled)
46 return; 47 return;
47 48
49 buffer = tr->buffer;
50
48 preempt_disable(); 51 preempt_disable();
49 it->end = ktime_get(); 52 it->end = ktime_get();
50 data = tr->data[smp_processor_id()]; 53 data = tr->data[smp_processor_id()];
51 54
52 event = trace_buffer_lock_reserve(tr, TRACE_POWER, 55 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
53 sizeof(*entry), 0, 0); 56 sizeof(*entry), 0, 0);
54 if (!event) 57 if (!event)
55 goto out; 58 goto out;
56 entry = ring_buffer_event_data(event); 59 entry = ring_buffer_event_data(event);
57 entry->state_data = *it; 60 entry->state_data = *it;
58 if (!filter_check_discard(call, entry, tr->buffer, event)) 61 if (!filter_check_discard(call, entry, buffer, event))
59 trace_buffer_unlock_commit(tr, event, 0, 0); 62 trace_buffer_unlock_commit(buffer, event, 0, 0);
60 out: 63 out:
61 preempt_enable(); 64 preempt_enable();
62} 65}
@@ -66,6 +69,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
66{ 69{
67 struct ftrace_event_call *call = &event_power; 70 struct ftrace_event_call *call = &event_power;
68 struct ring_buffer_event *event; 71 struct ring_buffer_event *event;
72 struct ring_buffer *buffer;
69 struct trace_power *entry; 73 struct trace_power *entry;
70 struct trace_array_cpu *data; 74 struct trace_array_cpu *data;
71 struct trace_array *tr = power_trace; 75 struct trace_array *tr = power_trace;
@@ -73,6 +77,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
73 if (!trace_power_enabled) 77 if (!trace_power_enabled)
74 return; 78 return;
75 79
80 buffer = tr->buffer;
81
76 memset(it, 0, sizeof(struct power_trace)); 82 memset(it, 0, sizeof(struct power_trace));
77 it->state = level; 83 it->state = level;
78 it->type = type; 84 it->type = type;
@@ -81,14 +87,14 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
81 it->end = it->stamp; 87 it->end = it->stamp;
82 data = tr->data[smp_processor_id()]; 88 data = tr->data[smp_processor_id()];
83 89
84 event = trace_buffer_lock_reserve(tr, TRACE_POWER, 90 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
85 sizeof(*entry), 0, 0); 91 sizeof(*entry), 0, 0);
86 if (!event) 92 if (!event)
87 goto out; 93 goto out;
88 entry = ring_buffer_event_data(event); 94 entry = ring_buffer_event_data(event);
89 entry->state_data = *it; 95 entry->state_data = *it;
90 if (!filter_check_discard(call, entry, tr->buffer, event)) 96 if (!filter_check_discard(call, entry, buffer, event))
91 trace_buffer_unlock_commit(tr, event, 0, 0); 97 trace_buffer_unlock_commit(buffer, event, 0, 0);
92 out: 98 out:
93 preempt_enable(); 99 preempt_enable();
94} 100}
@@ -144,14 +150,12 @@ static void power_trace_reset(struct trace_array *tr)
144 150
145static int power_trace_init(struct trace_array *tr) 151static int power_trace_init(struct trace_array *tr)
146{ 152{
147 int cpu;
148 power_trace = tr; 153 power_trace = tr;
149 154
150 trace_power_enabled = 1; 155 trace_power_enabled = 1;
151 tracing_power_register(); 156 tracing_power_register();
152 157
153 for_each_cpu(cpu, cpu_possible_mask) 158 tracing_reset_online_cpus(tr);
154 tracing_reset(tr, cpu);
155 return 0; 159 return 0;
156} 160}
157 161
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a98106dd979c..5fca0f51fde4 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,6 +20,35 @@ static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped; 21static int sched_stopped;
22 22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51
23static void 52static void
24probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(struct rq *__rq, struct task_struct *prev,
25 struct task_struct *next) 54 struct task_struct *next)
@@ -49,6 +78,36 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
49 local_irq_restore(flags); 78 local_irq_restore(flags);
50} 79}
51 80
81void
82tracing_sched_wakeup_trace(struct trace_array *tr,
83 struct task_struct *wakee,
84 struct task_struct *curr,
85 unsigned long flags, int pc)
86{
87 struct ftrace_event_call *call = &event_wakeup;
88 struct ring_buffer_event *event;
89 struct ctx_switch_entry *entry;
90 struct ring_buffer *buffer = tr->buffer;
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
93 sizeof(*entry), flags, pc);
94 if (!event)
95 return;
96 entry = ring_buffer_event_data(event);
97 entry->prev_pid = curr->pid;
98 entry->prev_prio = curr->prio;
99 entry->prev_state = curr->state;
100 entry->next_pid = wakee->pid;
101 entry->next_prio = wakee->prio;
102 entry->next_state = wakee->state;
103 entry->next_cpu = task_cpu(wakee);
104
105 if (!filter_check_discard(call, entry, buffer, event))
106 ring_buffer_unlock_commit(buffer, event);
107 ftrace_trace_stack(tr->buffer, flags, 6, pc);
108 ftrace_trace_userstack(tr->buffer, flags, pc);
109}
110
52static void 111static void
53probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
54{ 113{
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index eacb27225173..ad69f105a7c6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -186,11 +186,6 @@ out:
186 186
187static void __wakeup_reset(struct trace_array *tr) 187static void __wakeup_reset(struct trace_array *tr)
188{ 188{
189 int cpu;
190
191 for_each_possible_cpu(cpu)
192 tracing_reset(tr, cpu);
193
194 wakeup_cpu = -1; 189 wakeup_cpu = -1;
195 wakeup_prio = -1; 190 wakeup_prio = -1;
196 191
@@ -204,6 +199,8 @@ static void wakeup_reset(struct trace_array *tr)
204{ 199{
205 unsigned long flags; 200 unsigned long flags;
206 201
202 tracing_reset_online_cpus(tr);
203
207 local_irq_save(flags); 204 local_irq_save(flags);
208 __raw_spin_lock(&wakeup_lock); 205 __raw_spin_lock(&wakeup_lock);
209 __wakeup_reset(tr); 206 __wakeup_reset(tr);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 00dd6485bdd7..d2cdbabb4ead 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
288 * to detect and recover from possible hangs 288 * to detect and recover from possible hangs
289 */ 289 */
290 tracing_reset_online_cpus(tr); 290 tracing_reset_online_cpus(tr);
291 set_graph_array(tr);
291 ret = register_ftrace_graph(&trace_graph_return, 292 ret = register_ftrace_graph(&trace_graph_return,
292 &trace_graph_entry_watchdog); 293 &trace_graph_entry_watchdog);
293 if (ret) { 294 if (ret) {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 6a2a9d484cd6..0f6facb050a1 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = {
186}; 186};
187 187
188static void * 188static void *
189t_next(struct seq_file *m, void *v, loff_t *pos) 189__next(struct seq_file *m, loff_t *pos)
190{ 190{
191 long i; 191 long n = *pos - 1;
192 192
193 (*pos)++; 193 if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
194
195 if (v == SEQ_START_TOKEN)
196 i = 0;
197 else {
198 i = *(long *)v;
199 i++;
200 }
201
202 if (i >= max_stack_trace.nr_entries ||
203 stack_dump_trace[i] == ULONG_MAX)
204 return NULL; 194 return NULL;
205 195
206 m->private = (void *)i; 196 m->private = (void *)n;
207
208 return &m->private; 197 return &m->private;
209} 198}
210 199
211static void *t_start(struct seq_file *m, loff_t *pos) 200static void *
201t_next(struct seq_file *m, void *v, loff_t *pos)
212{ 202{
213 void *t = SEQ_START_TOKEN; 203 (*pos)++;
214 loff_t l = 0; 204 return __next(m, pos);
205}
215 206
207static void *t_start(struct seq_file *m, loff_t *pos)
208{
216 local_irq_disable(); 209 local_irq_disable();
217 __raw_spin_lock(&max_stack_lock); 210 __raw_spin_lock(&max_stack_lock);
218 211
219 if (*pos == 0) 212 if (*pos == 0)
220 return SEQ_START_TOKEN; 213 return SEQ_START_TOKEN;
221 214
222 for (; t && l < *pos; t = t_next(m, t, &l)) 215 return __next(m, pos);
223 ;
224
225 return t;
226} 216}
227 217
228static void t_stop(struct seq_file *m, void *p) 218static void t_stop(struct seq_file *m, void *p)
@@ -234,15 +224,8 @@ static void t_stop(struct seq_file *m, void *p)
234static int trace_lookup_stack(struct seq_file *m, long i) 224static int trace_lookup_stack(struct seq_file *m, long i)
235{ 225{
236 unsigned long addr = stack_dump_trace[i]; 226 unsigned long addr = stack_dump_trace[i];
237#ifdef CONFIG_KALLSYMS
238 char str[KSYM_SYMBOL_LEN];
239
240 sprint_symbol(str, addr);
241 227
242 return seq_printf(m, "%s\n", str); 228 return seq_printf(m, "%pF\n", (void *)addr);
243#else
244 return seq_printf(m, "%p\n", (void*)addr);
245#endif
246} 229}
247 230
248static void print_disabled(struct seq_file *m) 231static void print_disabled(struct seq_file *m)
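A standalone sketch (not kernel code) of the seq_file iteration contract the rewritten t_start()/t_next() above follow: start() derives its position directly from *pos instead of replaying t_next() in a loop, *pos == 0 yields a header token, and both callbacks share one __next() helper. The table and header contents are illustrative only.

#include <stdio.h>

static long table[] = { 10, 20, 30 };
#define NENTRIES (sizeof(table) / sizeof(table[0]))
static char header[] = "depth   address";

static void *__next(long *pos)
{
	long n = *pos - 1;		/* slot 0 is reserved for the header */

	return (n >= 0 && n < (long)NENTRIES) ? &table[n] : NULL;
}

static void *demo_start(long *pos)
{
	return (*pos == 0) ? (void *)header : __next(pos);
}

static void *demo_next(long *pos)
{
	(*pos)++;
	return __next(pos);
}

int main(void)
{
	long pos = 0;
	void *v;

	for (v = demo_start(&pos); v; v = demo_next(&pos)) {
		if (v == (void *)header)
			printf("%s\n", header);
		else
			printf("        %ld\n", *(long *)v);
	}
	return 0;
}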
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index aea321c82fa0..a4bb239eb987 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -49,7 +49,8 @@ static struct dentry *stat_dir;
49 * but it will at least advance closer to the next one 49 * but it will at least advance closer to the next one
50 * to be released. 50 * to be released.
51 */ 51 */
52static struct rb_node *release_next(struct rb_node *node) 52static struct rb_node *release_next(struct tracer_stat *ts,
53 struct rb_node *node)
53{ 54{
54 struct stat_node *snode; 55 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node); 56 struct rb_node *parent = rb_parent(node);
@@ -67,6 +68,8 @@ static struct rb_node *release_next(struct rb_node *node)
67 parent->rb_right = NULL; 68 parent->rb_right = NULL;
68 69
69 snode = container_of(node, struct stat_node, node); 70 snode = container_of(node, struct stat_node, node);
71 if (ts->stat_release)
72 ts->stat_release(snode->stat);
70 kfree(snode); 73 kfree(snode);
71 74
72 return parent; 75 return parent;
@@ -78,7 +81,7 @@ static void __reset_stat_session(struct stat_session *session)
78 struct rb_node *node = session->stat_root.rb_node; 81 struct rb_node *node = session->stat_root.rb_node;
79 82
80 while (node) 83 while (node)
81 node = release_next(node); 84 node = release_next(session->ts, node);
82 85
83 session->stat_root = RB_ROOT; 86 session->stat_root = RB_ROOT;
84} 87}
@@ -200,17 +203,21 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
200{ 203{
201 struct stat_session *session = s->private; 204 struct stat_session *session = s->private;
202 struct rb_node *node; 205 struct rb_node *node;
206 int n = *pos;
203 int i; 207 int i;
204 208
205 /* Prevent from tracer switch or rbtree modification */ 209 /* Prevent from tracer switch or rbtree modification */
206 mutex_lock(&session->stat_mutex); 210 mutex_lock(&session->stat_mutex);
207 211
208 /* If we are in the beginning of the file, print the headers */ 212 /* If we are in the beginning of the file, print the headers */
209 if (!*pos && session->ts->stat_headers) 213 if (session->ts->stat_headers) {
210 return SEQ_START_TOKEN; 214 if (n == 0)
215 return SEQ_START_TOKEN;
216 n--;
217 }
211 218
212 node = rb_first(&session->stat_root); 219 node = rb_first(&session->stat_root);
213 for (i = 0; node && i < *pos; i++) 220 for (i = 0; node && i < n; i++)
214 node = rb_next(node); 221 node = rb_next(node);
215 222
216 return node; 223 return node;
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index f3546a2cd826..8f03914b9a6a 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -18,6 +18,8 @@ struct tracer_stat {
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
19 /* Print a stat entry */ 19 /* Print a stat entry */
20 int (*stat_show)(struct seq_file *s, void *p); 20 int (*stat_show)(struct seq_file *s, void *p);
21 /* Release an entry */
22 void (*stat_release)(void *stat);
21 /* Print the headers of your stat entries */ 23 /* Print the headers of your stat entries */
22 int (*stat_headers)(struct seq_file *s); 24 int (*stat_headers)(struct seq_file *s);
23}; 25};
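A standalone sketch (not kernel code) of the optional-callback pattern the new stat_release hook above enables: the generic walker always frees its own rbtree node, and calls ->stat_release() for the per-entry payload only when the tracer provided one. All names below are hypothetical.

#include <stdio.h>
#include <stdlib.h>

struct tracer_stat_like {
	void (*stat_release)(void *stat);	/* may be NULL */
};

struct node {
	void *stat;
};

static void release_node(struct tracer_stat_like *ts, struct node *n)
{
	if (ts->stat_release)
		ts->stat_release(n->stat);	/* per-entry payload cleanup */
	free(n);				/* walker always frees the node */
}

static void free_stat(void *stat)
{
	free(stat);
}

int main(void)
{
	struct tracer_stat_like ts = { .stat_release = free_stat };
	struct node *n = malloc(sizeof(*n));

	n->stat = malloc(16);
	release_node(&ts, n);
	puts("released");
	return 0;
}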
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e579645ac86..8712ce3c6a0e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,30 +1,18 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h>
2#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/ftrace.h>
5#include <linux/perf_counter.h>
3#include <asm/syscall.h> 6#include <asm/syscall.h>
4 7
5#include "trace_output.h" 8#include "trace_output.h"
6#include "trace.h" 9#include "trace.h"
7 10
8/* Keep a counter of the syscall tracing users */
9static int refcount;
10
11/* Prevent from races on thread flags toggling */
12static DEFINE_MUTEX(syscall_trace_lock); 11static DEFINE_MUTEX(syscall_trace_lock);
13 12static int sys_refcount_enter;
14/* Option to display the parameters types */ 13static int sys_refcount_exit;
15enum { 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16 TRACE_SYSCALLS_OPT_TYPES = 0x1, 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17};
18
19static struct tracer_opt syscalls_opts[] = {
20 { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
21 { }
22};
23
24static struct tracer_flags syscalls_flags = {
25 .val = 0, /* By default: no parameters types */
26 .opts = syscalls_opts
27};
28 16
29enum print_line_t 17enum print_line_t
30print_syscall_enter(struct trace_iterator *iter, int flags) 18print_syscall_enter(struct trace_iterator *iter, int flags)
@@ -35,35 +23,46 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
35 struct syscall_metadata *entry; 23 struct syscall_metadata *entry;
36 int i, ret, syscall; 24 int i, ret, syscall;
37 25
38 trace_assign_type(trace, ent); 26 trace = (typeof(trace))ent;
39
40 syscall = trace->nr; 27 syscall = trace->nr;
41
42 entry = syscall_nr_to_meta(syscall); 28 entry = syscall_nr_to_meta(syscall);
29
43 if (!entry) 30 if (!entry)
44 goto end; 31 goto end;
45 32
33 if (entry->enter_id != ent->type) {
34 WARN_ON_ONCE(1);
35 goto end;
36 }
37
46 ret = trace_seq_printf(s, "%s(", entry->name); 38 ret = trace_seq_printf(s, "%s(", entry->name);
47 if (!ret) 39 if (!ret)
48 return TRACE_TYPE_PARTIAL_LINE; 40 return TRACE_TYPE_PARTIAL_LINE;
49 41
50 for (i = 0; i < entry->nb_args; i++) { 42 for (i = 0; i < entry->nb_args; i++) {
51 /* parameter types */ 43 /* parameter types */
52 if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { 44 if (trace_flags & TRACE_ITER_VERBOSE) {
53 ret = trace_seq_printf(s, "%s ", entry->types[i]); 45 ret = trace_seq_printf(s, "%s ", entry->types[i]);
54 if (!ret) 46 if (!ret)
55 return TRACE_TYPE_PARTIAL_LINE; 47 return TRACE_TYPE_PARTIAL_LINE;
56 } 48 }
57 /* parameter values */ 49 /* parameter values */
58 ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], 50 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
59 trace->args[i], 51 trace->args[i],
60 i == entry->nb_args - 1 ? ")" : ","); 52 i == entry->nb_args - 1 ? "" : ", ");
61 if (!ret) 53 if (!ret)
62 return TRACE_TYPE_PARTIAL_LINE; 54 return TRACE_TYPE_PARTIAL_LINE;
63 } 55 }
64 56
57 ret = trace_seq_putc(s, ')');
58 if (!ret)
59 return TRACE_TYPE_PARTIAL_LINE;
60
65end: 61end:
66 trace_seq_printf(s, "\n"); 62 ret = trace_seq_putc(s, '\n');
63 if (!ret)
64 return TRACE_TYPE_PARTIAL_LINE;
65
67 return TRACE_TYPE_HANDLED; 66 return TRACE_TYPE_HANDLED;
68} 67}
69 68
@@ -77,16 +76,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
77 struct syscall_metadata *entry; 76 struct syscall_metadata *entry;
78 int ret; 77 int ret;
79 78
80 trace_assign_type(trace, ent); 79 trace = (typeof(trace))ent;
81
82 syscall = trace->nr; 80 syscall = trace->nr;
83
84 entry = syscall_nr_to_meta(syscall); 81 entry = syscall_nr_to_meta(syscall);
82
85 if (!entry) { 83 if (!entry) {
86 trace_seq_printf(s, "\n"); 84 trace_seq_printf(s, "\n");
87 return TRACE_TYPE_HANDLED; 85 return TRACE_TYPE_HANDLED;
88 } 86 }
89 87
88 if (entry->exit_id != ent->type) {
89 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED;
91 }
92
90 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 93 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
91 trace->ret); 94 trace->ret);
92 if (!ret) 95 if (!ret)
@@ -95,62 +98,140 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
95 return TRACE_TYPE_HANDLED; 98 return TRACE_TYPE_HANDLED;
96} 99}
97 100
98void start_ftrace_syscalls(void) 101extern char *__bad_type_size(void);
102
103#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name)
107
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
99{ 109{
100 unsigned long flags; 110 int i;
101 struct task_struct *g, *t; 111 int nr;
112 int ret;
113 struct syscall_metadata *entry;
114 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args);
102 116
103 mutex_lock(&syscall_trace_lock); 117 nr = syscall_name_to_nr(call->data);
118 entry = syscall_nr_to_meta(nr);
104 119
105 /* Don't enable the flag on the tasks twice */ 120 if (!entry)
106 if (++refcount != 1) 121 return 0;
107 goto unlock;
108 122
109 arch_init_ftrace_syscalls(); 123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
110 read_lock_irqsave(&tasklist_lock, flags); 124 SYSCALL_FIELD(int, nr));
125 if (!ret)
126 return 0;
111 127
112 do_each_thread(g, t) { 128 for (i = 0; i < entry->nb_args; i++) {
113 set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 129 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
114 } while_each_thread(g, t); 130 entry->args[i]);
131 if (!ret)
132 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
134 sizeof(unsigned long));
135 if (!ret)
136 return 0;
137 offset += sizeof(unsigned long);
138 }
115 139
116 read_unlock_irqrestore(&tasklist_lock, flags); 140 trace_seq_puts(s, "\nprint fmt: \"");
141 for (i = 0; i < entry->nb_args; i++) {
142 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
143 sizeof(unsigned long),
144 i == entry->nb_args - 1 ? "" : ", ");
145 if (!ret)
146 return 0;
147 }
148 trace_seq_putc(s, '"');
117 149
118unlock: 150 for (i = 0; i < entry->nb_args; i++) {
119 mutex_unlock(&syscall_trace_lock); 151 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
152 entry->args[i]);
153 if (!ret)
154 return 0;
155 }
156
157 return trace_seq_putc(s, '\n');
120} 158}
121 159
122void stop_ftrace_syscalls(void) 160int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
123{ 161{
124 unsigned long flags; 162 int ret;
125 struct task_struct *g, *t; 163 struct syscall_trace_exit trace;
126 164
127 mutex_lock(&syscall_trace_lock); 165 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
168 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret));
170 if (!ret)
171 return 0;
128 172
129 /* There are perhaps still some users */ 173 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
130 if (--refcount) 174}
131 goto unlock;
132 175
133 read_lock_irqsave(&tasklist_lock, flags); 176int syscall_enter_define_fields(struct ftrace_event_call *call)
177{
178 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta;
180 int ret;
181 int nr;
182 int i;
183 int offset = offsetof(typeof(trace), args);
184
185 nr = syscall_name_to_nr(call->data);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call);
192 if (ret)
193 return ret;
194
195 for (i = 0; i < meta->nb_args; i++) {
196 ret = trace_define_field(call, meta->types[i],
197 meta->args[i], offset,
198 sizeof(unsigned long), 0,
199 FILTER_OTHER);
200 offset += sizeof(unsigned long);
201 }
134 202
135 do_each_thread(g, t) { 203 return ret;
136 clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 204}
137 } while_each_thread(g, t);
138 205
139 read_unlock_irqrestore(&tasklist_lock, flags); 206int syscall_exit_define_fields(struct ftrace_event_call *call)
207{
208 struct syscall_trace_exit trace;
209 int ret;
140 210
141unlock: 211 ret = trace_define_common_fields(call);
142 mutex_unlock(&syscall_trace_lock); 212 if (ret)
213 return ret;
214
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0,
216 FILTER_OTHER);
217
218 return ret;
143} 219}
144 220
145void ftrace_syscall_enter(struct pt_regs *regs) 221void ftrace_syscall_enter(struct pt_regs *regs, long id)
146{ 222{
147 struct syscall_trace_enter *entry; 223 struct syscall_trace_enter *entry;
148 struct syscall_metadata *sys_data; 224 struct syscall_metadata *sys_data;
149 struct ring_buffer_event *event; 225 struct ring_buffer_event *event;
226 struct ring_buffer *buffer;
150 int size; 227 int size;
151 int syscall_nr; 228 int syscall_nr;
152 229
153 syscall_nr = syscall_get_nr(current, regs); 230 syscall_nr = syscall_get_nr(current, regs);
231 if (syscall_nr < 0)
232 return;
233 if (!test_bit(syscall_nr, enabled_enter_syscalls))
234 return;
154 235
155 sys_data = syscall_nr_to_meta(syscall_nr); 236 sys_data = syscall_nr_to_meta(syscall_nr);
156 if (!sys_data) 237 if (!sys_data)
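A minimal userspace sketch (not kernel code) of the enable scheme the syscall handlers above switch to: a refcount so the single global tracepoint handler is registered only while at least one event is enabled, plus a per-syscall bitmap consulted on the fast path before any buffer work. Names and sizes below are illustrative.

#include <stdio.h>
#include <stdint.h>

static uint64_t enabled_mask;		/* one bit per syscall; 64 is enough here */
static int refcount;

static void enable_syscall(int nr)
{
	if (!refcount++)
		puts("registering the single global tracepoint handler");
	enabled_mask |= UINT64_C(1) << nr;
}

static int syscall_enabled(int nr)
{
	return !!(enabled_mask & (UINT64_C(1) << nr));	/* fast-path check */
}

int main(void)
{
	enable_syscall(3);
	printf("syscall 3: %d, syscall 4: %d\n",
	       syscall_enabled(3), syscall_enabled(4));
	return 0;
}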
@@ -158,8 +239,8 @@ void ftrace_syscall_enter(struct pt_regs *regs)
158 239
159 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
160 241
161 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, 242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
162 0, 0); 243 size, 0, 0);
163 if (!event) 244 if (!event)
164 return; 245 return;
165 246
@@ -167,24 +248,30 @@ void ftrace_syscall_enter(struct pt_regs *regs)
167 entry->nr = syscall_nr; 248 entry->nr = syscall_nr;
168 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 249 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
169 250
170 trace_current_buffer_unlock_commit(event, 0, 0); 251 if (!filter_current_check_discard(buffer, sys_data->enter_event,
171 trace_wake_up(); 252 entry, event))
253 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
172} 254}
173 255
174void ftrace_syscall_exit(struct pt_regs *regs) 256void ftrace_syscall_exit(struct pt_regs *regs, long ret)
175{ 257{
176 struct syscall_trace_exit *entry; 258 struct syscall_trace_exit *entry;
177 struct syscall_metadata *sys_data; 259 struct syscall_metadata *sys_data;
178 struct ring_buffer_event *event; 260 struct ring_buffer_event *event;
261 struct ring_buffer *buffer;
179 int syscall_nr; 262 int syscall_nr;
180 263
181 syscall_nr = syscall_get_nr(current, regs); 264 syscall_nr = syscall_get_nr(current, regs);
265 if (syscall_nr < 0)
266 return;
267 if (!test_bit(syscall_nr, enabled_exit_syscalls))
268 return;
182 269
183 sys_data = syscall_nr_to_meta(syscall_nr); 270 sys_data = syscall_nr_to_meta(syscall_nr);
184 if (!sys_data) 271 if (!sys_data)
185 return; 272 return;
186 273
187 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, 274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
188 sizeof(*entry), 0, 0); 275 sizeof(*entry), 0, 0);
189 if (!event) 276 if (!event)
190 return; 277 return;
@@ -193,58 +280,244 @@ void ftrace_syscall_exit(struct pt_regs *regs)
193 entry->nr = syscall_nr; 280 entry->nr = syscall_nr;
194 entry->ret = syscall_get_return_value(current, regs); 281 entry->ret = syscall_get_return_value(current, regs);
195 282
196 trace_current_buffer_unlock_commit(event, 0, 0); 283 if (!filter_current_check_discard(buffer, sys_data->exit_event,
197 trace_wake_up(); 284 entry, event))
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
198} 286}
199 287
200static int init_syscall_tracer(struct trace_array *tr) 288int reg_event_syscall_enter(void *ptr)
201{ 289{
202 start_ftrace_syscalls(); 290 int ret = 0;
291 int num;
292 char *name;
293
294 name = (char *)ptr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock);
299 if (!sys_refcount_enter)
300 ret = register_trace_sys_enter(ftrace_syscall_enter);
301 if (ret) {
302 pr_info("event trace: Could not activate"
303 "syscall entry trace point");
304 } else {
305 set_bit(num, enabled_enter_syscalls);
306 sys_refcount_enter++;
307 }
308 mutex_unlock(&syscall_trace_lock);
309 return ret;
310}
311
312void unreg_event_syscall_enter(void *ptr)
313{
314 int num;
315 char *name;
203 316
204 return 0; 317 name = (char *)ptr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls)
320 return;
321 mutex_lock(&syscall_trace_lock);
322 sys_refcount_enter--;
323 clear_bit(num, enabled_enter_syscalls);
324 if (!sys_refcount_enter)
325 unregister_trace_sys_enter(ftrace_syscall_enter);
326 mutex_unlock(&syscall_trace_lock);
205} 327}
206 328
207static void reset_syscall_tracer(struct trace_array *tr) 329int reg_event_syscall_exit(void *ptr)
208{ 330{
209 stop_ftrace_syscalls(); 331 int ret = 0;
210 tracing_reset_online_cpus(tr); 332 int num;
333 char *name;
334
335 name = (char *)ptr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock);
340 if (!sys_refcount_exit)
341 ret = register_trace_sys_exit(ftrace_syscall_exit);
342 if (ret) {
343 pr_info("event trace: Could not activate"
344 "syscall exit trace point");
345 } else {
346 set_bit(num, enabled_exit_syscalls);
347 sys_refcount_exit++;
348 }
349 mutex_unlock(&syscall_trace_lock);
350 return ret;
211} 351}
212 352
213static struct trace_event syscall_enter_event = { 353void unreg_event_syscall_exit(void *ptr)
214 .type = TRACE_SYSCALL_ENTER, 354{
215 .trace = print_syscall_enter, 355 int num;
216}; 356 char *name;
357
358 name = (char *)ptr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls)
361 return;
362 mutex_lock(&syscall_trace_lock);
363 sys_refcount_exit--;
364 clear_bit(num, enabled_exit_syscalls);
365 if (!sys_refcount_exit)
366 unregister_trace_sys_exit(ftrace_syscall_exit);
367 mutex_unlock(&syscall_trace_lock);
368}
217 369
218static struct trace_event syscall_exit_event = { 370struct trace_event event_syscall_enter = {
219 .type = TRACE_SYSCALL_EXIT, 371 .trace = print_syscall_enter,
220 .trace = print_syscall_exit,
221}; 372};
222 373
223static struct tracer syscall_tracer __read_mostly = { 374struct trace_event event_syscall_exit = {
224 .name = "syscall", 375 .trace = print_syscall_exit,
225 .init = init_syscall_tracer,
226 .reset = reset_syscall_tracer,
227 .flags = &syscalls_flags,
228}; 376};
229 377
230__init int register_ftrace_syscalls(void) 378#ifdef CONFIG_EVENT_PROFILE
379
380static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
381static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
382static int sys_prof_refcount_enter;
383static int sys_prof_refcount_exit;
384
385static void prof_syscall_enter(struct pt_regs *regs, long id)
231{ 386{
232 int ret; 387 struct syscall_trace_enter *rec;
388 struct syscall_metadata *sys_data;
389 int syscall_nr;
390 int size;
233 391
234 ret = register_ftrace_event(&syscall_enter_event); 392 syscall_nr = syscall_get_nr(current, regs);
235 if (!ret) { 393 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
236 printk(KERN_WARNING "event %d failed to register\n", 394 return;
237 syscall_enter_event.type); 395
238 WARN_ON_ONCE(1); 396 sys_data = syscall_nr_to_meta(syscall_nr);
397 if (!sys_data)
398 return;
399
400 /* get the size after alignment with the u32 buffer size field */
401 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
402 size = ALIGN(size + sizeof(u32), sizeof(u64));
403 size -= sizeof(u32);
404
405 do {
406 char raw_data[size];
407
408 /* zero the dead bytes from alignment so we don't leak stack data to userspace */
409 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
410
411 rec = (struct syscall_trace_enter *) raw_data;
412 tracing_generic_entry_update(&rec->ent, 0, 0);
413 rec->ent.type = sys_data->enter_id;
414 rec->nr = syscall_nr;
415 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
416 (unsigned long *)&rec->args);
417 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
418 } while(0);
419}
420
421int reg_prof_syscall_enter(char *name)
422{
423 int ret = 0;
424 int num;
425
426 num = syscall_name_to_nr(name);
427 if (num < 0 || num >= NR_syscalls)
428 return -ENOSYS;
429
430 mutex_lock(&syscall_trace_lock);
431 if (!sys_prof_refcount_enter)
432 ret = register_trace_sys_enter(prof_syscall_enter);
433 if (ret) {
434 pr_info("event trace: Could not activate"
435 "syscall entry trace point");
436 } else {
437 set_bit(num, enabled_prof_enter_syscalls);
438 sys_prof_refcount_enter++;
239 } 439 }
440 mutex_unlock(&syscall_trace_lock);
441 return ret;
442}
240 443
241 ret = register_ftrace_event(&syscall_exit_event); 444void unreg_prof_syscall_enter(char *name)
242 if (!ret) { 445{
243 printk(KERN_WARNING "event %d failed to register\n", 446 int num;
244 syscall_exit_event.type); 447
245 WARN_ON_ONCE(1); 448 num = syscall_name_to_nr(name);
449 if (num < 0 || num >= NR_syscalls)
450 return;
451
452 mutex_lock(&syscall_trace_lock);
453 sys_prof_refcount_enter--;
454 clear_bit(num, enabled_prof_enter_syscalls);
455 if (!sys_prof_refcount_enter)
456 unregister_trace_sys_enter(prof_syscall_enter);
457 mutex_unlock(&syscall_trace_lock);
458}
459
460static void prof_syscall_exit(struct pt_regs *regs, long ret)
461{
462 struct syscall_metadata *sys_data;
463 struct syscall_trace_exit rec;
464 int syscall_nr;
465
466 syscall_nr = syscall_get_nr(current, regs);
467 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
468 return;
469
470 sys_data = syscall_nr_to_meta(syscall_nr);
471 if (!sys_data)
472 return;
473
474 tracing_generic_entry_update(&rec.ent, 0, 0);
475 rec.ent.type = sys_data->exit_id;
476 rec.nr = syscall_nr;
477 rec.ret = syscall_get_return_value(current, regs);
478
479 perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
480}
481
482int reg_prof_syscall_exit(char *name)
483{
484 int ret = 0;
485 int num;
486
487 num = syscall_name_to_nr(name);
488 if (num < 0 || num >= NR_syscalls)
489 return -ENOSYS;
490
491 mutex_lock(&syscall_trace_lock);
492 if (!sys_prof_refcount_exit)
493 ret = register_trace_sys_exit(prof_syscall_exit);
494 if (ret) {
495 pr_info("event trace: Could not activate"
496 "syscall entry trace point");
497 } else {
498 set_bit(num, enabled_prof_exit_syscalls);
499 sys_prof_refcount_exit++;
246 } 500 }
501 mutex_unlock(&syscall_trace_lock);
502 return ret;
503}
247 504
248 return register_tracer(&syscall_tracer); 505void unreg_prof_syscall_exit(char *name)
506{
507 int num;
508
509 num = syscall_name_to_nr(name);
510 if (num < 0 || num >= NR_syscalls)
511 return;
512
513 mutex_lock(&syscall_trace_lock);
514 sys_prof_refcount_exit--;
515 clear_bit(num, enabled_prof_exit_syscalls);
516 if (!sys_prof_refcount_exit)
517 unregister_trace_sys_exit(prof_syscall_exit);
518 mutex_unlock(&syscall_trace_lock);
249} 519}
250device_initcall(register_ftrace_syscalls); 520
521#endif
522
523
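The reg/unreg helpers above (both the ftrace and the profile variants) share one shape: a per-syscall enable bitmap plus a global refcount, where the first event to be enabled registers the single shared tracepoint probe and the last one to be disabled unregisters it. A rough user-space sketch of that pattern, with hypothetical names and the syscall_trace_lock mutex omitted (not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define NR_EVENTS 64

static bool event_enabled[NR_EVENTS]; /* stands in for enabled_enter_syscalls */
static int  probe_refcount;           /* stands in for sys_refcount_enter */
static bool probe_registered;         /* stands in for register_trace_sys_enter() */

static int reg_event(int num)
{
    if (num < 0 || num >= NR_EVENTS)
        return -1;
    if (!probe_refcount)
        probe_registered = true;  /* first user registers the shared probe */
    event_enabled[num] = true;
    probe_refcount++;
    return 0;
}

static void unreg_event(int num)
{
    if (num < 0 || num >= NR_EVENTS)
        return;
    probe_refcount--;
    event_enabled[num] = false;
    if (!probe_refcount)
        probe_registered = false; /* last user unregisters it again */
}

int main(void)
{
    reg_event(3);
    reg_event(7);
    printf("event 7 enabled: %d\n", event_enabled[7]);   /* 1 */
    unreg_event(3);
    printf("probe registered: %d\n", probe_registered);  /* 1: event 7 still needs it */
    unreg_event(7);
    printf("probe registered: %d\n", probe_registered);  /* 0: nobody left */
    return 0;
}

The probe itself then only has to test the bit for the current syscall number, as ftrace_syscall_enter() and prof_syscall_enter() do above.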
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 97fcea4acce1..40cafb07dffd 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/kref.h>
12#include "trace_stat.h" 13#include "trace_stat.h"
13#include "trace.h" 14#include "trace.h"
14 15
@@ -16,6 +17,7 @@
16/* A cpu workqueue thread */ 17/* A cpu workqueue thread */
17struct cpu_workqueue_stats { 18struct cpu_workqueue_stats {
18 struct list_head list; 19 struct list_head list;
20 struct kref kref;
19 int cpu; 21 int cpu;
20 pid_t pid; 22 pid_t pid;
21/* Can be inserted from interrupt or user context, need to be atomic */ 23/* Can be inserted from interrupt or user context, need to be atomic */
@@ -39,6 +41,11 @@ struct workqueue_global_stats {
39static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); 41static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
40#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) 42#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
41 43
44static void cpu_workqueue_stat_free(struct kref *kref)
45{
46 kfree(container_of(kref, struct cpu_workqueue_stats, kref));
47}
48
42/* Insertion of a work */ 49/* Insertion of a work */
43static void 50static void
44probe_workqueue_insertion(struct task_struct *wq_thread, 51probe_workqueue_insertion(struct task_struct *wq_thread,
@@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
96 return; 103 return;
97 } 104 }
98 INIT_LIST_HEAD(&cws->list); 105 INIT_LIST_HEAD(&cws->list);
106 kref_init(&cws->kref);
99 cws->cpu = cpu; 107 cws->cpu = cpu;
100
101 cws->pid = wq_thread->pid; 108 cws->pid = wq_thread->pid;
102 109
103 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 110 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
118 list) { 125 list) {
119 if (node->pid == wq_thread->pid) { 126 if (node->pid == wq_thread->pid) {
120 list_del(&node->list); 127 list_del(&node->list);
121 kfree(node); 128 kref_put(&node->kref, cpu_workqueue_stat_free);
122 goto found; 129 goto found;
123 } 130 }
124 } 131 }
@@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
137 144
138 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 145 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
139 146
140 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) 147 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
141 ret = list_entry(workqueue_cpu_stat(cpu)->list.next, 148 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
142 struct cpu_workqueue_stats, list); 149 struct cpu_workqueue_stats, list);
150 kref_get(&ret->kref);
151 }
143 152
144 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 153 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
145 154
@@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace)
162static void *workqueue_stat_next(void *prev, int idx) 171static void *workqueue_stat_next(void *prev, int idx)
163{ 172{
164 struct cpu_workqueue_stats *prev_cws = prev; 173 struct cpu_workqueue_stats *prev_cws = prev;
174 struct cpu_workqueue_stats *ret;
165 int cpu = prev_cws->cpu; 175 int cpu = prev_cws->cpu;
166 unsigned long flags; 176 unsigned long flags;
167 void *ret = NULL;
168 177
169 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 178 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
170 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { 179 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
@@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx)
175 return NULL; 184 return NULL;
176 } while (!(ret = workqueue_stat_start_cpu(cpu))); 185 } while (!(ret = workqueue_stat_start_cpu(cpu)));
177 return ret; 186 return ret;
187 } else {
188 ret = list_entry(prev_cws->list.next,
189 struct cpu_workqueue_stats, list);
190 kref_get(&ret->kref);
178 } 191 }
179 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 192 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
180 193
181 return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, 194 return ret;
182 list);
183} 195}
184 196
185static int workqueue_stat_show(struct seq_file *s, void *p) 197static int workqueue_stat_show(struct seq_file *s, void *p)
@@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
203 return 0; 215 return 0;
204} 216}
205 217
218static void workqueue_stat_release(void *stat)
219{
220 struct cpu_workqueue_stats *node = stat;
221
222 kref_put(&node->kref, cpu_workqueue_stat_free);
223}
224
206static int workqueue_stat_headers(struct seq_file *s) 225static int workqueue_stat_headers(struct seq_file *s)
207{ 226{
208 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); 227 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
@@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = {
215 .stat_start = workqueue_stat_start, 234 .stat_start = workqueue_stat_start,
216 .stat_next = workqueue_stat_next, 235 .stat_next = workqueue_stat_next,
217 .stat_show = workqueue_stat_show, 236 .stat_show = workqueue_stat_show,
237 .stat_release = workqueue_stat_release,
218 .stat_headers = workqueue_stat_headers 238 .stat_headers = workqueue_stat_headers
219}; 239};
220 240
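The trace_workqueue.c hunk replaces the bare kfree() on workqueue destruction with kref reference counting, so an entry currently handed out by the stat iterator is only freed once stat_release() drops the last reference. A rough user-space sketch of that lifecycle, with hypothetical *_sketch helpers standing in for <linux/kref.h> and the per-cpu spinlock omitted:

#include <stdio.h>
#include <stdlib.h>

struct kref_sketch { int refcount; };

static void kref_init_(struct kref_sketch *k) { k->refcount = 1; }
static void kref_get_(struct kref_sketch *k)  { k->refcount++; }
static void kref_put_(struct kref_sketch *k, void (*release)(struct kref_sketch *))
{
    if (--k->refcount == 0)
        release(k);
}

struct cpu_stats {
    struct kref_sketch kref; /* first member, so the cast below is safe */
    int cpu;
};

static void cpu_stats_free(struct kref_sketch *kref)
{
    struct cpu_stats *s = (struct cpu_stats *)kref; /* container_of() in the kernel */

    printf("freeing stats for cpu %d\n", s->cpu);
    free(s);
}

int main(void)
{
    struct cpu_stats *s = malloc(sizeof(*s));

    if (!s)
        return 1;
    kref_init_(&s->kref);                /* creation: the list holds one reference */
    s->cpu = 0;

    kref_get_(&s->kref);                 /* the stat iterator takes its own reference */
    kref_put_(&s->kref, cpu_stats_free); /* destruction probe drops the list reference */
    kref_put_(&s->kref, cpu_stats_free); /* stat_release drops the last one: freed here */
    return 0;
}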
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1ef5d3a601c7..9489a0a9b1be 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,6 +24,7 @@
24#include <linux/tracepoint.h> 24#include <linux/tracepoint.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h>
27 28
28extern struct tracepoint __start___tracepoints[]; 29extern struct tracepoint __start___tracepoints[];
29extern struct tracepoint __stop___tracepoints[]; 30extern struct tracepoint __stop___tracepoints[];
@@ -242,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
242{ 243{
243 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 244 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
244 245
246 if (elem->regfunc && !elem->state && active)
247 elem->regfunc();
248 else if (elem->unregfunc && elem->state && !active)
249 elem->unregfunc();
250
245 /* 251 /*
246 * rcu_assign_pointer has a smp_wmb() which makes sure that the new 252 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
247 * probe callbacks array is consistent before setting a pointer to it. 253 * probe callbacks array is consistent before setting a pointer to it.
@@ -261,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
261 */ 267 */
262static void disable_tracepoint(struct tracepoint *elem) 268static void disable_tracepoint(struct tracepoint *elem)
263{ 269{
270 if (elem->unregfunc && elem->state)
271 elem->unregfunc();
272
264 elem->state = 0; 273 elem->state = 0;
265 rcu_assign_pointer(elem->funcs, NULL); 274 rcu_assign_pointer(elem->funcs, NULL);
266} 275}
@@ -554,9 +563,6 @@ int tracepoint_module_notify(struct notifier_block *self,
554 563
555 switch (val) { 564 switch (val) {
556 case MODULE_STATE_COMING: 565 case MODULE_STATE_COMING:
557 tracepoint_update_probe_range(mod->tracepoints,
558 mod->tracepoints + mod->num_tracepoints);
559 break;
560 case MODULE_STATE_GOING: 566 case MODULE_STATE_GOING:
561 tracepoint_update_probe_range(mod->tracepoints, 567 tracepoint_update_probe_range(mod->tracepoints,
562 mod->tracepoints + mod->num_tracepoints); 568 mod->tracepoints + mod->num_tracepoints);
@@ -577,3 +583,41 @@ static int init_tracepoints(void)
577__initcall(init_tracepoints); 583__initcall(init_tracepoints);
578 584
579#endif /* CONFIG_MODULES */ 585#endif /* CONFIG_MODULES */
586
587#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
588
589/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
590static int sys_tracepoint_refcount;
591
592void syscall_regfunc(void)
593{
594 unsigned long flags;
595 struct task_struct *g, *t;
596
597 if (!sys_tracepoint_refcount) {
598 read_lock_irqsave(&tasklist_lock, flags);
599 do_each_thread(g, t) {
600 /* Skip kernel threads. */
601 if (t->mm)
602 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
603 } while_each_thread(g, t);
604 read_unlock_irqrestore(&tasklist_lock, flags);
605 }
606 sys_tracepoint_refcount++;
607}
608
609void syscall_unregfunc(void)
610{
611 unsigned long flags;
612 struct task_struct *g, *t;
613
614 sys_tracepoint_refcount--;
615 if (!sys_tracepoint_refcount) {
616 read_lock_irqsave(&tasklist_lock, flags);
617 do_each_thread(g, t) {
618 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
619 } while_each_thread(g, t);
620 read_unlock_irqrestore(&tasklist_lock, flags);
621 }
622}
623#endif
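The tracepoint.c change gives a tracepoint optional regfunc/unregfunc hooks that run only on the inactive-to-active and active-to-inactive transitions; the syscall hooks above use them to set or clear TIF_SYSCALL_TRACEPOINT on every user task. A minimal sketch of just the transition logic, with hypothetical names (no RCU, no probe arrays):

#include <stdbool.h>
#include <stdio.h>

struct tp_sketch {
    bool state;              /* currently active? */
    void (*regfunc)(void);   /* optional hook: runs on the 0 -> 1 transition */
    void (*unregfunc)(void); /* optional hook: runs on the 1 -> 0 transition */
};

static void tp_set_active(struct tp_sketch *tp, bool active)
{
    if (tp->regfunc && !tp->state && active)
        tp->regfunc();       /* e.g. syscall_regfunc(): flag every user task */
    else if (tp->unregfunc && tp->state && !active)
        tp->unregfunc();     /* e.g. syscall_unregfunc(): clear the flag again */
    tp->state = active;
}

static void on_first_probe(void) { puts("first probe attached"); }
static void on_last_probe(void)  { puts("last probe detached"); }

int main(void)
{
    struct tp_sketch tp = { false, on_first_probe, on_last_probe };

    tp_set_active(&tp, true);  /* prints "first probe attached" */
    tp_set_active(&tp, true);  /* no transition, hooks stay quiet */
    tp_set_active(&tp, false); /* prints "last probe detached" */
    return 0;
}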
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d8818..addfe2df93b1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
317 if (cwq->wq->freezeable) 317 if (cwq->wq->freezeable)
318 set_freezable(); 318 set_freezable();
319 319
320 set_user_nice(current, -5);
321
322 for (;;) { 320 for (;;) {
323 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 321 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
324 if (!freezing(current) && 322 if (!freezing(current) &&
@@ -600,7 +598,12 @@ static struct workqueue_struct *keventd_wq __read_mostly;
600 * schedule_work - put work task in global workqueue 598 * schedule_work - put work task in global workqueue
601 * @work: job to be done 599 * @work: job to be done
602 * 600 *
603 * This puts a job in the kernel-global workqueue. 601 * Returns zero if @work was already on the kernel-global workqueue and
602 * non-zero otherwise.
603 *
604 * This puts a job in the kernel-global workqueue if it was not already
605 * queued and leaves it in the same position on the kernel-global
606 * workqueue otherwise.
604 */ 607 */
605int schedule_work(struct work_struct *work) 608int schedule_work(struct work_struct *work)
606{ 609{
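The workqueue.c comment now spells out that schedule_work() reports whether the work item was freshly queued. A hedged, skeletal module example (not part of this patch) showing a caller acting on that return value:

#include <linux/module.h>
#include <linux/workqueue.h>

static void demo_fn(struct work_struct *work)
{
    pr_info("demo work ran\n");
}

static DECLARE_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
    /* non-zero: freshly queued; zero: it was already pending */
    if (schedule_work(&demo_work))
        pr_info("demo work queued\n");
    else
        pr_info("demo work was already pending\n");
    return 0;
}

static void __exit demo_exit(void)
{
    flush_scheduled_work();
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");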
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index fbb87cf138c5..7dbd5d9c29a4 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -740,7 +740,7 @@ config RCU_TORTURE_TEST_RUNNABLE
740 740
741config RCU_CPU_STALL_DETECTOR 741config RCU_CPU_STALL_DETECTOR
742 bool "Check for stalled CPUs delaying RCU grace periods" 742 bool "Check for stalled CPUs delaying RCU grace periods"
743 depends on CLASSIC_RCU || TREE_RCU 743 depends on TREE_RCU || TREE_PREEMPT_RCU
744 default n 744 default n
745 help 745 help
746 This option causes RCU to printk information on which 746 This option causes RCU to printk information on which
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index bffe6d7ef9d9..ac25cd28e807 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -114,46 +114,11 @@ setup_io_tlb_npages(char *str)
114__setup("swiotlb=", setup_io_tlb_npages); 114__setup("swiotlb=", setup_io_tlb_npages);
115/* make io_tlb_overflow tunable too? */ 115/* make io_tlb_overflow tunable too? */
116 116
117void * __weak __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) 117/* Note that this doesn't work with highmem page */
118{
119 return alloc_bootmem_low_pages(size);
120}
121
122void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs)
123{
124 return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
125}
126
127dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
128{
129 return paddr;
130}
131
132phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
133{
134 return baddr;
135}
136
137static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, 118static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
138 volatile void *address) 119 volatile void *address)
139{ 120{
140 return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); 121 return phys_to_dma(hwdev, virt_to_phys(address));
141}
142
143void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address)
144{
145 return phys_to_virt(swiotlb_bus_to_phys(hwdev, address));
146}
147
148int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev,
149 dma_addr_t addr, size_t size)
150{
151 return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
152}
153
154int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
155{
156 return 0;
157} 122}
158 123
159static void swiotlb_print_info(unsigned long bytes) 124static void swiotlb_print_info(unsigned long bytes)
@@ -189,7 +154,7 @@ swiotlb_init_with_default_size(size_t default_size)
189 /* 154 /*
190 * Get IO TLB memory from the low pages 155 * Get IO TLB memory from the low pages
191 */ 156 */
192 io_tlb_start = swiotlb_alloc_boot(bytes, io_tlb_nslabs); 157 io_tlb_start = alloc_bootmem_low_pages(bytes);
193 if (!io_tlb_start) 158 if (!io_tlb_start)
194 panic("Cannot allocate SWIOTLB buffer"); 159 panic("Cannot allocate SWIOTLB buffer");
195 io_tlb_end = io_tlb_start + bytes; 160 io_tlb_end = io_tlb_start + bytes;
@@ -245,7 +210,8 @@ swiotlb_late_init_with_default_size(size_t default_size)
245 bytes = io_tlb_nslabs << IO_TLB_SHIFT; 210 bytes = io_tlb_nslabs << IO_TLB_SHIFT;
246 211
247 while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { 212 while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
248 io_tlb_start = swiotlb_alloc(order, io_tlb_nslabs); 213 io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
214 order);
249 if (io_tlb_start) 215 if (io_tlb_start)
250 break; 216 break;
251 order--; 217 order--;
@@ -315,20 +281,10 @@ cleanup1:
315 return -ENOMEM; 281 return -ENOMEM;
316} 282}
317 283
318static inline int 284static int is_swiotlb_buffer(phys_addr_t paddr)
319address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size)
320{ 285{
321 return swiotlb_arch_address_needs_mapping(hwdev, addr, size); 286 return paddr >= virt_to_phys(io_tlb_start) &&
322} 287 paddr < virt_to_phys(io_tlb_end);
323
324static inline int range_needs_mapping(phys_addr_t paddr, size_t size)
325{
326 return swiotlb_force || swiotlb_arch_range_needs_mapping(paddr, size);
327}
328
329static int is_swiotlb_buffer(char *addr)
330{
331 return addr >= io_tlb_start && addr < io_tlb_end;
332} 288}
333 289
334/* 290/*
@@ -561,9 +517,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
561 dma_mask = hwdev->coherent_dma_mask; 517 dma_mask = hwdev->coherent_dma_mask;
562 518
563 ret = (void *)__get_free_pages(flags, order); 519 ret = (void *)__get_free_pages(flags, order);
564 if (ret && 520 if (ret && swiotlb_virt_to_bus(hwdev, ret) + size > dma_mask) {
565 !is_buffer_dma_capable(dma_mask, swiotlb_virt_to_bus(hwdev, ret),
566 size)) {
567 /* 521 /*
568 * The allocated memory isn't reachable by the device. 522 * The allocated memory isn't reachable by the device.
569 */ 523 */
@@ -585,7 +539,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
585 dev_addr = swiotlb_virt_to_bus(hwdev, ret); 539 dev_addr = swiotlb_virt_to_bus(hwdev, ret);
586 540
587 /* Confirm address can be DMA'd by device */ 541 /* Confirm address can be DMA'd by device */
588 if (!is_buffer_dma_capable(dma_mask, dev_addr, size)) { 542 if (dev_addr + size > dma_mask) {
589 printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", 543 printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
590 (unsigned long long)dma_mask, 544 (unsigned long long)dma_mask,
591 (unsigned long long)dev_addr); 545 (unsigned long long)dev_addr);
@@ -601,11 +555,13 @@ EXPORT_SYMBOL(swiotlb_alloc_coherent);
601 555
602void 556void
603swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, 557swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
604 dma_addr_t dma_handle) 558 dma_addr_t dev_addr)
605{ 559{
560 phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
561
606 WARN_ON(irqs_disabled()); 562 WARN_ON(irqs_disabled());
607 if (!is_swiotlb_buffer(vaddr)) 563 if (!is_swiotlb_buffer(paddr))
608 free_pages((unsigned long) vaddr, get_order(size)); 564 free_pages((unsigned long)vaddr, get_order(size));
609 else 565 else
610 /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ 566 /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
611 do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); 567 do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
@@ -625,12 +581,15 @@ swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
625 printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at " 581 printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at "
626 "device %s\n", size, dev ? dev_name(dev) : "?"); 582 "device %s\n", size, dev ? dev_name(dev) : "?");
627 583
628 if (size > io_tlb_overflow && do_panic) { 584 if (size <= io_tlb_overflow || !do_panic)
629 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 585 return;
630 panic("DMA: Memory would be corrupted\n"); 586
631 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) 587 if (dir == DMA_BIDIRECTIONAL)
632 panic("DMA: Random memory would be DMAed\n"); 588 panic("DMA: Random memory could be DMA accessed\n");
633 } 589 if (dir == DMA_FROM_DEVICE)
590 panic("DMA: Random memory could be DMA written\n");
591 if (dir == DMA_TO_DEVICE)
592 panic("DMA: Random memory could be DMA read\n");
634} 593}
635 594
636/* 595/*
@@ -646,7 +605,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
646 struct dma_attrs *attrs) 605 struct dma_attrs *attrs)
647{ 606{
648 phys_addr_t phys = page_to_phys(page) + offset; 607 phys_addr_t phys = page_to_phys(page) + offset;
649 dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys); 608 dma_addr_t dev_addr = phys_to_dma(dev, phys);
650 void *map; 609 void *map;
651 610
652 BUG_ON(dir == DMA_NONE); 611 BUG_ON(dir == DMA_NONE);
@@ -655,8 +614,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
655 * we can safely return the device addr and not worry about bounce 614 * we can safely return the device addr and not worry about bounce
656 * buffering it. 615 * buffering it.
657 */ 616 */
658 if (!address_needs_mapping(dev, dev_addr, size) && 617 if (dma_capable(dev, dev_addr, size) && !swiotlb_force)
659 !range_needs_mapping(phys, size))
660 return dev_addr; 618 return dev_addr;
661 619
662 /* 620 /*
@@ -673,7 +631,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
673 /* 631 /*
674 * Ensure that the address returned is DMA'ble 632 * Ensure that the address returned is DMA'ble
675 */ 633 */
676 if (address_needs_mapping(dev, dev_addr, size)) 634 if (!dma_capable(dev, dev_addr, size))
677 panic("map_single: bounce buffer is not DMA'ble"); 635 panic("map_single: bounce buffer is not DMA'ble");
678 636
679 return dev_addr; 637 return dev_addr;
@@ -691,19 +649,25 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
691static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, 649static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
692 size_t size, int dir) 650 size_t size, int dir)
693{ 651{
694 char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); 652 phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
695 653
696 BUG_ON(dir == DMA_NONE); 654 BUG_ON(dir == DMA_NONE);
697 655
698 if (is_swiotlb_buffer(dma_addr)) { 656 if (is_swiotlb_buffer(paddr)) {
699 do_unmap_single(hwdev, dma_addr, size, dir); 657 do_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
700 return; 658 return;
701 } 659 }
702 660
703 if (dir != DMA_FROM_DEVICE) 661 if (dir != DMA_FROM_DEVICE)
704 return; 662 return;
705 663
706 dma_mark_clean(dma_addr, size); 664 /*
 665 * phys_to_virt doesn't work with highmem pages, but we could
 666 * call dma_mark_clean() with a highmem page here. However, we
 667 * are fine since dma_mark_clean() is a no-op on POWERPC. We can
668 * make dma_mark_clean() take a physical address if necessary.
669 */
670 dma_mark_clean(phys_to_virt(paddr), size);
707} 671}
708 672
709void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, 673void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
@@ -728,19 +692,19 @@ static void
728swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, 692swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
729 size_t size, int dir, int target) 693 size_t size, int dir, int target)
730{ 694{
731 char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); 695 phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
732 696
733 BUG_ON(dir == DMA_NONE); 697 BUG_ON(dir == DMA_NONE);
734 698
735 if (is_swiotlb_buffer(dma_addr)) { 699 if (is_swiotlb_buffer(paddr)) {
736 sync_single(hwdev, dma_addr, size, dir, target); 700 sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
737 return; 701 return;
738 } 702 }
739 703
740 if (dir != DMA_FROM_DEVICE) 704 if (dir != DMA_FROM_DEVICE)
741 return; 705 return;
742 706
743 dma_mark_clean(dma_addr, size); 707 dma_mark_clean(phys_to_virt(paddr), size);
744} 708}
745 709
746void 710void
@@ -817,10 +781,10 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
817 781
818 for_each_sg(sgl, sg, nelems, i) { 782 for_each_sg(sgl, sg, nelems, i) {
819 phys_addr_t paddr = sg_phys(sg); 783 phys_addr_t paddr = sg_phys(sg);
820 dma_addr_t dev_addr = swiotlb_phys_to_bus(hwdev, paddr); 784 dma_addr_t dev_addr = phys_to_dma(hwdev, paddr);
821 785
822 if (range_needs_mapping(paddr, sg->length) || 786 if (swiotlb_force ||
823 address_needs_mapping(hwdev, dev_addr, sg->length)) { 787 !dma_capable(hwdev, dev_addr, sg->length)) {
824 void *map = map_single(hwdev, sg_phys(sg), 788 void *map = map_single(hwdev, sg_phys(sg),
825 sg->length, dir); 789 sg->length, dir);
826 if (!map) { 790 if (!map) {
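The swiotlb rework drops the weak phys/bus translation hooks and reduces the mapping decision to: translate with phys_to_dma(), use the address directly if dma_capable() says the whole buffer fits under the device's DMA mask and swiotlb_force is not set, and bounce otherwise. A user-space sketch of that capability test, with hypothetical names:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* mirrors the inline test swiotlb now uses: dev_addr + size must stay under the mask */
static bool dma_capable_sketch(uint64_t dma_mask, uint64_t dev_addr, size_t size)
{
    return dev_addr + size <= dma_mask;
}

int main(void)
{
    uint64_t mask32 = 0xffffffffULL; /* a device limited to 32-bit DMA addresses */

    printf("%d\n", dma_capable_sketch(mask32, 0x7fff0000ULL, 4096)); /* 1: use directly */
    printf("%d\n", dma_capable_sketch(mask32, 0xffffff00ULL, 4096)); /* 0: bounce buffer */
    return 0;
}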
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 911ba7ffab84..090d300d7394 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -57,7 +57,6 @@
57# call mcount (offset: 0x5) 57# call mcount (offset: 0x5)
58# [...] 58# [...]
59# ret 59# ret
60# .globl my_func
61# other_func: 60# other_func:
62# [...] 61# [...]
63# call mcount (offset: 0x1b) 62# call mcount (offset: 0x1b)
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 6be696b0a2bb..0ff23de9e453 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -91,6 +91,10 @@ OPTIONS
91--no-samples:: 91--no-samples::
92 Don't sample. 92 Don't sample.
93 93
94-R::
95--raw-samples::
96Collect raw sample records from all opened counters (typically for tracepoint counters).
97
94SEE ALSO 98SEE ALSO
95-------- 99--------
96linkperf:perf-stat[1], linkperf:perf-list[1] 100linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index e72e93110782..59f0b846cd71 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -27,6 +27,9 @@ OPTIONS
27-n 27-n
28--show-nr-samples 28--show-nr-samples
29 Show the number of samples for each symbol 29 Show the number of samples for each symbol
30-T
31--threads
32 Show per-thread event counters
30-C:: 33-C::
31--comms=:: 34--comms=::
32 Only consider symbols in these comms. CSV that understands 35 Only consider symbols in these comms. CSV that understands
@@ -48,6 +51,16 @@ OPTIONS
48 all occurrences of this separator in symbol names (and other output) 51
49 with a '.' character, which is thus the only invalid separator. 52
50 53
54-g [type,min]::
55--call-graph::
56 Display callchains using type and min percent threshold.
57 type can be either:
58 - flat: single column, linear exposure of callchains.
59 - graph: use a graph tree, displaying absolute overhead rates.
60 - fractal: like graph, but displays relative rates. Each branch of
61 the tree is considered as a new profiled object. +
62 Default: fractal,0.5.
63
51SEE ALSO 64SEE ALSO
52-------- 65--------
53linkperf:perf-stat[1] 66linkperf:perf-stat[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index c045b4271e57..9f8d207a91bf 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -166,7 +166,35 @@ endif
166 166
167# CFLAGS and LDFLAGS are for the users to override from the command line. 167# CFLAGS and LDFLAGS are for the users to override from the command line.
168 168
169CFLAGS = $(M64) -ggdb3 -Wall -Wextra -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -Werror -O6 169#
170# Include saner warnings here, which can catch bugs:
171#
172
173EXTRA_WARNINGS := -Wcast-align
174EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wformat
175EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wformat-security
176EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wformat-y2k
177EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wshadow
178EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Winit-self
179EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wpacked
180EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wredundant-decls
181EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wstack-protector
182EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wstrict-aliasing=3
183EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wswitch-default
184EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wswitch-enum
185EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wno-system-headers
186EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wundef
187EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wvolatile-register-var
188EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wwrite-strings
189EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wbad-function-cast
190EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wmissing-declarations
191EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wmissing-prototypes
192EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wnested-externs
193EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wold-style-definition
194EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wstrict-prototypes
195EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wdeclaration-after-statement
196
197CFLAGS = $(M64) -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -fstack-protector-all -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS)
170LDFLAGS = -lpthread -lrt -lelf -lm 198LDFLAGS = -lpthread -lrt -lelf -lm
171ALL_CFLAGS = $(CFLAGS) 199ALL_CFLAGS = $(CFLAGS)
172ALL_LDFLAGS = $(LDFLAGS) 200ALL_LDFLAGS = $(LDFLAGS)
@@ -310,6 +338,7 @@ LIB_H += util/sigchain.h
310LIB_H += util/symbol.h 338LIB_H += util/symbol.h
311LIB_H += util/module.h 339LIB_H += util/module.h
312LIB_H += util/color.h 340LIB_H += util/color.h
341LIB_H += util/values.h
313 342
314LIB_OBJS += util/abspath.o 343LIB_OBJS += util/abspath.o
315LIB_OBJS += util/alias.o 344LIB_OBJS += util/alias.o
@@ -337,6 +366,13 @@ LIB_OBJS += util/color.o
337LIB_OBJS += util/pager.o 366LIB_OBJS += util/pager.o
338LIB_OBJS += util/header.o 367LIB_OBJS += util/header.o
339LIB_OBJS += util/callchain.o 368LIB_OBJS += util/callchain.o
369LIB_OBJS += util/values.o
370LIB_OBJS += util/debug.o
371LIB_OBJS += util/map.o
372LIB_OBJS += util/thread.o
373LIB_OBJS += util/trace-event-parse.o
374LIB_OBJS += util/trace-event-read.o
375LIB_OBJS += util/trace-event-info.o
340 376
341BUILTIN_OBJS += builtin-annotate.o 377BUILTIN_OBJS += builtin-annotate.o
342BUILTIN_OBJS += builtin-help.o 378BUILTIN_OBJS += builtin-help.o
@@ -345,6 +381,7 @@ BUILTIN_OBJS += builtin-record.o
345BUILTIN_OBJS += builtin-report.o 381BUILTIN_OBJS += builtin-report.o
346BUILTIN_OBJS += builtin-stat.o 382BUILTIN_OBJS += builtin-stat.o
347BUILTIN_OBJS += builtin-top.o 383BUILTIN_OBJS += builtin-top.o
384BUILTIN_OBJS += builtin-trace.o
348 385
349PERFLIBS = $(LIB_FILE) 386PERFLIBS = $(LIB_FILE)
350 387
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 5e17de984dc8..043d85b7e254 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -17,16 +17,13 @@
17#include "util/string.h" 17#include "util/string.h"
18 18
19#include "perf.h" 19#include "perf.h"
20#include "util/debug.h"
20 21
21#include "util/parse-options.h" 22#include "util/parse-options.h"
22#include "util/parse-events.h" 23#include "util/parse-events.h"
23 24#include "util/thread.h"
24#define SHOW_KERNEL 1
25#define SHOW_USER 2
26#define SHOW_HV 4
27 25
28static char const *input_name = "perf.data"; 26static char const *input_name = "perf.data";
29static char *vmlinux = "vmlinux";
30 27
31static char default_sort_order[] = "comm,symbol"; 28static char default_sort_order[] = "comm,symbol";
32static char *sort_order = default_sort_order; 29static char *sort_order = default_sort_order;
@@ -35,13 +32,6 @@ static int force;
35static int input; 32static int input;
36static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV; 33static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
37 34
38static int dump_trace = 0;
39#define dprintf(x...) do { if (dump_trace) printf(x); } while (0)
40
41static int verbose;
42
43static int modules;
44
45static int full_paths; 35static int full_paths;
46 36
47static int print_line; 37static int print_line;
@@ -49,39 +39,8 @@ static int print_line;
49static unsigned long page_size; 39static unsigned long page_size;
50static unsigned long mmap_window = 32; 40static unsigned long mmap_window = 32;
51 41
52struct ip_event { 42static struct rb_root threads;
53 struct perf_event_header header; 43static struct thread *last_match;
54 u64 ip;
55 u32 pid, tid;
56};
57
58struct mmap_event {
59 struct perf_event_header header;
60 u32 pid, tid;
61 u64 start;
62 u64 len;
63 u64 pgoff;
64 char filename[PATH_MAX];
65};
66
67struct comm_event {
68 struct perf_event_header header;
69 u32 pid, tid;
70 char comm[16];
71};
72
73struct fork_event {
74 struct perf_event_header header;
75 u32 pid, ppid;
76};
77
78typedef union event_union {
79 struct perf_event_header header;
80 struct ip_event ip;
81 struct mmap_event mmap;
82 struct comm_event comm;
83 struct fork_event fork;
84} event_t;
85 44
86 45
87struct sym_ext { 46struct sym_ext {
@@ -90,323 +49,6 @@ struct sym_ext {
90 char *path; 49 char *path;
91}; 50};
92 51
93static LIST_HEAD(dsos);
94static struct dso *kernel_dso;
95static struct dso *vdso;
96
97
98static void dsos__add(struct dso *dso)
99{
100 list_add_tail(&dso->node, &dsos);
101}
102
103static struct dso *dsos__find(const char *name)
104{
105 struct dso *pos;
106
107 list_for_each_entry(pos, &dsos, node)
108 if (strcmp(pos->name, name) == 0)
109 return pos;
110 return NULL;
111}
112
113static struct dso *dsos__findnew(const char *name)
114{
115 struct dso *dso = dsos__find(name);
116 int nr;
117
118 if (dso)
119 return dso;
120
121 dso = dso__new(name, 0);
122 if (!dso)
123 goto out_delete_dso;
124
125 nr = dso__load(dso, NULL, verbose);
126 if (nr < 0) {
127 if (verbose)
128 fprintf(stderr, "Failed to open: %s\n", name);
129 goto out_delete_dso;
130 }
131 if (!nr && verbose) {
132 fprintf(stderr,
133 "No symbols found in: %s, maybe install a debug package?\n",
134 name);
135 }
136
137 dsos__add(dso);
138
139 return dso;
140
141out_delete_dso:
142 dso__delete(dso);
143 return NULL;
144}
145
146static void dsos__fprintf(FILE *fp)
147{
148 struct dso *pos;
149
150 list_for_each_entry(pos, &dsos, node)
151 dso__fprintf(pos, fp);
152}
153
154static struct symbol *vdso__find_symbol(struct dso *dso, u64 ip)
155{
156 return dso__find_symbol(dso, ip);
157}
158
159static int load_kernel(void)
160{
161 int err;
162
163 kernel_dso = dso__new("[kernel]", 0);
164 if (!kernel_dso)
165 return -1;
166
167 err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose, modules);
168 if (err <= 0) {
169 dso__delete(kernel_dso);
170 kernel_dso = NULL;
171 } else
172 dsos__add(kernel_dso);
173
174 vdso = dso__new("[vdso]", 0);
175 if (!vdso)
176 return -1;
177
178 vdso->find_symbol = vdso__find_symbol;
179
180 dsos__add(vdso);
181
182 return err;
183}
184
185struct map {
186 struct list_head node;
187 u64 start;
188 u64 end;
189 u64 pgoff;
190 u64 (*map_ip)(struct map *, u64);
191 struct dso *dso;
192};
193
194static u64 map__map_ip(struct map *map, u64 ip)
195{
196 return ip - map->start + map->pgoff;
197}
198
199static u64 vdso__map_ip(struct map *map __used, u64 ip)
200{
201 return ip;
202}
203
204static struct map *map__new(struct mmap_event *event)
205{
206 struct map *self = malloc(sizeof(*self));
207
208 if (self != NULL) {
209 const char *filename = event->filename;
210
211 self->start = event->start;
212 self->end = event->start + event->len;
213 self->pgoff = event->pgoff;
214
215 self->dso = dsos__findnew(filename);
216 if (self->dso == NULL)
217 goto out_delete;
218
219 if (self->dso == vdso)
220 self->map_ip = vdso__map_ip;
221 else
222 self->map_ip = map__map_ip;
223 }
224 return self;
225out_delete:
226 free(self);
227 return NULL;
228}
229
230static struct map *map__clone(struct map *self)
231{
232 struct map *map = malloc(sizeof(*self));
233
234 if (!map)
235 return NULL;
236
237 memcpy(map, self, sizeof(*self));
238
239 return map;
240}
241
242static int map__overlap(struct map *l, struct map *r)
243{
244 if (l->start > r->start) {
245 struct map *t = l;
246 l = r;
247 r = t;
248 }
249
250 if (l->end > r->start)
251 return 1;
252
253 return 0;
254}
255
256static size_t map__fprintf(struct map *self, FILE *fp)
257{
258 return fprintf(fp, " %Lx-%Lx %Lx %s\n",
259 self->start, self->end, self->pgoff, self->dso->name);
260}
261
262
263struct thread {
264 struct rb_node rb_node;
265 struct list_head maps;
266 pid_t pid;
267 char *comm;
268};
269
270static struct thread *thread__new(pid_t pid)
271{
272 struct thread *self = malloc(sizeof(*self));
273
274 if (self != NULL) {
275 self->pid = pid;
276 self->comm = malloc(32);
277 if (self->comm)
278 snprintf(self->comm, 32, ":%d", self->pid);
279 INIT_LIST_HEAD(&self->maps);
280 }
281
282 return self;
283}
284
285static int thread__set_comm(struct thread *self, const char *comm)
286{
287 if (self->comm)
288 free(self->comm);
289 self->comm = strdup(comm);
290 return self->comm ? 0 : -ENOMEM;
291}
292
293static size_t thread__fprintf(struct thread *self, FILE *fp)
294{
295 struct map *pos;
296 size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
297
298 list_for_each_entry(pos, &self->maps, node)
299 ret += map__fprintf(pos, fp);
300
301 return ret;
302}
303
304
305static struct rb_root threads;
306static struct thread *last_match;
307
308static struct thread *threads__findnew(pid_t pid)
309{
310 struct rb_node **p = &threads.rb_node;
311 struct rb_node *parent = NULL;
312 struct thread *th;
313
314 /*
315 * Front-end cache - PID lookups come in blocks,
316 * so most of the time we don't have to look up
317 * the full rbtree:
318 */
319 if (last_match && last_match->pid == pid)
320 return last_match;
321
322 while (*p != NULL) {
323 parent = *p;
324 th = rb_entry(parent, struct thread, rb_node);
325
326 if (th->pid == pid) {
327 last_match = th;
328 return th;
329 }
330
331 if (pid < th->pid)
332 p = &(*p)->rb_left;
333 else
334 p = &(*p)->rb_right;
335 }
336
337 th = thread__new(pid);
338 if (th != NULL) {
339 rb_link_node(&th->rb_node, parent, p);
340 rb_insert_color(&th->rb_node, &threads);
341 last_match = th;
342 }
343
344 return th;
345}
346
347static void thread__insert_map(struct thread *self, struct map *map)
348{
349 struct map *pos, *tmp;
350
351 list_for_each_entry_safe(pos, tmp, &self->maps, node) {
352 if (map__overlap(pos, map)) {
353 list_del_init(&pos->node);
354 /* XXX leaks dsos */
355 free(pos);
356 }
357 }
358
359 list_add_tail(&map->node, &self->maps);
360}
361
362static int thread__fork(struct thread *self, struct thread *parent)
363{
364 struct map *map;
365
366 if (self->comm)
367 free(self->comm);
368 self->comm = strdup(parent->comm);
369 if (!self->comm)
370 return -ENOMEM;
371
372 list_for_each_entry(map, &parent->maps, node) {
373 struct map *new = map__clone(map);
374 if (!new)
375 return -ENOMEM;
376 thread__insert_map(self, new);
377 }
378
379 return 0;
380}
381
382static struct map *thread__find_map(struct thread *self, u64 ip)
383{
384 struct map *pos;
385
386 if (self == NULL)
387 return NULL;
388
389 list_for_each_entry(pos, &self->maps, node)
390 if (ip >= pos->start && ip <= pos->end)
391 return pos;
392
393 return NULL;
394}
395
396static size_t threads__fprintf(FILE *fp)
397{
398 size_t ret = 0;
399 struct rb_node *nd;
400
401 for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
402 struct thread *pos = rb_entry(nd, struct thread, rb_node);
403
404 ret += thread__fprintf(pos, fp);
405 }
406
407 return ret;
408}
409
410/* 52/*
411 * histogram, sorted on item, collects counts 53 * histogram, sorted on item, collects counts
412 */ 54 */
@@ -433,7 +75,7 @@ struct hist_entry {
433struct sort_entry { 75struct sort_entry {
434 struct list_head list; 76 struct list_head list;
435 77
436 char *header; 78 const char *header;
437 79
438 int64_t (*cmp)(struct hist_entry *, struct hist_entry *); 80 int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
439 int64_t (*collapse)(struct hist_entry *, struct hist_entry *); 81 int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
@@ -577,7 +219,7 @@ static struct sort_entry sort_sym = {
577static int sort__need_collapse = 0; 219static int sort__need_collapse = 0;
578 220
579struct sort_dimension { 221struct sort_dimension {
580 char *name; 222 const char *name;
581 struct sort_entry *entry; 223 struct sort_entry *entry;
582 int taken; 224 int taken;
583}; 225};
@@ -830,17 +472,6 @@ static void output__resort(void)
830 } 472 }
831} 473}
832 474
833static void register_idle_thread(void)
834{
835 struct thread *thread = threads__findnew(0);
836
837 if (thread == NULL ||
838 thread__set_comm(thread, "[idle]")) {
839 fprintf(stderr, "problem inserting idle task.\n");
840 exit(-1);
841 }
842}
843
844static unsigned long total = 0, 475static unsigned long total = 0,
845 total_mmap = 0, 476 total_mmap = 0,
846 total_comm = 0, 477 total_comm = 0,
@@ -853,18 +484,20 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
853 char level; 484 char level;
854 int show = 0; 485 int show = 0;
855 struct dso *dso = NULL; 486 struct dso *dso = NULL;
856 struct thread *thread = threads__findnew(event->ip.pid); 487 struct thread *thread;
857 u64 ip = event->ip.ip; 488 u64 ip = event->ip.ip;
858 struct map *map = NULL; 489 struct map *map = NULL;
859 490
860 dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n", 491 thread = threads__findnew(event->ip.pid, &threads, &last_match);
492
493 dump_printf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
861 (void *)(offset + head), 494 (void *)(offset + head),
862 (void *)(long)(event->header.size), 495 (void *)(long)(event->header.size),
863 event->header.misc, 496 event->header.misc,
864 event->ip.pid, 497 event->ip.pid,
865 (void *)(long)ip); 498 (void *)(long)ip);
866 499
867 dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid); 500 dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
868 501
869 if (thread == NULL) { 502 if (thread == NULL) {
870 fprintf(stderr, "problem processing %d event, skipping it.\n", 503 fprintf(stderr, "problem processing %d event, skipping it.\n",
@@ -878,7 +511,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
878 511
879 dso = kernel_dso; 512 dso = kernel_dso;
880 513
881 dprintf(" ...... dso: %s\n", dso->name); 514 dump_printf(" ...... dso: %s\n", dso->name);
882 515
883 } else if (event->header.misc & PERF_EVENT_MISC_USER) { 516 } else if (event->header.misc & PERF_EVENT_MISC_USER) {
884 517
@@ -899,12 +532,12 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
899 if ((long long)ip < 0) 532 if ((long long)ip < 0)
900 dso = kernel_dso; 533 dso = kernel_dso;
901 } 534 }
902 dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>"); 535 dump_printf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
903 536
904 } else { 537 } else {
905 show = SHOW_HV; 538 show = SHOW_HV;
906 level = 'H'; 539 level = 'H';
907 dprintf(" ...... dso: [hypervisor]\n"); 540 dump_printf(" ...... dso: [hypervisor]\n");
908 } 541 }
909 542
910 if (show & show_mask) { 543 if (show & show_mask) {
@@ -927,10 +560,12 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
927static int 560static int
928process_mmap_event(event_t *event, unsigned long offset, unsigned long head) 561process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
929{ 562{
930 struct thread *thread = threads__findnew(event->mmap.pid); 563 struct thread *thread;
931 struct map *map = map__new(&event->mmap); 564 struct map *map = map__new(&event->mmap, NULL, 0);
565
566 thread = threads__findnew(event->mmap.pid, &threads, &last_match);
932 567
933 dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n", 568 dump_printf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
934 (void *)(offset + head), 569 (void *)(offset + head),
935 (void *)(long)(event->header.size), 570 (void *)(long)(event->header.size),
936 event->mmap.pid, 571 event->mmap.pid,
@@ -940,7 +575,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
940 event->mmap.filename); 575 event->mmap.filename);
941 576
942 if (thread == NULL || map == NULL) { 577 if (thread == NULL || map == NULL) {
943 dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n"); 578 dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n");
944 return 0; 579 return 0;
945 } 580 }
946 581
@@ -953,16 +588,17 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
953static int 588static int
954process_comm_event(event_t *event, unsigned long offset, unsigned long head) 589process_comm_event(event_t *event, unsigned long offset, unsigned long head)
955{ 590{
956 struct thread *thread = threads__findnew(event->comm.pid); 591 struct thread *thread;
957 592
958 dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", 593 thread = threads__findnew(event->comm.pid, &threads, &last_match);
594 dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
959 (void *)(offset + head), 595 (void *)(offset + head),
960 (void *)(long)(event->header.size), 596 (void *)(long)(event->header.size),
961 event->comm.comm, event->comm.pid); 597 event->comm.comm, event->comm.pid);
962 598
963 if (thread == NULL || 599 if (thread == NULL ||
964 thread__set_comm(thread, event->comm.comm)) { 600 thread__set_comm(thread, event->comm.comm)) {
965 dprintf("problem processing PERF_EVENT_COMM, skipping event.\n"); 601 dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
966 return -1; 602 return -1;
967 } 603 }
968 total_comm++; 604 total_comm++;
@@ -973,10 +609,12 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
973static int 609static int
974process_fork_event(event_t *event, unsigned long offset, unsigned long head) 610process_fork_event(event_t *event, unsigned long offset, unsigned long head)
975{ 611{
976 struct thread *thread = threads__findnew(event->fork.pid); 612 struct thread *thread;
977 struct thread *parent = threads__findnew(event->fork.ppid); 613 struct thread *parent;
978 614
979 dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n", 615 thread = threads__findnew(event->fork.pid, &threads, &last_match);
616 parent = threads__findnew(event->fork.ppid, &threads, &last_match);
617 dump_printf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
980 (void *)(offset + head), 618 (void *)(offset + head),
981 (void *)(long)(event->header.size), 619 (void *)(long)(event->header.size),
982 event->fork.pid, event->fork.ppid); 620 event->fork.pid, event->fork.ppid);
@@ -989,7 +627,7 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head)
989 return 0; 627 return 0;
990 628
991 if (!thread || !parent || thread__fork(thread, parent)) { 629 if (!thread || !parent || thread__fork(thread, parent)) {
992 dprintf("problem processing PERF_EVENT_FORK, skipping event.\n"); 630 dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n");
993 return -1; 631 return -1;
994 } 632 }
995 total_fork++; 633 total_fork++;
@@ -1075,7 +713,7 @@ parse_line(FILE *file, struct symbol *sym, u64 start, u64 len)
1075 const char *path = NULL; 713 const char *path = NULL;
1076 unsigned int hits = 0; 714 unsigned int hits = 0;
1077 double percent = 0.0; 715 double percent = 0.0;
1078 char *color; 716 const char *color;
1079 struct sym_ext *sym_ext = sym->priv; 717 struct sym_ext *sym_ext = sym->priv;
1080 718
1081 offset = line_ip - start; 719 offset = line_ip - start;
@@ -1157,7 +795,7 @@ static void free_source_line(struct symbol *sym, int len)
1157 795
1158/* Get the filename:line for the colored entries */ 796/* Get the filename:line for the colored entries */
1159static void 797static void
1160get_source_line(struct symbol *sym, u64 start, int len, char *filename) 798get_source_line(struct symbol *sym, u64 start, int len, const char *filename)
1161{ 799{
1162 int i; 800 int i;
1163 char cmd[PATH_MAX * 2]; 801 char cmd[PATH_MAX * 2];
@@ -1203,7 +841,7 @@ get_source_line(struct symbol *sym, u64 start, int len, char *filename)
1203 } 841 }
1204} 842}
1205 843
1206static void print_summary(char *filename) 844static void print_summary(const char *filename)
1207{ 845{
1208 struct sym_ext *sym_ext; 846 struct sym_ext *sym_ext;
1209 struct rb_node *node; 847 struct rb_node *node;
@@ -1219,7 +857,7 @@ static void print_summary(char *filename)
1219 node = rb_first(&root_sym_ext); 857 node = rb_first(&root_sym_ext);
1220 while (node) { 858 while (node) {
1221 double percent; 859 double percent;
1222 char *color; 860 const char *color;
1223 char *path; 861 char *path;
1224 862
1225 sym_ext = rb_entry(node, struct sym_ext, node); 863 sym_ext = rb_entry(node, struct sym_ext, node);
@@ -1234,7 +872,7 @@ static void print_summary(char *filename)
1234 872
1235static void annotate_sym(struct dso *dso, struct symbol *sym) 873static void annotate_sym(struct dso *dso, struct symbol *sym)
1236{ 874{
1237 char *filename = dso->name, *d_filename; 875 const char *filename = dso->name, *d_filename;
1238 u64 start, end, len; 876 u64 start, end, len;
1239 char command[PATH_MAX*2]; 877 char command[PATH_MAX*2];
1240 FILE *file; 878 FILE *file;
@@ -1244,7 +882,7 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
1244 if (sym->module) 882 if (sym->module)
1245 filename = sym->module->path; 883 filename = sym->module->path;
1246 else if (dso == kernel_dso) 884 else if (dso == kernel_dso)
1247 filename = vmlinux; 885 filename = vmlinux_name;
1248 886
1249 start = sym->obj_start; 887 start = sym->obj_start;
1250 if (!start) 888 if (!start)
@@ -1316,12 +954,12 @@ static int __cmd_annotate(void)
1316 int ret, rc = EXIT_FAILURE; 954 int ret, rc = EXIT_FAILURE;
1317 unsigned long offset = 0; 955 unsigned long offset = 0;
1318 unsigned long head = 0; 956 unsigned long head = 0;
1319 struct stat stat; 957 struct stat input_stat;
1320 event_t *event; 958 event_t *event;
1321 uint32_t size; 959 uint32_t size;
1322 char *buf; 960 char *buf;
1323 961
1324 register_idle_thread(); 962 register_idle_thread(&threads, &last_match);
1325 963
1326 input = open(input_name, O_RDONLY); 964 input = open(input_name, O_RDONLY);
1327 if (input < 0) { 965 if (input < 0) {
@@ -1329,18 +967,18 @@ static int __cmd_annotate(void)
1329 exit(-1); 967 exit(-1);
1330 } 968 }
1331 969
1332 ret = fstat(input, &stat); 970 ret = fstat(input, &input_stat);
1333 if (ret < 0) { 971 if (ret < 0) {
1334 perror("failed to stat file"); 972 perror("failed to stat file");
1335 exit(-1); 973 exit(-1);
1336 } 974 }
1337 975
1338 if (!force && (stat.st_uid != geteuid())) { 976 if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) {
1339 fprintf(stderr, "file: %s not owned by current user\n", input_name); 977 fprintf(stderr, "file: %s not owned by current user or root\n", input_name);
1340 exit(-1); 978 exit(-1);
1341 } 979 }
1342 980
1343 if (!stat.st_size) { 981 if (!input_stat.st_size) {
1344 fprintf(stderr, "zero-sized file, nothing to do!\n"); 982 fprintf(stderr, "zero-sized file, nothing to do!\n");
1345 exit(0); 983 exit(0);
1346 } 984 }
@@ -1367,10 +1005,10 @@ more:
1367 1005
1368 if (head + event->header.size >= page_size * mmap_window) { 1006 if (head + event->header.size >= page_size * mmap_window) {
1369 unsigned long shift = page_size * (head / page_size); 1007 unsigned long shift = page_size * (head / page_size);
1370 int ret; 1008 int munmap_ret;
1371 1009
1372 ret = munmap(buf, page_size * mmap_window); 1010 munmap_ret = munmap(buf, page_size * mmap_window);
1373 assert(ret == 0); 1011 assert(munmap_ret == 0);
1374 1012
1375 offset += shift; 1013 offset += shift;
1376 head -= shift; 1014 head -= shift;
@@ -1379,14 +1017,14 @@ more:
1379 1017
1380 size = event->header.size; 1018 size = event->header.size;
1381 1019
1382 dprintf("%p [%p]: event: %d\n", 1020 dump_printf("%p [%p]: event: %d\n",
1383 (void *)(offset + head), 1021 (void *)(offset + head),
1384 (void *)(long)event->header.size, 1022 (void *)(long)event->header.size,
1385 event->header.type); 1023 event->header.type);
1386 1024
1387 if (!size || process_event(event, offset, head) < 0) { 1025 if (!size || process_event(event, offset, head) < 0) {
1388 1026
1389 dprintf("%p [%p]: skipping unknown header type: %d\n", 1027 dump_printf("%p [%p]: skipping unknown header type: %d\n",
1390 (void *)(offset + head), 1028 (void *)(offset + head),
1391 (void *)(long)(event->header.size), 1029 (void *)(long)(event->header.size),
1392 event->header.type); 1030 event->header.type);
@@ -1406,23 +1044,23 @@ more:
1406 1044
1407 head += size; 1045 head += size;
1408 1046
1409 if (offset + head < (unsigned long)stat.st_size) 1047 if (offset + head < (unsigned long)input_stat.st_size)
1410 goto more; 1048 goto more;
1411 1049
1412 rc = EXIT_SUCCESS; 1050 rc = EXIT_SUCCESS;
1413 close(input); 1051 close(input);
1414 1052
1415 dprintf(" IP events: %10ld\n", total); 1053 dump_printf(" IP events: %10ld\n", total);
1416 dprintf(" mmap events: %10ld\n", total_mmap); 1054 dump_printf(" mmap events: %10ld\n", total_mmap);
1417 dprintf(" comm events: %10ld\n", total_comm); 1055 dump_printf(" comm events: %10ld\n", total_comm);
1418 dprintf(" fork events: %10ld\n", total_fork); 1056 dump_printf(" fork events: %10ld\n", total_fork);
1419 dprintf(" unknown events: %10ld\n", total_unknown); 1057 dump_printf(" unknown events: %10ld\n", total_unknown);
1420 1058
1421 if (dump_trace) 1059 if (dump_trace)
1422 return 0; 1060 return 0;
1423 1061
1424 if (verbose >= 3) 1062 if (verbose >= 3)
1425 threads__fprintf(stdout); 1063 threads__fprintf(stdout, &threads);
1426 1064
1427 if (verbose >= 2) 1065 if (verbose >= 2)
1428 dsos__fprintf(stdout); 1066 dsos__fprintf(stdout);
@@ -1450,7 +1088,7 @@ static const struct option options[] = {
1450 "be more verbose (show symbol address, etc)"), 1088 "be more verbose (show symbol address, etc)"),
1451 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, 1089 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1452 "dump raw trace in ASCII"), 1090 "dump raw trace in ASCII"),
1453 OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"), 1091 OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
1454 OPT_BOOLEAN('m', "modules", &modules, 1092 OPT_BOOLEAN('m', "modules", &modules,
1455 "load module symbols - WARNING: use only with -k and LIVE kernel"), 1093 "load module symbols - WARNING: use only with -k and LIVE kernel"),
1456 OPT_BOOLEAN('l', "print-line", &print_line, 1094 OPT_BOOLEAN('l', "print-line", &print_line,
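The relaxed ownership test above accepts perf.data files owned by root as well as by the
current user (an st_uid of 0 short-circuits the comparison), unless -f forces the read; the
same change appears in builtin-report.c below. A minimal sketch of that check, using a
hypothetical check_owner() helper rather than the tool's inline test:

#include <sys/stat.h>
#include <unistd.h>

/* Accept the input file when -f was given, when it is owned by root
 * (st_uid == 0), or when it is owned by the current effective user. */
static int check_owner(const struct stat *st, int force)
{
        if (force)
                return 1;
        if (st->st_uid == 0)
                return 1;
        return st->st_uid == geteuid();
}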
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 2599d86a733b..4fb8734a796e 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -456,6 +456,7 @@ int cmd_help(int argc, const char **argv, const char *prefix __used)
456 break; 456 break;
457 case HELP_FORMAT_WEB: 457 case HELP_FORMAT_WEB:
458 show_html_page(argv[0]); 458 show_html_page(argv[0]);
459 default:
459 break; 460 break;
460 } 461 }
461 462
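The added default: arm above makes the help-format switch tolerate values it does not handle
explicitly; the same pattern is applied to the keypress switches in builtin-top.c further
down. A sketch of the idiom with an illustrative enum (not the tool's own definitions):

enum fmt { FMT_MAN, FMT_INFO, FMT_WEB };

static void dispatch(enum fmt f)
{
        switch (f) {
        case FMT_WEB:
                /* handle the interesting case here, then fall through */
        default:
                break;  /* every other value is deliberately a no-op */
        }
}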
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 89a5ddcd1ded..99a12fe86e9f 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -15,6 +15,9 @@
15#include "util/string.h" 15#include "util/string.h"
16 16
17#include "util/header.h" 17#include "util/header.h"
18#include "util/event.h"
19#include "util/debug.h"
20#include "util/trace-event.h"
18 21
19#include <unistd.h> 22#include <unistd.h>
20#include <sched.h> 23#include <sched.h>
@@ -42,7 +45,6 @@ static int inherit = 1;
42static int force = 0; 45static int force = 0;
43static int append_file = 0; 46static int append_file = 0;
44static int call_graph = 0; 47static int call_graph = 0;
45static int verbose = 0;
46static int inherit_stat = 0; 48static int inherit_stat = 0;
47static int no_samples = 0; 49static int no_samples = 0;
48static int sample_address = 0; 50static int sample_address = 0;
@@ -62,24 +64,6 @@ static int file_new = 1;
62 64
63struct perf_header *header; 65struct perf_header *header;
64 66
65struct mmap_event {
66 struct perf_event_header header;
67 u32 pid;
68 u32 tid;
69 u64 start;
70 u64 len;
71 u64 pgoff;
72 char filename[PATH_MAX];
73};
74
75struct comm_event {
76 struct perf_event_header header;
77 u32 pid;
78 u32 tid;
79 char comm[16];
80};
81
82
83struct mmap_data { 67struct mmap_data {
84 int counter; 68 int counter;
85 void *base; 69 void *base;
@@ -419,8 +403,11 @@ static void create_counter(int counter, int cpu, pid_t pid)
419 if (call_graph) 403 if (call_graph)
420 attr->sample_type |= PERF_SAMPLE_CALLCHAIN; 404 attr->sample_type |= PERF_SAMPLE_CALLCHAIN;
421 405
422 if (raw_samples) 406 if (raw_samples) {
407 attr->sample_type |= PERF_SAMPLE_TIME;
423 attr->sample_type |= PERF_SAMPLE_RAW; 408 attr->sample_type |= PERF_SAMPLE_RAW;
409 attr->sample_type |= PERF_SAMPLE_CPU;
410 }
424 411
425 attr->mmap = track; 412 attr->mmap = track;
426 attr->comm = track; 413 attr->comm = track;
@@ -563,6 +550,17 @@ static int __cmd_record(int argc, const char **argv)
563 else 550 else
564 header = perf_header__new(); 551 header = perf_header__new();
565 552
553
554 if (raw_samples) {
555 read_tracing_data(attrs, nr_counters);
556 } else {
557 for (i = 0; i < nr_counters; i++) {
558 if (attrs[i].sample_type & PERF_SAMPLE_RAW) {
559 read_tracing_data(attrs, nr_counters);
560 break;
561 }
562 }
563 }
566 atexit(atexit_header); 564 atexit(atexit_header);
567 565
568 if (!system_wide) { 566 if (!system_wide) {
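Two things change for raw samples above: the counter attributes now ask for a timestamp and
the CPU number next to the raw payload, and the tracepoint format data is written out when
raw_samples is set or when any individual counter carries the PERF_SAMPLE_RAW bit. A
condensed sketch of that logic; it leans on perf's own headers for struct perf_counter_attr,
the PERF_SAMPLE_* bits and read_tracing_data(), so it is illustrative rather than stand-alone:

static void setup_raw_sample_type(struct perf_counter_attr *attr)
{
        attr->sample_type |= PERF_SAMPLE_TIME;  /* order the records in time */
        attr->sample_type |= PERF_SAMPLE_RAW;   /* carry the tracepoint data */
        attr->sample_type |= PERF_SAMPLE_CPU;   /* record where it happened  */
}

static void maybe_save_tracing_data(struct perf_counter_attr *attrs, int nr_counters)
{
        int i;

        for (i = 0; i < nr_counters; i++) {
                if (attrs[i].sample_type & PERF_SAMPLE_RAW) {
                        read_tracing_data(attrs, nr_counters);
                        break;
                }
        }
}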
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 8b2ec882e6e0..cdf9a8d27bb9 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -17,19 +17,18 @@
17#include "util/string.h" 17#include "util/string.h"
18#include "util/callchain.h" 18#include "util/callchain.h"
19#include "util/strlist.h" 19#include "util/strlist.h"
20#include "util/values.h"
20 21
21#include "perf.h" 22#include "perf.h"
23#include "util/debug.h"
22#include "util/header.h" 24#include "util/header.h"
23 25
24#include "util/parse-options.h" 26#include "util/parse-options.h"
25#include "util/parse-events.h" 27#include "util/parse-events.h"
26 28
27#define SHOW_KERNEL 1 29#include "util/thread.h"
28#define SHOW_USER 2
29#define SHOW_HV 4
30 30
31static char const *input_name = "perf.data"; 31static char const *input_name = "perf.data";
32static char *vmlinux = NULL;
33 32
34static char default_sort_order[] = "comm,dso,symbol"; 33static char default_sort_order[] = "comm,dso,symbol";
35static char *sort_order = default_sort_order; 34static char *sort_order = default_sort_order;
@@ -42,18 +41,15 @@ static int force;
42static int input; 41static int input;
43static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV; 42static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
44 43
45static int dump_trace = 0;
46#define dprintf(x...) do { if (dump_trace) printf(x); } while (0)
47#define cdprintf(x...) do { if (dump_trace) color_fprintf(stdout, color, x); } while (0)
48
49static int verbose;
50#define eprintf(x...) do { if (verbose) fprintf(stderr, x); } while (0)
51
52static int modules;
53
54static int full_paths; 44static int full_paths;
55static int show_nr_samples; 45static int show_nr_samples;
56 46
47static int show_threads;
48static struct perf_read_values show_threads_values;
49
50static char default_pretty_printing_style[] = "normal";
51static char *pretty_printing_style = default_pretty_printing_style;
52
57static unsigned long page_size; 53static unsigned long page_size;
58static unsigned long mmap_window = 32; 54static unsigned long mmap_window = 32;
59 55
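The verbose flag and the dprintf()/eprintf() macros deleted above are not lost: the series
funnels them through util/debug.h, and every dprintf() call site below becomes dump_printf().
Judging by the call sites in this diff, the shared header is assumed to provide roughly the
following (names taken from the diff; the exact definitions may differ):

/* util/debug.h as implied by this series; a sketch, not the real header */
extern int verbose;             /* -v/--verbose level, shared by all builtins */
extern int dump_trace;          /* -D/--dump-raw-trace                        */

int dump_printf(const char *fmt, ...);  /* printed only when dump_trace is set */
int eprintf(const char *fmt, ...);      /* stderr output gated on verbose      */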
@@ -67,6 +63,15 @@ static char callchain_default_opt[] = "fractal,0.5";
67 63
68static int callchain; 64static int callchain;
69 65
66static char __cwd[PATH_MAX];
67static char *cwd = __cwd;
68static int cwdlen;
69
70static struct rb_root threads;
71static struct thread *last_match;
72
73static struct perf_header *header;
74
70static 75static
71struct callchain_param callchain_param = { 76struct callchain_param callchain_param = {
72 .mode = CHAIN_GRAPH_REL, 77 .mode = CHAIN_GRAPH_REL,
@@ -75,59 +80,6 @@ struct callchain_param callchain_param = {
75 80
76static u64 sample_type; 81static u64 sample_type;
77 82
78struct ip_event {
79 struct perf_event_header header;
80 u64 ip;
81 u32 pid, tid;
82 unsigned char __more_data[];
83};
84
85struct mmap_event {
86 struct perf_event_header header;
87 u32 pid, tid;
88 u64 start;
89 u64 len;
90 u64 pgoff;
91 char filename[PATH_MAX];
92};
93
94struct comm_event {
95 struct perf_event_header header;
96 u32 pid, tid;
97 char comm[16];
98};
99
100struct fork_event {
101 struct perf_event_header header;
102 u32 pid, ppid;
103 u32 tid, ptid;
104};
105
106struct lost_event {
107 struct perf_event_header header;
108 u64 id;
109 u64 lost;
110};
111
112struct read_event {
113 struct perf_event_header header;
114 u32 pid,tid;
115 u64 value;
116 u64 time_enabled;
117 u64 time_running;
118 u64 id;
119};
120
121typedef union event_union {
122 struct perf_event_header header;
123 struct ip_event ip;
124 struct mmap_event mmap;
125 struct comm_event comm;
126 struct fork_event fork;
127 struct lost_event lost;
128 struct read_event read;
129} event_t;
130
131static int repsep_fprintf(FILE *fp, const char *fmt, ...) 83static int repsep_fprintf(FILE *fp, const char *fmt, ...)
132{ 84{
133 int n; 85 int n;
@@ -141,6 +93,7 @@ static int repsep_fprintf(FILE *fp, const char *fmt, ...)
141 n = vasprintf(&bf, fmt, ap); 93 n = vasprintf(&bf, fmt, ap);
142 if (n > 0) { 94 if (n > 0) {
143 char *sep = bf; 95 char *sep = bf;
96
144 while (1) { 97 while (1) {
145 sep = strchr(sep, *field_sep); 98 sep = strchr(sep, *field_sep);
146 if (sep == NULL) 99 if (sep == NULL)
@@ -155,396 +108,10 @@ static int repsep_fprintf(FILE *fp, const char *fmt, ...)
155 return n; 108 return n;
156} 109}
157 110
158static LIST_HEAD(dsos);
159static struct dso *kernel_dso;
160static struct dso *vdso;
161static struct dso *hypervisor_dso;
162
163static void dsos__add(struct dso *dso)
164{
165 list_add_tail(&dso->node, &dsos);
166}
167
168static struct dso *dsos__find(const char *name)
169{
170 struct dso *pos;
171
172 list_for_each_entry(pos, &dsos, node)
173 if (strcmp(pos->name, name) == 0)
174 return pos;
175 return NULL;
176}
177
178static struct dso *dsos__findnew(const char *name)
179{
180 struct dso *dso = dsos__find(name);
181 int nr;
182
183 if (dso)
184 return dso;
185
186 dso = dso__new(name, 0);
187 if (!dso)
188 goto out_delete_dso;
189
190 nr = dso__load(dso, NULL, verbose);
191 if (nr < 0) {
192 eprintf("Failed to open: %s\n", name);
193 goto out_delete_dso;
194 }
195 if (!nr)
196 eprintf("No symbols found in: %s, maybe install a debug package?\n", name);
197
198 dsos__add(dso);
199
200 return dso;
201
202out_delete_dso:
203 dso__delete(dso);
204 return NULL;
205}
206
207static void dsos__fprintf(FILE *fp)
208{
209 struct dso *pos;
210
211 list_for_each_entry(pos, &dsos, node)
212 dso__fprintf(pos, fp);
213}
214
215static struct symbol *vdso__find_symbol(struct dso *dso, u64 ip)
216{
217 return dso__find_symbol(dso, ip);
218}
219
220static int load_kernel(void)
221{
222 int err;
223
224 kernel_dso = dso__new("[kernel]", 0);
225 if (!kernel_dso)
226 return -1;
227
228 err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose, modules);
229 if (err <= 0) {
230 dso__delete(kernel_dso);
231 kernel_dso = NULL;
232 } else
233 dsos__add(kernel_dso);
234
235 vdso = dso__new("[vdso]", 0);
236 if (!vdso)
237 return -1;
238
239 vdso->find_symbol = vdso__find_symbol;
240
241 dsos__add(vdso);
242
243 hypervisor_dso = dso__new("[hypervisor]", 0);
244 if (!hypervisor_dso)
245 return -1;
246 dsos__add(hypervisor_dso);
247
248 return err;
249}
250
251static char __cwd[PATH_MAX];
252static char *cwd = __cwd;
253static int cwdlen;
254
255static int strcommon(const char *pathname)
256{
257 int n = 0;
258
259 while (n < cwdlen && pathname[n] == cwd[n])
260 ++n;
261
262 return n;
263}
264
265struct map {
266 struct list_head node;
267 u64 start;
268 u64 end;
269 u64 pgoff;
270 u64 (*map_ip)(struct map *, u64);
271 struct dso *dso;
272};
273
274static u64 map__map_ip(struct map *map, u64 ip)
275{
276 return ip - map->start + map->pgoff;
277}
278
279static u64 vdso__map_ip(struct map *map __used, u64 ip)
280{
281 return ip;
282}
283
284static inline int is_anon_memory(const char *filename)
285{
286 return strcmp(filename, "//anon") == 0;
287}
288
289static struct map *map__new(struct mmap_event *event)
290{
291 struct map *self = malloc(sizeof(*self));
292
293 if (self != NULL) {
294 const char *filename = event->filename;
295 char newfilename[PATH_MAX];
296 int anon;
297
298 if (cwd) {
299 int n = strcommon(filename);
300
301 if (n == cwdlen) {
302 snprintf(newfilename, sizeof(newfilename),
303 ".%s", filename + n);
304 filename = newfilename;
305 }
306 }
307
308 anon = is_anon_memory(filename);
309
310 if (anon) {
311 snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", event->pid);
312 filename = newfilename;
313 }
314
315 self->start = event->start;
316 self->end = event->start + event->len;
317 self->pgoff = event->pgoff;
318
319 self->dso = dsos__findnew(filename);
320 if (self->dso == NULL)
321 goto out_delete;
322
323 if (self->dso == vdso || anon)
324 self->map_ip = vdso__map_ip;
325 else
326 self->map_ip = map__map_ip;
327 }
328 return self;
329out_delete:
330 free(self);
331 return NULL;
332}
333
334static struct map *map__clone(struct map *self)
335{
336 struct map *map = malloc(sizeof(*self));
337
338 if (!map)
339 return NULL;
340
341 memcpy(map, self, sizeof(*self));
342
343 return map;
344}
345
346static int map__overlap(struct map *l, struct map *r)
347{
348 if (l->start > r->start) {
349 struct map *t = l;
350 l = r;
351 r = t;
352 }
353
354 if (l->end > r->start)
355 return 1;
356
357 return 0;
358}
359
360static size_t map__fprintf(struct map *self, FILE *fp)
361{
362 return fprintf(fp, " %Lx-%Lx %Lx %s\n",
363 self->start, self->end, self->pgoff, self->dso->name);
364}
365
366
367struct thread {
368 struct rb_node rb_node;
369 struct list_head maps;
370 pid_t pid;
371 char *comm;
372};
373
374static struct thread *thread__new(pid_t pid)
375{
376 struct thread *self = malloc(sizeof(*self));
377
378 if (self != NULL) {
379 self->pid = pid;
380 self->comm = malloc(32);
381 if (self->comm)
382 snprintf(self->comm, 32, ":%d", self->pid);
383 INIT_LIST_HEAD(&self->maps);
384 }
385
386 return self;
387}
388
389static unsigned int dsos__col_width, 111static unsigned int dsos__col_width,
390 comms__col_width, 112 comms__col_width,
391 threads__col_width; 113 threads__col_width;
392 114
393static int thread__set_comm(struct thread *self, const char *comm)
394{
395 if (self->comm)
396 free(self->comm);
397 self->comm = strdup(comm);
398 if (!self->comm)
399 return -ENOMEM;
400
401 if (!col_width_list_str && !field_sep &&
402 (!comm_list || strlist__has_entry(comm_list, comm))) {
403 unsigned int slen = strlen(comm);
404 if (slen > comms__col_width) {
405 comms__col_width = slen;
406 threads__col_width = slen + 6;
407 }
408 }
409
410 return 0;
411}
412
413static size_t thread__fprintf(struct thread *self, FILE *fp)
414{
415 struct map *pos;
416 size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
417
418 list_for_each_entry(pos, &self->maps, node)
419 ret += map__fprintf(pos, fp);
420
421 return ret;
422}
423
424
425static struct rb_root threads;
426static struct thread *last_match;
427
428static struct thread *threads__findnew(pid_t pid)
429{
430 struct rb_node **p = &threads.rb_node;
431 struct rb_node *parent = NULL;
432 struct thread *th;
433
434 /*
435 * Front-end cache - PID lookups come in blocks,
436 * so most of the time we don't have to look up
437 * the full rbtree:
438 */
439 if (last_match && last_match->pid == pid)
440 return last_match;
441
442 while (*p != NULL) {
443 parent = *p;
444 th = rb_entry(parent, struct thread, rb_node);
445
446 if (th->pid == pid) {
447 last_match = th;
448 return th;
449 }
450
451 if (pid < th->pid)
452 p = &(*p)->rb_left;
453 else
454 p = &(*p)->rb_right;
455 }
456
457 th = thread__new(pid);
458 if (th != NULL) {
459 rb_link_node(&th->rb_node, parent, p);
460 rb_insert_color(&th->rb_node, &threads);
461 last_match = th;
462 }
463
464 return th;
465}
466
467static void thread__insert_map(struct thread *self, struct map *map)
468{
469 struct map *pos, *tmp;
470
471 list_for_each_entry_safe(pos, tmp, &self->maps, node) {
472 if (map__overlap(pos, map)) {
473 if (verbose >= 2) {
474 printf("overlapping maps:\n");
475 map__fprintf(map, stdout);
476 map__fprintf(pos, stdout);
477 }
478
479 if (map->start <= pos->start && map->end > pos->start)
480 pos->start = map->end;
481
482 if (map->end >= pos->end && map->start < pos->end)
483 pos->end = map->start;
484
485 if (verbose >= 2) {
486 printf("after collision:\n");
487 map__fprintf(pos, stdout);
488 }
489
490 if (pos->start >= pos->end) {
491 list_del_init(&pos->node);
492 free(pos);
493 }
494 }
495 }
496
497 list_add_tail(&map->node, &self->maps);
498}
499
500static int thread__fork(struct thread *self, struct thread *parent)
501{
502 struct map *map;
503
504 if (self->comm)
505 free(self->comm);
506 self->comm = strdup(parent->comm);
507 if (!self->comm)
508 return -ENOMEM;
509
510 list_for_each_entry(map, &parent->maps, node) {
511 struct map *new = map__clone(map);
512 if (!new)
513 return -ENOMEM;
514 thread__insert_map(self, new);
515 }
516
517 return 0;
518}
519
520static struct map *thread__find_map(struct thread *self, u64 ip)
521{
522 struct map *pos;
523
524 if (self == NULL)
525 return NULL;
526
527 list_for_each_entry(pos, &self->maps, node)
528 if (ip >= pos->start && ip <= pos->end)
529 return pos;
530
531 return NULL;
532}
533
534static size_t threads__fprintf(FILE *fp)
535{
536 size_t ret = 0;
537 struct rb_node *nd;
538
539 for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
540 struct thread *pos = rb_entry(nd, struct thread, rb_node);
541
542 ret += thread__fprintf(pos, fp);
543 }
544
545 return ret;
546}
547
548/* 115/*
549 * histogram, sorted on item, collects counts 116 * histogram, sorted on item, collects counts
550 */ 117 */
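None of the dso, map and thread plumbing removed above disappears; it moves into the shared
util/ code so that report, annotate and top stop carrying private copies. The thread lookup
keeps its one-entry last-match cache (PID lookups arrive in bursts, so the rbtree walk is
usually skipped), but the tree and the cache slot are now passed in explicitly instead of
living in file-local statics. Reconstructed from the removed code and the new call sites;
the exact prototype lives in util/thread.h and may differ:

struct thread *threads__findnew(pid_t pid, struct rb_root *threads,
                                struct thread **last_match)
{
        struct rb_node **p = &threads->rb_node;
        struct rb_node *parent = NULL;
        struct thread *th;

        /* Front-end cache: consecutive events usually share a PID. */
        if (*last_match && (*last_match)->pid == pid)
                return *last_match;

        while (*p != NULL) {
                parent = *p;
                th = rb_entry(parent, struct thread, rb_node);

                if (th->pid == pid) {
                        *last_match = th;
                        return th;
                }

                if (pid < th->pid)
                        p = &(*p)->rb_left;
                else
                        p = &(*p)->rb_right;
        }

        th = thread__new(pid);
        if (th != NULL) {
                rb_link_node(&th->rb_node, parent, p);
                rb_insert_color(&th->rb_node, threads);
                *last_match = th;
        }
        return th;
}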
@@ -574,7 +141,7 @@ struct hist_entry {
574struct sort_entry { 141struct sort_entry {
575 struct list_head list; 142 struct list_head list;
576 143
577 char *header; 144 const char *header;
578 145
579 int64_t (*cmp)(struct hist_entry *, struct hist_entry *); 146 int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
580 int64_t (*collapse)(struct hist_entry *, struct hist_entry *); 147 int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
@@ -758,7 +325,7 @@ static int sort__need_collapse = 0;
758static int sort__has_parent = 0; 325static int sort__has_parent = 0;
759 326
760struct sort_dimension { 327struct sort_dimension {
761 char *name; 328 const char *name;
762 struct sort_entry *entry; 329 struct sort_entry *entry;
763 int taken; 330 int taken;
764}; 331};
@@ -773,7 +340,7 @@ static struct sort_dimension sort_dimensions[] = {
773 340
774static LIST_HEAD(hist_entry__sort_list); 341static LIST_HEAD(hist_entry__sort_list);
775 342
776static int sort_dimension__add(char *tok) 343static int sort_dimension__add(const char *tok)
777{ 344{
778 unsigned int i; 345 unsigned int i;
779 346
@@ -1032,6 +599,7 @@ hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self,
1032 case CHAIN_GRAPH_REL: 599 case CHAIN_GRAPH_REL:
1033 ret += callchain__fprintf_graph(fp, chain, 600 ret += callchain__fprintf_graph(fp, chain,
1034 total_samples, 1, 1); 601 total_samples, 1, 1);
602 case CHAIN_NONE:
1035 default: 603 default:
1036 break; 604 break;
1037 } 605 }
@@ -1098,6 +666,34 @@ static void dso__calc_col_width(struct dso *self)
1098 self->slen_calculated = 1; 666 self->slen_calculated = 1;
1099} 667}
1100 668
669static void thread__comm_adjust(struct thread *self)
670{
671 char *comm = self->comm;
672
673 if (!col_width_list_str && !field_sep &&
674 (!comm_list || strlist__has_entry(comm_list, comm))) {
675 unsigned int slen = strlen(comm);
676
677 if (slen > comms__col_width) {
678 comms__col_width = slen;
679 threads__col_width = slen + 6;
680 }
681 }
682}
683
684static int thread__set_comm_adjust(struct thread *self, const char *comm)
685{
686 int ret = thread__set_comm(self, comm);
687
688 if (ret)
689 return ret;
690
691 thread__comm_adjust(self);
692
693 return 0;
694}
695
696
1101static struct symbol * 697static struct symbol *
1102resolve_symbol(struct thread *thread, struct map **mapp, 698resolve_symbol(struct thread *thread, struct map **mapp,
1103 struct dso **dsop, u64 *ipp) 699 struct dso **dsop, u64 *ipp)
@@ -1141,8 +737,8 @@ got_map:
1141 if ((long long)ip < 0) 737 if ((long long)ip < 0)
1142 dso = kernel_dso; 738 dso = kernel_dso;
1143 } 739 }
1144 dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>"); 740 dump_printf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
1145 dprintf(" ...... map: %Lx -> %Lx\n", *ipp, ip); 741 dump_printf(" ...... map: %Lx -> %Lx\n", *ipp, ip);
1146 *ipp = ip; 742 *ipp = ip;
1147 743
1148 if (dsop) 744 if (dsop)
@@ -1398,6 +994,9 @@ static size_t output__fprintf(FILE *fp, u64 total_samples)
1398 size_t ret = 0; 994 size_t ret = 0;
1399 unsigned int width; 995 unsigned int width;
1400 char *col_width = col_width_list_str; 996 char *col_width = col_width_list_str;
997 int raw_printing_style;
998
999 raw_printing_style = !strcmp(pretty_printing_style, "raw");
1401 1000
1402 init_rem_hits(); 1001 init_rem_hits();
1403 1002
@@ -1474,18 +1073,11 @@ print_entries:
1474 1073
1475 free(rem_sq_bracket); 1074 free(rem_sq_bracket);
1476 1075
1477 return ret; 1076 if (show_threads)
1478} 1077 perf_read_values_display(fp, &show_threads_values,
1078 raw_printing_style);
1479 1079
1480static void register_idle_thread(void) 1080 return ret;
1481{
1482 struct thread *thread = threads__findnew(0);
1483
1484 if (thread == NULL ||
1485 thread__set_comm(thread, "[idle]")) {
1486 fprintf(stderr, "problem inserting idle task.\n");
1487 exit(-1);
1488 }
1489} 1081}
1490 1082
1491static unsigned long total = 0, 1083static unsigned long total = 0,
@@ -1514,7 +1106,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
1514 char level; 1106 char level;
1515 int show = 0; 1107 int show = 0;
1516 struct dso *dso = NULL; 1108 struct dso *dso = NULL;
1517 struct thread *thread = threads__findnew(event->ip.pid); 1109 struct thread *thread;
1518 u64 ip = event->ip.ip; 1110 u64 ip = event->ip.ip;
1519 u64 period = 1; 1111 u64 period = 1;
1520 struct map *map = NULL; 1112 struct map *map = NULL;
@@ -1522,12 +1114,14 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
1522 struct ip_callchain *chain = NULL; 1114 struct ip_callchain *chain = NULL;
1523 int cpumode; 1115 int cpumode;
1524 1116
1117 thread = threads__findnew(event->ip.pid, &threads, &last_match);
1118
1525 if (sample_type & PERF_SAMPLE_PERIOD) { 1119 if (sample_type & PERF_SAMPLE_PERIOD) {
1526 period = *(u64 *)more_data; 1120 period = *(u64 *)more_data;
1527 more_data += sizeof(u64); 1121 more_data += sizeof(u64);
1528 } 1122 }
1529 1123
1530 dprintf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", 1124 dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
1531 (void *)(offset + head), 1125 (void *)(offset + head),
1532 (void *)(long)(event->header.size), 1126 (void *)(long)(event->header.size),
1533 event->header.misc, 1127 event->header.misc,
@@ -1540,7 +1134,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
1540 1134
1541 chain = (void *)more_data; 1135 chain = (void *)more_data;
1542 1136
1543 dprintf("... chain: nr:%Lu\n", chain->nr); 1137 dump_printf("... chain: nr:%Lu\n", chain->nr);
1544 1138
1545 if (validate_chain(chain, event) < 0) { 1139 if (validate_chain(chain, event) < 0) {
1546 eprintf("call-chain problem with event, skipping it.\n"); 1140 eprintf("call-chain problem with event, skipping it.\n");
@@ -1549,11 +1143,11 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
1549 1143
1550 if (dump_trace) { 1144 if (dump_trace) {
1551 for (i = 0; i < chain->nr; i++) 1145 for (i = 0; i < chain->nr; i++)
1552 dprintf("..... %2d: %016Lx\n", i, chain->ips[i]); 1146 dump_printf("..... %2d: %016Lx\n", i, chain->ips[i]);
1553 } 1147 }
1554 } 1148 }
1555 1149
1556 dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid); 1150 dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
1557 1151
1558 if (thread == NULL) { 1152 if (thread == NULL) {
1559 eprintf("problem processing %d event, skipping it.\n", 1153 eprintf("problem processing %d event, skipping it.\n",
@@ -1572,7 +1166,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
1572 1166
1573 dso = kernel_dso; 1167 dso = kernel_dso;
1574 1168
1575 dprintf(" ...... dso: %s\n", dso->name); 1169 dump_printf(" ...... dso: %s\n", dso->name);
1576 1170
1577 } else if (cpumode == PERF_EVENT_MISC_USER) { 1171 } else if (cpumode == PERF_EVENT_MISC_USER) {
1578 1172
@@ -1585,7 +1179,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
1585 1179
1586 dso = hypervisor_dso; 1180 dso = hypervisor_dso;
1587 1181
1588 dprintf(" ...... dso: [hypervisor]\n"); 1182 dump_printf(" ...... dso: [hypervisor]\n");
1589 } 1183 }
1590 1184
1591 if (show & show_mask) { 1185 if (show & show_mask) {
@@ -1611,10 +1205,12 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
1611static int 1205static int
1612process_mmap_event(event_t *event, unsigned long offset, unsigned long head) 1206process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
1613{ 1207{
1614 struct thread *thread = threads__findnew(event->mmap.pid); 1208 struct thread *thread;
1615 struct map *map = map__new(&event->mmap); 1209 struct map *map = map__new(&event->mmap, cwd, cwdlen);
1616 1210
1617 dprintf("%p [%p]: PERF_EVENT_MMAP %d/%d: [%p(%p) @ %p]: %s\n", 1211 thread = threads__findnew(event->mmap.pid, &threads, &last_match);
1212
1213 dump_printf("%p [%p]: PERF_EVENT_MMAP %d/%d: [%p(%p) @ %p]: %s\n",
1618 (void *)(offset + head), 1214 (void *)(offset + head),
1619 (void *)(long)(event->header.size), 1215 (void *)(long)(event->header.size),
1620 event->mmap.pid, 1216 event->mmap.pid,
@@ -1625,7 +1221,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
1625 event->mmap.filename); 1221 event->mmap.filename);
1626 1222
1627 if (thread == NULL || map == NULL) { 1223 if (thread == NULL || map == NULL) {
1628 dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n"); 1224 dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n");
1629 return 0; 1225 return 0;
1630 } 1226 }
1631 1227
@@ -1638,16 +1234,18 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
1638static int 1234static int
1639process_comm_event(event_t *event, unsigned long offset, unsigned long head) 1235process_comm_event(event_t *event, unsigned long offset, unsigned long head)
1640{ 1236{
1641 struct thread *thread = threads__findnew(event->comm.pid); 1237 struct thread *thread;
1238
1239 thread = threads__findnew(event->comm.pid, &threads, &last_match);
1642 1240
1643 dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", 1241 dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
1644 (void *)(offset + head), 1242 (void *)(offset + head),
1645 (void *)(long)(event->header.size), 1243 (void *)(long)(event->header.size),
1646 event->comm.comm, event->comm.pid); 1244 event->comm.comm, event->comm.pid);
1647 1245
1648 if (thread == NULL || 1246 if (thread == NULL ||
1649 thread__set_comm(thread, event->comm.comm)) { 1247 thread__set_comm_adjust(thread, event->comm.comm)) {
1650 dprintf("problem processing PERF_EVENT_COMM, skipping event.\n"); 1248 dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
1651 return -1; 1249 return -1;
1652 } 1250 }
1653 total_comm++; 1251 total_comm++;
@@ -1658,10 +1256,13 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
1658static int 1256static int
1659process_task_event(event_t *event, unsigned long offset, unsigned long head) 1257process_task_event(event_t *event, unsigned long offset, unsigned long head)
1660{ 1258{
1661 struct thread *thread = threads__findnew(event->fork.pid); 1259 struct thread *thread;
1662 struct thread *parent = threads__findnew(event->fork.ppid); 1260 struct thread *parent;
1663 1261
1664 dprintf("%p [%p]: PERF_EVENT_%s: (%d:%d):(%d:%d)\n", 1262 thread = threads__findnew(event->fork.pid, &threads, &last_match);
1263 parent = threads__findnew(event->fork.ppid, &threads, &last_match);
1264
1265 dump_printf("%p [%p]: PERF_EVENT_%s: (%d:%d):(%d:%d)\n",
1665 (void *)(offset + head), 1266 (void *)(offset + head),
1666 (void *)(long)(event->header.size), 1267 (void *)(long)(event->header.size),
1667 event->header.type == PERF_EVENT_FORK ? "FORK" : "EXIT", 1268 event->header.type == PERF_EVENT_FORK ? "FORK" : "EXIT",
@@ -1679,7 +1280,7 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head)
1679 return 0; 1280 return 0;
1680 1281
1681 if (!thread || !parent || thread__fork(thread, parent)) { 1282 if (!thread || !parent || thread__fork(thread, parent)) {
1682 dprintf("problem processing PERF_EVENT_FORK, skipping event.\n"); 1283 dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n");
1683 return -1; 1284 return -1;
1684 } 1285 }
1685 total_fork++; 1286 total_fork++;
@@ -1690,7 +1291,7 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head)
1690static int 1291static int
1691process_lost_event(event_t *event, unsigned long offset, unsigned long head) 1292process_lost_event(event_t *event, unsigned long offset, unsigned long head)
1692{ 1293{
1693 dprintf("%p [%p]: PERF_EVENT_LOST: id:%Ld: lost:%Ld\n", 1294 dump_printf("%p [%p]: PERF_EVENT_LOST: id:%Ld: lost:%Ld\n",
1694 (void *)(offset + head), 1295 (void *)(offset + head),
1695 (void *)(long)(event->header.size), 1296 (void *)(long)(event->header.size),
1696 event->lost.id, 1297 event->lost.id,
@@ -1701,67 +1302,24 @@ process_lost_event(event_t *event, unsigned long offset, unsigned long head)
1701 return 0; 1302 return 0;
1702} 1303}
1703 1304
1704static void trace_event(event_t *event) 1305static int
1705{ 1306process_read_event(event_t *event, unsigned long offset, unsigned long head)
1706 unsigned char *raw_event = (void *)event;
1707 char *color = PERF_COLOR_BLUE;
1708 int i, j;
1709
1710 if (!dump_trace)
1711 return;
1712
1713 dprintf(".");
1714 cdprintf("\n. ... raw event: size %d bytes\n", event->header.size);
1715
1716 for (i = 0; i < event->header.size; i++) {
1717 if ((i & 15) == 0) {
1718 dprintf(".");
1719 cdprintf(" %04x: ", i);
1720 }
1721
1722 cdprintf(" %02x", raw_event[i]);
1723
1724 if (((i & 15) == 15) || i == event->header.size-1) {
1725 cdprintf(" ");
1726 for (j = 0; j < 15-(i & 15); j++)
1727 cdprintf(" ");
1728 for (j = 0; j < (i & 15); j++) {
1729 if (isprint(raw_event[i-15+j]))
1730 cdprintf("%c", raw_event[i-15+j]);
1731 else
1732 cdprintf(".");
1733 }
1734 cdprintf("\n");
1735 }
1736 }
1737 dprintf(".\n");
1738}
1739
1740static struct perf_header *header;
1741
1742static struct perf_counter_attr *perf_header__find_attr(u64 id)
1743{ 1307{
1744 int i; 1308 struct perf_counter_attr *attr;
1745 1309
1746 for (i = 0; i < header->attrs; i++) { 1310 attr = perf_header__find_attr(event->read.id, header);
1747 struct perf_header_attr *attr = header->attr[i];
1748 int j;
1749 1311
1750 for (j = 0; j < attr->ids; j++) { 1312 if (show_threads) {
1751 if (attr->id[j] == id) 1313 const char *name = attr ? __event_name(attr->type, attr->config)
1752 return &attr->attr; 1314 : "unknown";
1753 } 1315 perf_read_values_add_value(&show_threads_values,
1316 event->read.pid, event->read.tid,
1317 event->read.id,
1318 name,
1319 event->read.value);
1754 } 1320 }
1755 1321
1756 return NULL; 1322 dump_printf("%p [%p]: PERF_EVENT_READ: %d %d %s %Lu\n",
1757}
1758
1759static int
1760process_read_event(event_t *event, unsigned long offset, unsigned long head)
1761{
1762 struct perf_counter_attr *attr = perf_header__find_attr(event->read.id);
1763
1764 dprintf("%p [%p]: PERF_EVENT_READ: %d %d %s %Lu\n",
1765 (void *)(offset + head), 1323 (void *)(offset + head),
1766 (void *)(long)(event->header.size), 1324 (void *)(long)(event->header.size),
1767 event->read.pid, 1325 event->read.pid,
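With the new -T/--threads option every PERF_EVENT_READ record is folded into a per-thread,
per-counter accumulator and printed after the normal report. The lifecycle, pieced together
from the calls visible in this diff (a sketch of the call sequence, not a complete function):

static struct perf_read_values show_threads_values;

/* once, before walking perf.data */
perf_read_values_init(&show_threads_values);

/* for every PERF_EVENT_READ record */
perf_read_values_add_value(&show_threads_values,
                           event->read.pid, event->read.tid,
                           event->read.id,
                           attr ? __event_name(attr->type, attr->config)
                                : "unknown",
                           event->read.value);

/* after output__fprintf(), then teardown */
perf_read_values_display(fp, &show_threads_values, raw_printing_style);
perf_read_values_destroy(&show_threads_values);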
@@ -1813,34 +1371,22 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
1813 return 0; 1371 return 0;
1814} 1372}
1815 1373
1816static u64 perf_header__sample_type(void)
1817{
1818 u64 sample_type = 0;
1819 int i;
1820
1821 for (i = 0; i < header->attrs; i++) {
1822 struct perf_header_attr *attr = header->attr[i];
1823
1824 if (!sample_type)
1825 sample_type = attr->attr.sample_type;
1826 else if (sample_type != attr->attr.sample_type)
1827 die("non matching sample_type");
1828 }
1829
1830 return sample_type;
1831}
1832
1833static int __cmd_report(void) 1374static int __cmd_report(void)
1834{ 1375{
1835 int ret, rc = EXIT_FAILURE; 1376 int ret, rc = EXIT_FAILURE;
1836 unsigned long offset = 0; 1377 unsigned long offset = 0;
1837 unsigned long head, shift; 1378 unsigned long head, shift;
1838 struct stat stat; 1379 struct stat input_stat;
1380 struct thread *idle;
1839 event_t *event; 1381 event_t *event;
1840 uint32_t size; 1382 uint32_t size;
1841 char *buf; 1383 char *buf;
1842 1384
1843 register_idle_thread(); 1385 idle = register_idle_thread(&threads, &last_match);
1386 thread__comm_adjust(idle);
1387
1388 if (show_threads)
1389 perf_read_values_init(&show_threads_values);
1844 1390
1845 input = open(input_name, O_RDONLY); 1391 input = open(input_name, O_RDONLY);
1846 if (input < 0) { 1392 if (input < 0) {
@@ -1851,18 +1397,18 @@ static int __cmd_report(void)
1851 exit(-1); 1397 exit(-1);
1852 } 1398 }
1853 1399
1854 ret = fstat(input, &stat); 1400 ret = fstat(input, &input_stat);
1855 if (ret < 0) { 1401 if (ret < 0) {
1856 perror("failed to stat file"); 1402 perror("failed to stat file");
1857 exit(-1); 1403 exit(-1);
1858 } 1404 }
1859 1405
1860 if (!force && (stat.st_uid != geteuid())) { 1406 if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) {
1861 fprintf(stderr, "file: %s not owned by current user\n", input_name); 1407 fprintf(stderr, "file: %s not owned by current user or root\n", input_name);
1862 exit(-1); 1408 exit(-1);
1863 } 1409 }
1864 1410
1865 if (!stat.st_size) { 1411 if (!input_stat.st_size) {
1866 fprintf(stderr, "zero-sized file, nothing to do!\n"); 1412 fprintf(stderr, "zero-sized file, nothing to do!\n");
1867 exit(0); 1413 exit(0);
1868 } 1414 }
@@ -1870,7 +1416,7 @@ static int __cmd_report(void)
1870 header = perf_header__read(input); 1416 header = perf_header__read(input);
1871 head = header->data_offset; 1417 head = header->data_offset;
1872 1418
1873 sample_type = perf_header__sample_type(); 1419 sample_type = perf_header__sample_type(header);
1874 1420
1875 if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) { 1421 if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) {
1876 if (sort__has_parent) { 1422 if (sort__has_parent) {
@@ -1880,7 +1426,7 @@ static int __cmd_report(void)
1880 exit(-1); 1426 exit(-1);
1881 } 1427 }
1882 if (callchain) { 1428 if (callchain) {
1883 fprintf(stderr, "selected -c but no callchain data." 1429 fprintf(stderr, "selected -g but no callchain data."
1884 " Did you call perf record without" 1430 " Did you call perf record without"
1885 " -g?\n"); 1431 " -g?\n");
1886 exit(-1); 1432 exit(-1);
@@ -1930,12 +1476,12 @@ more:
1930 size = 8; 1476 size = 8;
1931 1477
1932 if (head + event->header.size >= page_size * mmap_window) { 1478 if (head + event->header.size >= page_size * mmap_window) {
1933 int ret; 1479 int munmap_ret;
1934 1480
1935 shift = page_size * (head / page_size); 1481 shift = page_size * (head / page_size);
1936 1482
1937 ret = munmap(buf, page_size * mmap_window); 1483 munmap_ret = munmap(buf, page_size * mmap_window);
1938 assert(ret == 0); 1484 assert(munmap_ret == 0);
1939 1485
1940 offset += shift; 1486 offset += shift;
1941 head -= shift; 1487 head -= shift;
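Both __cmd_report() and __cmd_annotate() walk perf.data through a sliding window of
mmap_window pages: once the next event would cross the end of the window, the mapping is
advanced by whole pages and the in-window cursor is pulled back by the same amount. A
stand-alone sketch of that step; the re-mmap at the end is an assumption, since the remap
label itself sits outside the quoted hunks:

#include <assert.h>
#include <sys/mman.h>

static void *slide_window(void *buf, int fd, unsigned long *offset,
                          unsigned long *head, unsigned long page_size,
                          unsigned long mmap_window)
{
        /* shift by whole pages so the new mapping stays page-aligned */
        unsigned long shift = page_size * (*head / page_size);
        int munmap_ret;

        munmap_ret = munmap(buf, page_size * mmap_window);
        assert(munmap_ret == 0);

        *offset += shift;       /* window starts this much later in the file */
        *head   -= shift;       /* cursor keeps pointing at the same event   */

        return mmap(NULL, page_size * mmap_window, PROT_READ,
                    MAP_SHARED, fd, (off_t)*offset);
}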
@@ -1944,14 +1490,14 @@ more:
1944 1490
1945 size = event->header.size; 1491 size = event->header.size;
1946 1492
1947 dprintf("\n%p [%p]: event: %d\n", 1493 dump_printf("\n%p [%p]: event: %d\n",
1948 (void *)(offset + head), 1494 (void *)(offset + head),
1949 (void *)(long)event->header.size, 1495 (void *)(long)event->header.size,
1950 event->header.type); 1496 event->header.type);
1951 1497
1952 if (!size || process_event(event, offset, head) < 0) { 1498 if (!size || process_event(event, offset, head) < 0) {
1953 1499
1954 dprintf("%p [%p]: skipping unknown header type: %d\n", 1500 dump_printf("%p [%p]: skipping unknown header type: %d\n",
1955 (void *)(offset + head), 1501 (void *)(offset + head),
1956 (void *)(long)(event->header.size), 1502 (void *)(long)(event->header.size),
1957 event->header.type); 1503 event->header.type);
@@ -1974,25 +1520,25 @@ more:
1974 if (offset + head >= header->data_offset + header->data_size) 1520 if (offset + head >= header->data_offset + header->data_size)
1975 goto done; 1521 goto done;
1976 1522
1977 if (offset + head < (unsigned long)stat.st_size) 1523 if (offset + head < (unsigned long)input_stat.st_size)
1978 goto more; 1524 goto more;
1979 1525
1980done: 1526done:
1981 rc = EXIT_SUCCESS; 1527 rc = EXIT_SUCCESS;
1982 close(input); 1528 close(input);
1983 1529
1984 dprintf(" IP events: %10ld\n", total); 1530 dump_printf(" IP events: %10ld\n", total);
1985 dprintf(" mmap events: %10ld\n", total_mmap); 1531 dump_printf(" mmap events: %10ld\n", total_mmap);
1986 dprintf(" comm events: %10ld\n", total_comm); 1532 dump_printf(" comm events: %10ld\n", total_comm);
1987 dprintf(" fork events: %10ld\n", total_fork); 1533 dump_printf(" fork events: %10ld\n", total_fork);
1988 dprintf(" lost events: %10ld\n", total_lost); 1534 dump_printf(" lost events: %10ld\n", total_lost);
1989 dprintf(" unknown events: %10ld\n", total_unknown); 1535 dump_printf(" unknown events: %10ld\n", total_unknown);
1990 1536
1991 if (dump_trace) 1537 if (dump_trace)
1992 return 0; 1538 return 0;
1993 1539
1994 if (verbose >= 3) 1540 if (verbose >= 3)
1995 threads__fprintf(stdout); 1541 threads__fprintf(stdout, &threads);
1996 1542
1997 if (verbose >= 2) 1543 if (verbose >= 2)
1998 dsos__fprintf(stdout); 1544 dsos__fprintf(stdout);
@@ -2001,6 +1547,9 @@ done:
2001 output__resort(total); 1547 output__resort(total);
2002 output__fprintf(stdout, total); 1548 output__fprintf(stdout, total);
2003 1549
1550 if (show_threads)
1551 perf_read_values_destroy(&show_threads_values);
1552
2004 return rc; 1553 return rc;
2005} 1554}
2006 1555
@@ -2069,12 +1618,16 @@ static const struct option options[] = {
2069 "be more verbose (show symbol address, etc)"), 1618 "be more verbose (show symbol address, etc)"),
2070 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, 1619 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
2071 "dump raw trace in ASCII"), 1620 "dump raw trace in ASCII"),
2072 OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"), 1621 OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
2073 OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), 1622 OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
2074 OPT_BOOLEAN('m', "modules", &modules, 1623 OPT_BOOLEAN('m', "modules", &modules,
2075 "load module symbols - WARNING: use only with -k and LIVE kernel"), 1624 "load module symbols - WARNING: use only with -k and LIVE kernel"),
2076 OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples, 1625 OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples,
2077 "Show a column with the number of samples"), 1626 "Show a column with the number of samples"),
1627 OPT_BOOLEAN('T', "threads", &show_threads,
1628 "Show per-thread event counters"),
1629 OPT_STRING(0, "pretty", &pretty_printing_style, "key",
1630 "pretty printing style key: normal raw"),
2078 OPT_STRING('s', "sort", &sort_order, "key[,key2...]", 1631 OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
2079 "sort by key(s): pid, comm, dso, symbol, parent"), 1632 "sort by key(s): pid, comm, dso, symbol, parent"),
2080 OPT_BOOLEAN('P', "full-paths", &full_paths, 1633 OPT_BOOLEAN('P', "full-paths", &full_paths,
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index b4b06c7903e1..61b828236c11 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -42,6 +42,8 @@
42#include "util/util.h" 42#include "util/util.h"
43#include "util/parse-options.h" 43#include "util/parse-options.h"
44#include "util/parse-events.h" 44#include "util/parse-events.h"
45#include "util/event.h"
46#include "util/debug.h"
45 47
46#include <sys/prctl.h> 48#include <sys/prctl.h>
47#include <math.h> 49#include <math.h>
@@ -60,10 +62,7 @@ static struct perf_counter_attr default_attrs[] = {
60 62
61}; 63};
62 64
63#define MAX_RUN 100
64
65static int system_wide = 0; 65static int system_wide = 0;
66static int verbose = 0;
67static unsigned int nr_cpus = 0; 66static unsigned int nr_cpus = 0;
68static int run_idx = 0; 67static int run_idx = 0;
69 68
@@ -75,26 +74,56 @@ static int null_run = 0;
75 74
76static int fd[MAX_NR_CPUS][MAX_COUNTERS]; 75static int fd[MAX_NR_CPUS][MAX_COUNTERS];
77 76
78static u64 runtime_nsecs[MAX_RUN]; 77static int event_scaled[MAX_COUNTERS];
79static u64 walltime_nsecs[MAX_RUN];
80static u64 runtime_cycles[MAX_RUN];
81 78
82static u64 event_res[MAX_RUN][MAX_COUNTERS][3]; 79struct stats
83static u64 event_scaled[MAX_RUN][MAX_COUNTERS]; 80{
81 double n, mean, M2;
82};
84 83
85static u64 event_res_avg[MAX_COUNTERS][3]; 84static void update_stats(struct stats *stats, u64 val)
86static u64 event_res_noise[MAX_COUNTERS][3]; 85{
86 double delta;
87 87
88static u64 event_scaled_avg[MAX_COUNTERS]; 88 stats->n++;
89 delta = val - stats->mean;
90 stats->mean += delta / stats->n;
91 stats->M2 += delta*(val - stats->mean);
92}
89 93
90static u64 runtime_nsecs_avg; 94static double avg_stats(struct stats *stats)
91static u64 runtime_nsecs_noise; 95{
96 return stats->mean;
97}
92 98
93static u64 walltime_nsecs_avg; 99/*
94static u64 walltime_nsecs_noise; 100 * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
101 *
102 * (\Sum n_i^2) - ((\Sum n_i)^2)/n
103 * s^2 = -------------------------------
104 * n - 1
105 *
106 * http://en.wikipedia.org/wiki/Stddev
107 *
108 * The std dev of the mean is related to the std dev by:
109 *
110 * s
111 * s_mean = -------
112 * sqrt(n)
113 *
114 */
115static double stddev_stats(struct stats *stats)
116{
117 double variance = stats->M2 / (stats->n - 1);
118 double variance_mean = variance / stats->n;
119
120 return sqrt(variance_mean);
121}
95 122
96static u64 runtime_cycles_avg; 123struct stats event_res_stats[MAX_COUNTERS][3];
97static u64 runtime_cycles_noise; 124struct stats runtime_nsecs_stats;
125struct stats walltime_nsecs_stats;
126struct stats runtime_cycles_stats;
98 127
99#define MATCH_EVENT(t, c, counter) \ 128#define MATCH_EVENT(t, c, counter) \
100 (attrs[counter].type == PERF_TYPE_##t && \ 129 (attrs[counter].type == PERF_TYPE_##t && \
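update_stats() above is Welford's online algorithm: it keeps a running mean and a running
sum of squared deviations (M2), so perf stat no longer needs the fixed MAX_RUN arrays and
run_count is no longer capped (see the cmd_stat hunk below). stddev_stats() then reports the
standard deviation of the mean, s/sqrt(n), which is what the "( +- x% )" noise column is
based on. A self-contained check of the recurrence; the input values are made up for
illustration, the tool feeds in u64 counter and time readings:

#include <math.h>
#include <stdio.h>

struct stats { double n, mean, M2; };

static void update_stats(struct stats *stats, double val)
{
        double delta;

        stats->n++;
        delta = val - stats->mean;
        stats->mean += delta / stats->n;           /* running mean             */
        stats->M2 += delta * (val - stats->mean);  /* sum of squared residuals */
}

int main(void)
{
        double runs[] = { 1.00e9, 1.02e9, 0.98e9, 1.01e9, 0.99e9 };
        struct stats st = { 0, 0, 0 };
        double variance, err_of_mean;
        unsigned int i;

        for (i = 0; i < sizeof(runs) / sizeof(runs[0]); i++)
                update_stats(&st, runs[i]);

        variance = st.M2 / (st.n - 1);             /* sample variance   */
        err_of_mean = sqrt(variance / st.n);       /* stddev of the mean */

        printf("mean %.0f ns ( +- %.3f%% )\n",
               st.mean, 100.0 * err_of_mean / st.mean);
        return 0;
}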
@@ -149,12 +178,11 @@ static inline int nsec_counter(int counter)
149 */ 178 */
150static void read_counter(int counter) 179static void read_counter(int counter)
151{ 180{
152 u64 *count, single_count[3]; 181 u64 count[3], single_count[3];
153 unsigned int cpu; 182 unsigned int cpu;
154 size_t res, nv; 183 size_t res, nv;
155 int scaled; 184 int scaled;
156 185 int i;
157 count = event_res[run_idx][counter];
158 186
159 count[0] = count[1] = count[2] = 0; 187 count[0] = count[1] = count[2] = 0;
160 188
@@ -179,24 +207,33 @@ static void read_counter(int counter)
179 scaled = 0; 207 scaled = 0;
180 if (scale) { 208 if (scale) {
181 if (count[2] == 0) { 209 if (count[2] == 0) {
182 event_scaled[run_idx][counter] = -1; 210 event_scaled[counter] = -1;
183 count[0] = 0; 211 count[0] = 0;
184 return; 212 return;
185 } 213 }
186 214
187 if (count[2] < count[1]) { 215 if (count[2] < count[1]) {
188 event_scaled[run_idx][counter] = 1; 216 event_scaled[counter] = 1;
189 count[0] = (unsigned long long) 217 count[0] = (unsigned long long)
190 ((double)count[0] * count[1] / count[2] + 0.5); 218 ((double)count[0] * count[1] / count[2] + 0.5);
191 } 219 }
192 } 220 }
221
222 for (i = 0; i < 3; i++)
223 update_stats(&event_res_stats[counter][i], count[i]);
224
225 if (verbose) {
226 fprintf(stderr, "%s: %Ld %Ld %Ld\n", event_name(counter),
227 count[0], count[1], count[2]);
228 }
229
193 /* 230 /*
194 * Save the full runtime - to allow normalization during printout: 231 * Save the full runtime - to allow normalization during printout:
195 */ 232 */
196 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) 233 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
197 runtime_nsecs[run_idx] = count[0]; 234 update_stats(&runtime_nsecs_stats, count[0]);
198 if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter)) 235 if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
199 runtime_cycles[run_idx] = count[0]; 236 update_stats(&runtime_cycles_stats, count[0]);
200} 237}
201 238
202static int run_perf_stat(int argc __used, const char **argv) 239static int run_perf_stat(int argc __used, const char **argv)
@@ -270,7 +307,7 @@ static int run_perf_stat(int argc __used, const char **argv)
270 307
271 t1 = rdclock(); 308 t1 = rdclock();
272 309
273 walltime_nsecs[run_idx] = t1 - t0; 310 update_stats(&walltime_nsecs_stats, t1 - t0);
274 311
275 for (counter = 0; counter < nr_counters; counter++) 312 for (counter = 0; counter < nr_counters; counter++)
276 read_counter(counter); 313 read_counter(counter);
@@ -278,42 +315,38 @@ static int run_perf_stat(int argc __used, const char **argv)
278 return WEXITSTATUS(status); 315 return WEXITSTATUS(status);
279} 316}
280 317
281static void print_noise(u64 *count, u64 *noise) 318static void print_noise(int counter, double avg)
282{ 319{
283 if (run_count > 1) 320 if (run_count == 1)
284 fprintf(stderr, " ( +- %7.3f%% )", 321 return;
285 (double)noise[0]/(count[0]+1)*100.0); 322
323 fprintf(stderr, " ( +- %7.3f%% )",
324 100 * stddev_stats(&event_res_stats[counter][0]) / avg);
286} 325}
287 326
288static void nsec_printout(int counter, u64 *count, u64 *noise) 327static void nsec_printout(int counter, double avg)
289{ 328{
290 double msecs = (double)count[0] / 1000000; 329 double msecs = avg / 1e6;
291 330
292 fprintf(stderr, " %14.6f %-24s", msecs, event_name(counter)); 331 fprintf(stderr, " %14.6f %-24s", msecs, event_name(counter));
293 332
294 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) { 333 if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
295 if (walltime_nsecs_avg) 334 fprintf(stderr, " # %10.3f CPUs ",
296 fprintf(stderr, " # %10.3f CPUs ", 335 avg / avg_stats(&walltime_nsecs_stats));
297 (double)count[0] / (double)walltime_nsecs_avg);
298 } 336 }
299 print_noise(count, noise);
300} 337}
301 338
302static void abs_printout(int counter, u64 *count, u64 *noise) 339static void abs_printout(int counter, double avg)
303{ 340{
304 fprintf(stderr, " %14Ld %-24s", count[0], event_name(counter)); 341 fprintf(stderr, " %14.0f %-24s", avg, event_name(counter));
305 342
306 if (runtime_cycles_avg && 343 if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
307 MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
308 fprintf(stderr, " # %10.3f IPC ", 344 fprintf(stderr, " # %10.3f IPC ",
309 (double)count[0] / (double)runtime_cycles_avg); 345 avg / avg_stats(&runtime_cycles_stats));
310 } else { 346 } else {
311 if (runtime_nsecs_avg) { 347 fprintf(stderr, " # %10.3f M/sec",
312 fprintf(stderr, " # %10.3f M/sec", 348 1000.0 * avg / avg_stats(&runtime_nsecs_stats));
313 (double)count[0]/runtime_nsecs_avg*1000.0);
314 }
315 } 349 }
316 print_noise(count, noise);
317} 350}
318 351
319/* 352/*
@@ -321,12 +354,8 @@ static void abs_printout(int counter, u64 *count, u64 *noise)
321 */ 354 */
322static void print_counter(int counter) 355static void print_counter(int counter)
323{ 356{
324 u64 *count, *noise; 357 double avg = avg_stats(&event_res_stats[counter][0]);
325 int scaled; 358 int scaled = event_scaled[counter];
326
327 count = event_res_avg[counter];
328 noise = event_res_noise[counter];
329 scaled = event_scaled_avg[counter];
330 359
331 if (scaled == -1) { 360 if (scaled == -1) {
332 fprintf(stderr, " %14s %-24s\n", 361 fprintf(stderr, " %14s %-24s\n",
@@ -335,110 +364,29 @@ static void print_counter(int counter)
335 } 364 }
336 365
337 if (nsec_counter(counter)) 366 if (nsec_counter(counter))
338 nsec_printout(counter, count, noise); 367 nsec_printout(counter, avg);
339 else 368 else
340 abs_printout(counter, count, noise); 369 abs_printout(counter, avg);
341
342 if (scaled)
343 fprintf(stderr, " (scaled from %.2f%%)",
344 (double) count[2] / count[1] * 100);
345
346 fprintf(stderr, "\n");
347}
348 370
349/* 371 print_noise(counter, avg);
350 * normalize_noise noise values down to stddev:
351 */
352static void normalize_noise(u64 *val)
353{
354 double res;
355 372
356 res = (double)*val / (run_count * sqrt((double)run_count)); 373 if (scaled) {
374 double avg_enabled, avg_running;
357 375
358 *val = (u64)res; 376 avg_enabled = avg_stats(&event_res_stats[counter][1]);
359} 377 avg_running = avg_stats(&event_res_stats[counter][2]);
360 378
361static void update_avg(const char *name, int idx, u64 *avg, u64 *val) 379 fprintf(stderr, " (scaled from %.2f%%)",
362{ 380 100 * avg_running / avg_enabled);
363 *avg += *val;
364
365 if (verbose > 1)
366 fprintf(stderr, "debug: %20s[%d]: %Ld\n", name, idx, *val);
367}
368/*
369 * Calculate the averages and noises:
370 */
371static void calc_avg(void)
372{
373 int i, j;
374
375 if (verbose > 1)
376 fprintf(stderr, "\n");
377
378 for (i = 0; i < run_count; i++) {
379 update_avg("runtime", 0, &runtime_nsecs_avg, runtime_nsecs + i);
380 update_avg("walltime", 0, &walltime_nsecs_avg, walltime_nsecs + i);
381 update_avg("runtime_cycles", 0, &runtime_cycles_avg, runtime_cycles + i);
382
383 for (j = 0; j < nr_counters; j++) {
384 update_avg("counter/0", j,
385 event_res_avg[j]+0, event_res[i][j]+0);
386 update_avg("counter/1", j,
387 event_res_avg[j]+1, event_res[i][j]+1);
388 update_avg("counter/2", j,
389 event_res_avg[j]+2, event_res[i][j]+2);
390 if (event_scaled[i][j] != (u64)-1)
391 update_avg("scaled", j,
392 event_scaled_avg + j, event_scaled[i]+j);
393 else
394 event_scaled_avg[j] = -1;
395 }
396 }
397 runtime_nsecs_avg /= run_count;
398 walltime_nsecs_avg /= run_count;
399 runtime_cycles_avg /= run_count;
400
401 for (j = 0; j < nr_counters; j++) {
402 event_res_avg[j][0] /= run_count;
403 event_res_avg[j][1] /= run_count;
404 event_res_avg[j][2] /= run_count;
405 }
406
407 for (i = 0; i < run_count; i++) {
408 runtime_nsecs_noise +=
409 abs((s64)(runtime_nsecs[i] - runtime_nsecs_avg));
410 walltime_nsecs_noise +=
411 abs((s64)(walltime_nsecs[i] - walltime_nsecs_avg));
412 runtime_cycles_noise +=
413 abs((s64)(runtime_cycles[i] - runtime_cycles_avg));
414
415 for (j = 0; j < nr_counters; j++) {
416 event_res_noise[j][0] +=
417 abs((s64)(event_res[i][j][0] - event_res_avg[j][0]));
418 event_res_noise[j][1] +=
419 abs((s64)(event_res[i][j][1] - event_res_avg[j][1]));
420 event_res_noise[j][2] +=
421 abs((s64)(event_res[i][j][2] - event_res_avg[j][2]));
422 }
423 } 381 }
424 382
425 normalize_noise(&runtime_nsecs_noise); 383 fprintf(stderr, "\n");
426 normalize_noise(&walltime_nsecs_noise);
427 normalize_noise(&runtime_cycles_noise);
428
429 for (j = 0; j < nr_counters; j++) {
430 normalize_noise(&event_res_noise[j][0]);
431 normalize_noise(&event_res_noise[j][1]);
432 normalize_noise(&event_res_noise[j][2]);
433 }
434} 384}
435 385
436static void print_stat(int argc, const char **argv) 386static void print_stat(int argc, const char **argv)
437{ 387{
438 int i, counter; 388 int i, counter;
439 389
440 calc_avg();
441
442 fflush(stdout); 390 fflush(stdout);
443 391
444 fprintf(stderr, "\n"); 392 fprintf(stderr, "\n");
@@ -457,10 +405,11 @@ static void print_stat(int argc, const char **argv)
457 405
458 fprintf(stderr, "\n"); 406 fprintf(stderr, "\n");
459 fprintf(stderr, " %14.9f seconds time elapsed", 407 fprintf(stderr, " %14.9f seconds time elapsed",
460 (double)walltime_nsecs_avg/1e9); 408 avg_stats(&walltime_nsecs_stats)/1e9);
461 if (run_count > 1) { 409 if (run_count > 1) {
462 fprintf(stderr, " ( +- %7.3f%% )", 410 fprintf(stderr, " ( +- %7.3f%% )",
463 100.0*(double)walltime_nsecs_noise/(double)walltime_nsecs_avg); 411 100*stddev_stats(&walltime_nsecs_stats) /
412 avg_stats(&walltime_nsecs_stats));
464 } 413 }
465 fprintf(stderr, "\n\n"); 414 fprintf(stderr, "\n\n");
466} 415}
@@ -515,7 +464,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
515 PARSE_OPT_STOP_AT_NON_OPTION); 464 PARSE_OPT_STOP_AT_NON_OPTION);
516 if (!argc) 465 if (!argc)
517 usage_with_options(stat_usage, options); 466 usage_with_options(stat_usage, options);
518 if (run_count <= 0 || run_count > MAX_RUN) 467 if (run_count <= 0)
519 usage_with_options(stat_usage, options); 468 usage_with_options(stat_usage, options);
520 469
521 /* Set attrs and nr_counters if no event is selected and !null_run */ 470 /* Set attrs and nr_counters if no event is selected and !null_run */
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 7de28ce9ca26..4002ccb36750 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -27,6 +27,8 @@
27#include "util/parse-options.h" 27#include "util/parse-options.h"
28#include "util/parse-events.h" 28#include "util/parse-events.h"
29 29
30#include "util/debug.h"
31
30#include <assert.h> 32#include <assert.h>
31#include <fcntl.h> 33#include <fcntl.h>
32 34
@@ -68,8 +70,6 @@ static int group = 0;
68static unsigned int page_size; 70static unsigned int page_size;
69static unsigned int mmap_pages = 16; 71static unsigned int mmap_pages = 16;
70static int freq = 0; 72static int freq = 0;
71static int verbose = 0;
72static char *vmlinux = NULL;
73 73
74static int delay_secs = 2; 74static int delay_secs = 2;
75static int zero; 75static int zero;
@@ -122,7 +122,8 @@ static void parse_source(struct sym_entry *syme)
122 struct module *module; 122 struct module *module;
123 struct section *section = NULL; 123 struct section *section = NULL;
124 FILE *file; 124 FILE *file;
125 char command[PATH_MAX*2], *path = vmlinux; 125 char command[PATH_MAX*2];
126 const char *path = vmlinux_name;
126 u64 start, end, len; 127 u64 start, end, len;
127 128
128 if (!syme) 129 if (!syme)
@@ -338,8 +339,6 @@ static void show_details(struct sym_entry *syme)
338 printf("%d lines not displayed, maybe increase display entries [e]\n", more); 339 printf("%d lines not displayed, maybe increase display entries [e]\n", more);
339} 340}
340 341
341struct dso *kernel_dso;
342
343/* 342/*
344 * Symbols will be added here in record_ip and will get out 343 * Symbols will be added here in record_ip and will get out
345 * after decayed. 344 * after decayed.
@@ -484,17 +483,24 @@ static void print_sym_table(void)
484 if (nr_counters == 1) 483 if (nr_counters == 1)
485 printf(" samples pcnt"); 484 printf(" samples pcnt");
486 else 485 else
487 printf(" weight samples pcnt"); 486 printf(" weight samples pcnt");
488 487
489 printf(" RIP kernel function\n" 488 if (verbose)
490 " ______ _______ _____ ________________ _______________\n\n" 489 printf(" RIP ");
491 ); 490 printf(" kernel function\n");
491 printf(" %s _______ _____",
492 nr_counters == 1 ? " " : "______");
493 if (verbose)
494 printf(" ________________");
495 printf(" _______________\n\n");
492 496
493 for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) { 497 for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
494 struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node); 498 struct symbol *sym;
495 struct symbol *sym = (struct symbol *)(syme + 1);
496 double pcnt; 499 double pcnt;
497 500
501 syme = rb_entry(nd, struct sym_entry, rb_node);
502 sym = (struct symbol *)(syme + 1);
503
498 if (++printed > print_entries || (int)syme->snap_count < count_filter) 504 if (++printed > print_entries || (int)syme->snap_count < count_filter)
499 continue; 505 continue;
500 506
@@ -507,7 +513,9 @@ static void print_sym_table(void)
507 printf("%9.1f %10ld - ", syme->weight, syme->snap_count); 513 printf("%9.1f %10ld - ", syme->weight, syme->snap_count);
508 514
509 percent_color_fprintf(stdout, "%4.1f%%", pcnt); 515 percent_color_fprintf(stdout, "%4.1f%%", pcnt);
510 printf(" - %016llx : %s", sym->start, sym->name); 516 if (verbose)
517 printf(" - %016llx", sym->start);
518 printf(" : %s", sym->name);
511 if (sym->module) 519 if (sym->module)
512 printf("\t[%s]", sym->module->name); 520 printf("\t[%s]", sym->module->name);
513 printf("\n"); 521 printf("\n");
@@ -613,7 +621,7 @@ static void print_mapped_keys(void)
613 621
614 fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", count_filter); 622 fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", count_filter);
615 623
616 if (vmlinux) { 624 if (vmlinux_name) {
617 fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter); 625 fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter);
618 fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL"); 626 fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL");
619 fprintf(stdout, "\t[S] stop annotation.\n"); 627 fprintf(stdout, "\t[S] stop annotation.\n");
@@ -642,7 +650,9 @@ static int key_mapped(int c)
642 case 'F': 650 case 'F':
643 case 's': 651 case 's':
644 case 'S': 652 case 'S':
645 return vmlinux ? 1 : 0; 653 return vmlinux_name ? 1 : 0;
654 default:
655 break;
646 } 656 }
647 657
648 return 0; 658 return 0;
@@ -728,6 +738,8 @@ static void handle_keypress(int c)
728 case 'z': 738 case 'z':
729 zero = ~zero; 739 zero = ~zero;
730 break; 740 break;
741 default:
742 break;
731 } 743 }
732} 744}
733 745
@@ -816,13 +828,13 @@ static int parse_symbols(void)
816{ 828{
817 struct rb_node *node; 829 struct rb_node *node;
818 struct symbol *sym; 830 struct symbol *sym;
819 int modules = vmlinux ? 1 : 0; 831 int use_modules = vmlinux_name ? 1 : 0;
820 832
821 kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry)); 833 kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry));
822 if (kernel_dso == NULL) 834 if (kernel_dso == NULL)
823 return -1; 835 return -1;
824 836
825 if (dso__load_kernel(kernel_dso, vmlinux, symbol_filter, verbose, modules) <= 0) 837 if (dso__load_kernel(kernel_dso, vmlinux_name, symbol_filter, verbose, use_modules) <= 0)
826 goto out_delete_dso; 838 goto out_delete_dso;
827 839
828 node = rb_first(&kernel_dso->syms); 840 node = rb_first(&kernel_dso->syms);
@@ -937,26 +949,6 @@ static void mmap_read_counter(struct mmap_data *md)
937 last_read = this_read; 949 last_read = this_read;
938 950
939 for (; old != head;) { 951 for (; old != head;) {
940 struct ip_event {
941 struct perf_event_header header;
942 u64 ip;
943 u32 pid, target_pid;
944 };
945 struct mmap_event {
946 struct perf_event_header header;
947 u32 pid, target_pid;
948 u64 start;
949 u64 len;
950 u64 pgoff;
951 char filename[PATH_MAX];
952 };
953
954 typedef union event_union {
955 struct perf_event_header header;
956 struct ip_event ip;
957 struct mmap_event mmap;
958 } event_t;
959
960 event_t *event = (event_t *)&data[old & md->mask]; 952 event_t *event = (event_t *)&data[old & md->mask];
961 953
962 event_t event_copy; 954 event_t event_copy;
@@ -1138,7 +1130,7 @@ static const struct option options[] = {
1138 "system-wide collection from all CPUs"), 1130 "system-wide collection from all CPUs"),
1139 OPT_INTEGER('C', "CPU", &profile_cpu, 1131 OPT_INTEGER('C', "CPU", &profile_cpu,
1140 "CPU to profile on"), 1132 "CPU to profile on"),
1141 OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"), 1133 OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
1142 OPT_INTEGER('m', "mmap-pages", &mmap_pages, 1134 OPT_INTEGER('m', "mmap-pages", &mmap_pages,
1143 "number of mmap data pages"), 1135 "number of mmap data pages"),
1144 OPT_INTEGER('r', "realtime", &realtime_prio, 1136 OPT_INTEGER('r', "realtime", &realtime_prio,
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
new file mode 100644
index 000000000000..914ab366e369
--- /dev/null
+++ b/tools/perf/builtin-trace.c
@@ -0,0 +1,297 @@
1#include "builtin.h"
2
3#include "util/util.h"
4#include "util/cache.h"
5#include "util/symbol.h"
6#include "util/thread.h"
7#include "util/header.h"
8
9#include "util/parse-options.h"
10
11#include "perf.h"
12#include "util/debug.h"
13
14#include "util/trace-event.h"
15
16static char const *input_name = "perf.data";
17static int input;
18static unsigned long page_size;
19static unsigned long mmap_window = 32;
20
21static unsigned long total = 0;
22static unsigned long total_comm = 0;
23
24static struct rb_root threads;
25static struct thread *last_match;
26
27static struct perf_header *header;
28static u64 sample_type;
29
30
31static int
32process_comm_event(event_t *event, unsigned long offset, unsigned long head)
33{
34 struct thread *thread;
35
36 thread = threads__findnew(event->comm.pid, &threads, &last_match);
37
38 dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
39 (void *)(offset + head),
40 (void *)(long)(event->header.size),
41 event->comm.comm, event->comm.pid);
42
43 if (thread == NULL ||
44 thread__set_comm(thread, event->comm.comm)) {
45 dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
46 return -1;
47 }
48 total_comm++;
49
50 return 0;
51}
52
53static int
54process_sample_event(event_t *event, unsigned long offset, unsigned long head)
55{
56 char level;
57 int show = 0;
58 struct dso *dso = NULL;
59 struct thread *thread;
60 u64 ip = event->ip.ip;
61 u64 timestamp = -1;
62 u32 cpu = -1;
63 u64 period = 1;
64 void *more_data = event->ip.__more_data;
65 int cpumode;
66
67 thread = threads__findnew(event->ip.pid, &threads, &last_match);
68
69 if (sample_type & PERF_SAMPLE_TIME) {
70 timestamp = *(u64 *)more_data;
71 more_data += sizeof(u64);
72 }
73
74 if (sample_type & PERF_SAMPLE_CPU) {
75 cpu = *(u32 *)more_data;
76 more_data += sizeof(u32);
77 more_data += sizeof(u32); /* reserved */
78 }
79
80 if (sample_type & PERF_SAMPLE_PERIOD) {
81 period = *(u64 *)more_data;
82 more_data += sizeof(u64);
83 }
84
85 dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
86 (void *)(offset + head),
87 (void *)(long)(event->header.size),
88 event->header.misc,
89 event->ip.pid, event->ip.tid,
90 (void *)(long)ip,
91 (long long)period);
92
93 dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
94
95 if (thread == NULL) {
96 eprintf("problem processing %d event, skipping it.\n",
97 event->header.type);
98 return -1;
99 }
100
101 cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK;
102
103 if (cpumode == PERF_EVENT_MISC_KERNEL) {
104 show = SHOW_KERNEL;
105 level = 'k';
106
107 dso = kernel_dso;
108
109 dump_printf(" ...... dso: %s\n", dso->name);
110
111 } else if (cpumode == PERF_EVENT_MISC_USER) {
112
113 show = SHOW_USER;
114 level = '.';
115
116 } else {
117 show = SHOW_HV;
118 level = 'H';
119
120 dso = hypervisor_dso;
121
122 dump_printf(" ...... dso: [hypervisor]\n");
123 }
124
125 if (sample_type & PERF_SAMPLE_RAW) {
126 struct {
127 u32 size;
128 char data[0];
129 } *raw = more_data;
130
131 /*
132 * FIXME: better resolve from pid from the struct trace_entry
133 * field, although it should be the same as this perf
134 * event pid
135 */
136 print_event(cpu, raw->data, raw->size, timestamp, thread->comm);
137 }
138 total += period;
139
140 return 0;
141}
142
143static int
144process_event(event_t *event, unsigned long offset, unsigned long head)
145{
146 trace_event(event);
147
148 switch (event->header.type) {
149 case PERF_EVENT_MMAP ... PERF_EVENT_LOST:
150 return 0;
151
152 case PERF_EVENT_COMM:
153 return process_comm_event(event, offset, head);
154
155 case PERF_EVENT_EXIT ... PERF_EVENT_READ:
156 return 0;
157
158 case PERF_EVENT_SAMPLE:
159 return process_sample_event(event, offset, head);
160
161 case PERF_EVENT_MAX:
162 default:
163 return -1;
164 }
165
166 return 0;
167}
168
169static int __cmd_trace(void)
170{
171 int ret, rc = EXIT_FAILURE;
172 unsigned long offset = 0;
173 unsigned long head = 0;
174 struct stat perf_stat;
175 event_t *event;
176 uint32_t size;
177 char *buf;
178
179 trace_report();
180 register_idle_thread(&threads, &last_match);
181
182 input = open(input_name, O_RDONLY);
183 if (input < 0) {
184 perror("failed to open file");
185 exit(-1);
186 }
187
188 ret = fstat(input, &perf_stat);
189 if (ret < 0) {
190 perror("failed to stat file");
191 exit(-1);
192 }
193
194 if (!perf_stat.st_size) {
195 fprintf(stderr, "zero-sized file, nothing to do!\n");
196 exit(0);
197 }
198 header = perf_header__read(input);
199 head = header->data_offset;
200 sample_type = perf_header__sample_type(header);
201
202 if (!(sample_type & PERF_SAMPLE_RAW))
203 die("No trace sample to read. Did you call perf record "
204 "without -R?");
205
206 if (load_kernel() < 0) {
207 perror("failed to load kernel symbols");
208 return EXIT_FAILURE;
209 }
210
211remap:
212 buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
213 MAP_SHARED, input, offset);
214 if (buf == MAP_FAILED) {
215 perror("failed to mmap file");
216 exit(-1);
217 }
218
219more:
220 event = (event_t *)(buf + head);
221
222 size = event->header.size;
223 if (!size)
224 size = 8;
225
226 if (head + event->header.size >= page_size * mmap_window) {
227 unsigned long shift = page_size * (head / page_size);
228 int res;
229
230 res = munmap(buf, page_size * mmap_window);
231 assert(res == 0);
232
233 offset += shift;
234 head -= shift;
235 goto remap;
236 }
237
238 size = event->header.size;
239
240
241 if (!size || process_event(event, offset, head) < 0) {
242
243 /*
244 * assume we lost track of the stream, check alignment, and
245 * increment a single u64 in the hope to catch on again 'soon'.
246 */
247
248 if (unlikely(head & 7))
249 head &= ~7ULL;
250
251 size = 8;
252 }
253
254 head += size;
255
256 if (offset + head < (unsigned long)perf_stat.st_size)
257 goto more;
258
259 rc = EXIT_SUCCESS;
260 close(input);
261
262 return rc;
263}
264
265static const char * const annotate_usage[] = {
266 "perf trace [<options>] <command>",
267 NULL
268};
269
270static const struct option options[] = {
271 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
272 "dump raw trace in ASCII"),
273 OPT_BOOLEAN('v', "verbose", &verbose,
274 "be more verbose (show symbol address, etc)"),
275 OPT_END()
276};
277
278int cmd_trace(int argc, const char **argv, const char *prefix __used)
279{
280 symbol__init();
281 page_size = getpagesize();
282
283 argc = parse_options(argc, argv, options, annotate_usage, 0);
284 if (argc) {
285 /*
286 * Special case: if there's an argument left then assume that
287 * it's a symbol filter:
288 */
289 if (argc > 1)
290 usage_with_options(annotate_usage, options);
291 }
292
293
294 setup_pager();
295
296 return __cmd_trace();
297}
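
The __cmd_trace() loop above slides a fixed-size mmap window across perf.data, re-mapping on a page boundary whenever the next event would cross the window's end. A minimal standalone sketch of that windowing arithmetic (not part of the patch; struct rec, process_rec() and walk_records() are made-up stand-ins for event_t, process_event() and the loop above):

/*
 * Sketch only, not part of the patch: iterate size-prefixed records
 * through a sliding mmap window the way __cmd_trace() does.
 */
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

struct rec {
    uint32_t size;              /* total record size, header included */
    /* payload follows */
};

static int process_rec(struct rec *r)
{
    return r->size ? 0 : -1;
}

int walk_records(int fd, unsigned long file_size)
{
    unsigned long page_size = getpagesize();
    unsigned long window = page_size * 32;      /* cf. mmap_window pages */
    unsigned long offset = 0, head = 0;
    char *buf;

remap:
    buf = mmap(NULL, window, PROT_READ, MAP_SHARED, fd, offset);
    if (buf == MAP_FAILED)
        return -1;
more:
    {
        struct rec *r = (struct rec *)(buf + head);
        uint32_t size = r->size ? r->size : (uint32_t)sizeof(*r);

        if (head + size >= window) {
            /* Slide the window forward by whole pages so that the
             * next mmap offset stays page aligned, then retry the
             * same record at its new, smaller head. */
            unsigned long shift = page_size * (head / page_size);

            if (munmap(buf, window) != 0)
                return -1;
            offset += shift;
            head -= shift;
            goto remap;
        }

        if (process_rec(r) < 0)
            size = sizeof(*r);  /* lost sync: advance a minimal step */

        head += size;
        if (offset + head < file_size)
            goto more;
    }

    munmap(buf, window);
    return 0;
}
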
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 51d168230ee7..3a63e41fb44e 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -22,5 +22,6 @@ extern int cmd_stat(int argc, const char **argv, const char *prefix);
22extern int cmd_top(int argc, const char **argv, const char *prefix); 22extern int cmd_top(int argc, const char **argv, const char *prefix);
23extern int cmd_version(int argc, const char **argv, const char *prefix); 23extern int cmd_version(int argc, const char **argv, const char *prefix);
24extern int cmd_list(int argc, const char **argv, const char *prefix); 24extern int cmd_list(int argc, const char **argv, const char *prefix);
25extern int cmd_trace(int argc, const char **argv, const char *prefix);
25 26
26#endif 27#endif
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 31982ad064b4..fe4589dde950 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -292,6 +292,7 @@ static void handle_internal_command(int argc, const char **argv)
292 { "top", cmd_top, 0 }, 292 { "top", cmd_top, 0 },
293 { "annotate", cmd_annotate, 0 }, 293 { "annotate", cmd_annotate, 0 },
294 { "version", cmd_version, 0 }, 294 { "version", cmd_version, 0 },
295 { "trace", cmd_trace, 0 },
295 }; 296 };
296 unsigned int i; 297 unsigned int i;
297 static const char ext[] = STRIP_EXTENSION; 298 static const char ext[] = STRIP_EXTENSION;
diff --git a/tools/perf/util/abspath.c b/tools/perf/util/abspath.c
index 61d33b81fc97..a791dd467261 100644
--- a/tools/perf/util/abspath.c
+++ b/tools/perf/util/abspath.c
@@ -50,7 +50,8 @@ const char *make_absolute_path(const char *path)
50 die ("Could not get current working directory"); 50 die ("Could not get current working directory");
51 51
52 if (last_elem) { 52 if (last_elem) {
53 int len = strlen(buf); 53 len = strlen(buf);
54
54 if (len + strlen(last_elem) + 2 > PATH_MAX) 55 if (len + strlen(last_elem) + 2 > PATH_MAX)
55 die ("Too long path name: '%s/%s'", 56 die ("Too long path name: '%s/%s'",
56 buf, last_elem); 57 buf, last_elem);
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index 4b50c412b9c5..6f8ea9d210b6 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -52,7 +52,6 @@ extern const char *perf_mailmap_file;
52extern void maybe_flush_or_die(FILE *, const char *); 52extern void maybe_flush_or_die(FILE *, const char *);
53extern int copy_fd(int ifd, int ofd); 53extern int copy_fd(int ifd, int ofd);
54extern int copy_file(const char *dst, const char *src, int mode); 54extern int copy_file(const char *dst, const char *src, int mode);
55extern ssize_t read_in_full(int fd, void *buf, size_t count);
56extern ssize_t write_in_full(int fd, const void *buf, size_t count); 55extern ssize_t write_in_full(int fd, const void *buf, size_t count);
57extern void write_or_die(int fd, const void *buf, size_t count); 56extern void write_or_die(int fd, const void *buf, size_t count);
58extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg); 57extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 011473411642..3b8380f1b478 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -50,6 +50,7 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
50 else 50 else
51 p = &(*p)->rb_right; 51 p = &(*p)->rb_right;
52 break; 52 break;
53 case CHAIN_NONE:
53 default: 54 default:
54 break; 55 break;
55 } 56 }
@@ -143,6 +144,7 @@ int register_callchain_param(struct callchain_param *param)
143 case CHAIN_FLAT: 144 case CHAIN_FLAT:
144 param->sort = sort_chain_flat; 145 param->sort = sort_chain_flat;
145 break; 146 break;
147 case CHAIN_NONE:
146 default: 148 default:
147 return -1; 149 return -1;
148 } 150 }
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index a926ae4f5a16..43cf3ea9e088 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -4,6 +4,7 @@
4#include "../perf.h" 4#include "../perf.h"
5#include <linux/list.h> 5#include <linux/list.h>
6#include <linux/rbtree.h> 6#include <linux/rbtree.h>
7#include "util.h"
7#include "symbol.h" 8#include "symbol.h"
8 9
9enum chain_mode { 10enum chain_mode {
diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c
index 90a044d1fe7d..e88bca55a599 100644
--- a/tools/perf/util/color.c
+++ b/tools/perf/util/color.c
@@ -166,7 +166,7 @@ int perf_color_default_config(const char *var, const char *value, void *cb)
166 return perf_default_config(var, value, cb); 166 return perf_default_config(var, value, cb);
167} 167}
168 168
169static int color_vfprintf(FILE *fp, const char *color, const char *fmt, 169static int __color_vfprintf(FILE *fp, const char *color, const char *fmt,
170 va_list args, const char *trail) 170 va_list args, const char *trail)
171{ 171{
172 int r = 0; 172 int r = 0;
@@ -191,6 +191,10 @@ static int color_vfprintf(FILE *fp, const char *color, const char *fmt,
191 return r; 191 return r;
192} 192}
193 193
194int color_vfprintf(FILE *fp, const char *color, const char *fmt, va_list args)
195{
196 return __color_vfprintf(fp, color, fmt, args, NULL);
197}
194 198
195 199
196int color_fprintf(FILE *fp, const char *color, const char *fmt, ...) 200int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
@@ -199,7 +203,7 @@ int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
199 int r; 203 int r;
200 204
201 va_start(args, fmt); 205 va_start(args, fmt);
202 r = color_vfprintf(fp, color, fmt, args, NULL); 206 r = color_vfprintf(fp, color, fmt, args);
203 va_end(args); 207 va_end(args);
204 return r; 208 return r;
205} 209}
@@ -209,7 +213,7 @@ int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...)
209 va_list args; 213 va_list args;
210 int r; 214 int r;
211 va_start(args, fmt); 215 va_start(args, fmt);
212 r = color_vfprintf(fp, color, fmt, args, "\n"); 216 r = __color_vfprintf(fp, color, fmt, args, "\n");
213 va_end(args); 217 va_end(args);
214 return r; 218 return r;
215} 219}
@@ -242,9 +246,9 @@ int color_fwrite_lines(FILE *fp, const char *color,
242 return 0; 246 return 0;
243} 247}
244 248
245char *get_percent_color(double percent) 249const char *get_percent_color(double percent)
246{ 250{
247 char *color = PERF_COLOR_NORMAL; 251 const char *color = PERF_COLOR_NORMAL;
248 252
249 /* 253 /*
250 * We color high-overhead entries in red, mid-overhead 254 * We color high-overhead entries in red, mid-overhead
@@ -263,7 +267,7 @@ char *get_percent_color(double percent)
263int percent_color_fprintf(FILE *fp, const char *fmt, double percent) 267int percent_color_fprintf(FILE *fp, const char *fmt, double percent)
264{ 268{
265 int r; 269 int r;
266 char *color; 270 const char *color;
267 271
268 color = get_percent_color(percent); 272 color = get_percent_color(percent);
269 r = color_fprintf(fp, color, fmt, percent); 273 r = color_fprintf(fp, color, fmt, percent);
diff --git a/tools/perf/util/color.h b/tools/perf/util/color.h
index 706cec50bd25..58d597564b99 100644
--- a/tools/perf/util/color.h
+++ b/tools/perf/util/color.h
@@ -32,10 +32,11 @@ int perf_color_default_config(const char *var, const char *value, void *cb);
32int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty); 32int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty);
33void color_parse(const char *value, const char *var, char *dst); 33void color_parse(const char *value, const char *var, char *dst);
34void color_parse_mem(const char *value, int len, const char *var, char *dst); 34void color_parse_mem(const char *value, int len, const char *var, char *dst);
35int color_vfprintf(FILE *fp, const char *color, const char *fmt, va_list args);
35int color_fprintf(FILE *fp, const char *color, const char *fmt, ...); 36int color_fprintf(FILE *fp, const char *color, const char *fmt, ...);
36int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...); 37int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...);
37int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf); 38int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf);
38int percent_color_fprintf(FILE *fp, const char *fmt, double percent); 39int percent_color_fprintf(FILE *fp, const char *fmt, double percent);
39char *get_percent_color(double percent); 40const char *get_percent_color(double percent);
40 41
41#endif /* COLOR_H */ 42#endif /* COLOR_H */
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 780df541006d..8784649109ce 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -160,17 +160,18 @@ static int get_extended_base_var(char *name, int baselen, int c)
160 name[baselen++] = '.'; 160 name[baselen++] = '.';
161 161
162 for (;;) { 162 for (;;) {
163 int c = get_next_char(); 163 int ch = get_next_char();
164 if (c == '\n') 164
165 if (ch == '\n')
165 return -1; 166 return -1;
166 if (c == '"') 167 if (ch == '"')
167 break; 168 break;
168 if (c == '\\') { 169 if (ch == '\\') {
169 c = get_next_char(); 170 ch = get_next_char();
170 if (c == '\n') 171 if (ch == '\n')
171 return -1; 172 return -1;
172 } 173 }
173 name[baselen++] = c; 174 name[baselen++] = ch;
174 if (baselen > MAXNAME / 2) 175 if (baselen > MAXNAME / 2)
175 return -1; 176 return -1;
176 } 177 }
@@ -530,6 +531,8 @@ static int store_aux(const char* key, const char* value, void *cb __used)
530 store.offset[store.seen] = ftell(config_file); 531 store.offset[store.seen] = ftell(config_file);
531 } 532 }
532 } 533 }
534 default:
535 break;
533 } 536 }
534 return 0; 537 return 0;
535} 538}
@@ -619,6 +622,7 @@ contline:
619 switch (contents[offset]) { 622 switch (contents[offset]) {
620 case '=': equal_offset = offset; break; 623 case '=': equal_offset = offset; break;
621 case ']': bracket_offset = offset; break; 624 case ']': bracket_offset = offset; break;
625 default: break;
622 } 626 }
623 if (offset > 0 && contents[offset-1] == '\\') { 627 if (offset > 0 && contents[offset-1] == '\\') {
624 offset_ = offset; 628 offset_ = offset;
@@ -742,9 +746,9 @@ int perf_config_set_multivar(const char* key, const char* value,
742 goto write_err_out; 746 goto write_err_out;
743 } else { 747 } else {
744 struct stat st; 748 struct stat st;
745 char* contents; 749 char *contents;
746 ssize_t contents_sz, copy_begin, copy_end; 750 ssize_t contents_sz, copy_begin, copy_end;
747 int i, new_line = 0; 751 int new_line = 0;
748 752
749 if (value_regex == NULL) 753 if (value_regex == NULL)
750 store.value_regex = NULL; 754 store.value_regex = NULL;
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
new file mode 100644
index 000000000000..e8ca98fe0bd4
--- /dev/null
+++ b/tools/perf/util/debug.c
@@ -0,0 +1,95 @@
1/* For general debugging purposes */
2
3#include "../perf.h"
4
5#include <string.h>
6#include <stdarg.h>
7#include <stdio.h>
8
9#include "color.h"
10#include "event.h"
11#include "debug.h"
12
13int verbose = 0;
14int dump_trace = 0;
15
16int eprintf(const char *fmt, ...)
17{
18 va_list args;
19 int ret = 0;
20
21 if (verbose) {
22 va_start(args, fmt);
23 ret = vfprintf(stderr, fmt, args);
24 va_end(args);
25 }
26
27 return ret;
28}
29
30int dump_printf(const char *fmt, ...)
31{
32 va_list args;
33 int ret = 0;
34
35 if (dump_trace) {
36 va_start(args, fmt);
37 ret = vprintf(fmt, args);
38 va_end(args);
39 }
40
41 return ret;
42}
43
44static int dump_printf_color(const char *fmt, const char *color, ...)
45{
46 va_list args;
47 int ret = 0;
48
49 if (dump_trace) {
50 va_start(args, color);
51 ret = color_vfprintf(stdout, color, fmt, args);
52 va_end(args);
53 }
54
55 return ret;
56}
57
58
59void trace_event(event_t *event)
60{
61 unsigned char *raw_event = (void *)event;
62 const char *color = PERF_COLOR_BLUE;
63 int i, j;
64
65 if (!dump_trace)
66 return;
67
68 dump_printf(".");
69 dump_printf_color("\n. ... raw event: size %d bytes\n", color,
70 event->header.size);
71
72 for (i = 0; i < event->header.size; i++) {
73 if ((i & 15) == 0) {
74 dump_printf(".");
75 dump_printf_color(" %04x: ", color, i);
76 }
77
78 dump_printf_color(" %02x", color, raw_event[i]);
79
80 if (((i & 15) == 15) || i == event->header.size-1) {
81 dump_printf_color(" ", color);
82 for (j = 0; j < 15-(i & 15); j++)
83 dump_printf_color(" ", color);
84 for (j = 0; j < (i & 15); j++) {
85 if (isprint(raw_event[i-15+j]))
86 dump_printf_color("%c", color,
87 raw_event[i-15+j]);
88 else
89 dump_printf_color(".", color);
90 }
91 dump_printf_color("\n", color);
92 }
93 }
94 dump_printf(".\n");
95}
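
trace_event() above dumps each raw event as rows of 16 hex bytes followed by a printable-ASCII column. A simplified standalone sketch of that row layout (hex_dump() is a hypothetical helper, plain printf stands in for the color wrappers, and the short final row is padded rather than copied line for line):

/*
 * Sketch only: 16 hex bytes plus a printable-ASCII column per row, a
 * tidied-up version of the layout trace_event() prints.
 */
#include <ctype.h>
#include <stdio.h>

void hex_dump(const unsigned char *buf, int len)
{
    int i, j;

    for (i = 0; i < len; i += 16) {
        int n = (len - i < 16) ? len - i : 16;

        printf(" %04x: ", i);
        for (j = 0; j < 16; j++) {
            if (j < n)
                printf(" %02x", buf[i + j]);
            else
                printf("   ");      /* pad the short final row */
        }
        printf("  ");
        for (j = 0; j < n; j++)
            putchar(isprint(buf[i + j]) ? buf[i + j] : '.');
        putchar('\n');
    }
}
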
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
new file mode 100644
index 000000000000..437eea58ce40
--- /dev/null
+++ b/tools/perf/util/debug.h
@@ -0,0 +1,8 @@
1/* For debugging general purposes */
2
3extern int verbose;
4extern int dump_trace;
5
6int eprintf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
7int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
8void trace_event(event_t *event);
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
new file mode 100644
index 000000000000..fa2d4e91d329
--- /dev/null
+++ b/tools/perf/util/event.h
@@ -0,0 +1,96 @@
1#ifndef __PERF_EVENT_H
2#define __PERF_EVENT_H
3#include "../perf.h"
4#include "util.h"
5#include <linux/list.h>
6
7enum {
8 SHOW_KERNEL = 1,
9 SHOW_USER = 2,
10 SHOW_HV = 4,
11};
12
13/*
14 * PERF_SAMPLE_IP | PERF_SAMPLE_TID | *
15 */
16struct ip_event {
17 struct perf_event_header header;
18 u64 ip;
19 u32 pid, tid;
20 unsigned char __more_data[];
21};
22
23struct mmap_event {
24 struct perf_event_header header;
25 u32 pid, tid;
26 u64 start;
27 u64 len;
28 u64 pgoff;
29 char filename[PATH_MAX];
30};
31
32struct comm_event {
33 struct perf_event_header header;
34 u32 pid, tid;
35 char comm[16];
36};
37
38struct fork_event {
39 struct perf_event_header header;
40 u32 pid, ppid;
41 u32 tid, ptid;
42};
43
44struct lost_event {
45 struct perf_event_header header;
46 u64 id;
47 u64 lost;
48};
49
50/*
51 * PERF_FORMAT_ENABLED | PERF_FORMAT_RUNNING | PERF_FORMAT_ID
52 */
53struct read_event {
54 struct perf_event_header header;
55 u32 pid,tid;
56 u64 value;
57 u64 time_enabled;
58 u64 time_running;
59 u64 id;
60};
61
62typedef union event_union {
63 struct perf_event_header header;
64 struct ip_event ip;
65 struct mmap_event mmap;
66 struct comm_event comm;
67 struct fork_event fork;
68 struct lost_event lost;
69 struct read_event read;
70} event_t;
71
72struct map {
73 struct list_head node;
74 u64 start;
75 u64 end;
76 u64 pgoff;
77 u64 (*map_ip)(struct map *, u64);
78 struct dso *dso;
79};
80
81static inline u64 map__map_ip(struct map *map, u64 ip)
82{
83 return ip - map->start + map->pgoff;
84}
85
86static inline u64 vdso__map_ip(struct map *map __used, u64 ip)
87{
88 return ip;
89}
90
91struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen);
92struct map *map__clone(struct map *self);
93int map__overlap(struct map *l, struct map *r);
94size_t map__fprintf(struct map *self, FILE *fp);
95
96#endif
diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c
index 34a352867382..2745605dba11 100644
--- a/tools/perf/util/exec_cmd.c
+++ b/tools/perf/util/exec_cmd.c
@@ -6,7 +6,6 @@
6 6
7#define MAX_ARGS 32 7#define MAX_ARGS 32
8 8
9extern char **environ;
10static const char *argv_exec_path; 9static const char *argv_exec_path;
11static const char *argv0_path; 10static const char *argv0_path;
12 11
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index b92a457ca32e..ec4d4c2f9522 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -237,9 +237,44 @@ struct perf_header *perf_header__read(int fd)
237 self->data_offset = f_header.data.offset; 237 self->data_offset = f_header.data.offset;
238 self->data_size = f_header.data.size; 238 self->data_size = f_header.data.size;
239 239
240 lseek(fd, self->data_offset + self->data_size, SEEK_SET); 240 lseek(fd, self->data_offset, SEEK_SET);
241 241
242 self->frozen = 1; 242 self->frozen = 1;
243 243
244 return self; 244 return self;
245} 245}
246
247u64 perf_header__sample_type(struct perf_header *header)
248{
249 u64 type = 0;
250 int i;
251
252 for (i = 0; i < header->attrs; i++) {
253 struct perf_header_attr *attr = header->attr[i];
254
255 if (!type)
256 type = attr->attr.sample_type;
257 else if (type != attr->attr.sample_type)
258 die("non matching sample_type");
259 }
260
261 return type;
262}
263
264struct perf_counter_attr *
265perf_header__find_attr(u64 id, struct perf_header *header)
266{
267 int i;
268
269 for (i = 0; i < header->attrs; i++) {
270 struct perf_header_attr *attr = header->attr[i];
271 int j;
272
273 for (j = 0; j < attr->ids; j++) {
274 if (attr->id[j] == id)
275 return &attr->attr;
276 }
277 }
278
279 return NULL;
280}
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index bf280449fcfd..5d0a72ecc919 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -31,6 +31,10 @@ struct perf_header_attr *
31perf_header_attr__new(struct perf_counter_attr *attr); 31perf_header_attr__new(struct perf_counter_attr *attr);
32void perf_header_attr__add_id(struct perf_header_attr *self, u64 id); 32void perf_header_attr__add_id(struct perf_header_attr *self, u64 id);
33 33
34u64 perf_header__sample_type(struct perf_header *header);
35struct perf_counter_attr *
36perf_header__find_attr(u64 id, struct perf_header *header);
37
34 38
35struct perf_header *perf_header__new(void); 39struct perf_header *perf_header__new(void);
36 40
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
new file mode 100644
index 000000000000..804e02382739
--- /dev/null
+++ b/tools/perf/util/map.c
@@ -0,0 +1,97 @@
1#include "event.h"
2#include "symbol.h"
3#include <stdlib.h>
4#include <string.h>
5#include <stdio.h>
6
7static inline int is_anon_memory(const char *filename)
8{
9 return strcmp(filename, "//anon") == 0;
10}
11
12static int strcommon(const char *pathname, char *cwd, int cwdlen)
13{
14 int n = 0;
15
16 while (n < cwdlen && pathname[n] == cwd[n])
17 ++n;
18
19 return n;
20}
21
22 struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen)
23{
24 struct map *self = malloc(sizeof(*self));
25
26 if (self != NULL) {
27 const char *filename = event->filename;
28 char newfilename[PATH_MAX];
29 int anon;
30
31 if (cwd) {
32 int n = strcommon(filename, cwd, cwdlen);
33
34 if (n == cwdlen) {
35 snprintf(newfilename, sizeof(newfilename),
36 ".%s", filename + n);
37 filename = newfilename;
38 }
39 }
40
41 anon = is_anon_memory(filename);
42
43 if (anon) {
44 snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", event->pid);
45 filename = newfilename;
46 }
47
48 self->start = event->start;
49 self->end = event->start + event->len;
50 self->pgoff = event->pgoff;
51
52 self->dso = dsos__findnew(filename);
53 if (self->dso == NULL)
54 goto out_delete;
55
56 if (self->dso == vdso || anon)
57 self->map_ip = vdso__map_ip;
58 else
59 self->map_ip = map__map_ip;
60 }
61 return self;
62out_delete:
63 free(self);
64 return NULL;
65}
66
67struct map *map__clone(struct map *self)
68{
69 struct map *map = malloc(sizeof(*self));
70
71 if (!map)
72 return NULL;
73
74 memcpy(map, self, sizeof(*self));
75
76 return map;
77}
78
79int map__overlap(struct map *l, struct map *r)
80{
81 if (l->start > r->start) {
82 struct map *t = l;
83 l = r;
84 r = t;
85 }
86
87 if (l->end > r->start)
88 return 1;
89
90 return 0;
91}
92
93size_t map__fprintf(struct map *self, FILE *fp)
94{
95 return fprintf(fp, " %Lx-%Lx %Lx %s\n",
96 self->start, self->end, self->pgoff, self->dso->name);
97}
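
The map_ip hooks chosen in map__new() above turn a sampled virtual address into a dso-relative one: file-backed mappings subtract the mapping start and add the file offset, while vdso and anon mappings pass the address through unchanged. A tiny worked example with made-up numbers (toy_map and toy_map_ip() are hypothetical):

/*
 * Sketch only: the translation picked in map__new(). A file mapped at
 * 0x400000 with pgoff 0x1000 turns a sample at 0x401234 into file
 * offset 0x2234.
 */
#include <assert.h>
#include <stdint.h>

struct toy_map { uint64_t start, end, pgoff; };

static uint64_t toy_map_ip(const struct toy_map *m, uint64_t ip)
{
    return ip - m->start + m->pgoff;    /* map__map_ip() equivalent */
}

int main(void)
{
    struct toy_map m = { .start = 0x400000, .end = 0x500000, .pgoff = 0x1000 };

    assert(toy_map_ip(&m, 0x401234) == 0x2234);
    return 0;
}
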
diff --git a/tools/perf/util/module.c b/tools/perf/util/module.c
index ddabe925d65d..3d567fe59c79 100644
--- a/tools/perf/util/module.c
+++ b/tools/perf/util/module.c
@@ -436,9 +436,9 @@ static int mod_dso__load_module_paths(struct mod_dso *self)
436 goto out_failure; 436 goto out_failure;
437 437
438 while (!feof(file)) { 438 while (!feof(file)) {
439 char *path, *name, *tmp; 439 char *name, *tmp;
440 struct module *module; 440 struct module *module;
441 int line_len, len; 441 int line_len;
442 442
443 line_len = getline(&line, &n, file); 443 line_len = getline(&line, &n, file);
444 if (line_len < 0) 444 if (line_len < 0)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 044178408783..a587d41ae3c9 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1,23 +1,21 @@
1 1
2#include "../perf.h"
3#include "util.h" 2#include "util.h"
3#include "../perf.h"
4#include "parse-options.h" 4#include "parse-options.h"
5#include "parse-events.h" 5#include "parse-events.h"
6#include "exec_cmd.h" 6#include "exec_cmd.h"
7#include "string.h" 7#include "string.h"
8#include "cache.h" 8#include "cache.h"
9 9
10extern char *strcasestr(const char *haystack, const char *needle);
11
12int nr_counters; 10int nr_counters;
13 11
14struct perf_counter_attr attrs[MAX_COUNTERS]; 12struct perf_counter_attr attrs[MAX_COUNTERS];
15 13
16struct event_symbol { 14struct event_symbol {
17 u8 type; 15 u8 type;
18 u64 config; 16 u64 config;
19 char *symbol; 17 const char *symbol;
20 char *alias; 18 const char *alias;
21}; 19};
22 20
23char debugfs_path[MAXPATHLEN]; 21char debugfs_path[MAXPATHLEN];
@@ -51,7 +49,7 @@ static struct event_symbol event_symbols[] = {
51#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE) 49#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
52#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT) 50#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
53 51
54static char *hw_event_names[] = { 52static const char *hw_event_names[] = {
55 "cycles", 53 "cycles",
56 "instructions", 54 "instructions",
57 "cache-references", 55 "cache-references",
@@ -61,7 +59,7 @@ static char *hw_event_names[] = {
61 "bus-cycles", 59 "bus-cycles",
62}; 60};
63 61
64static char *sw_event_names[] = { 62static const char *sw_event_names[] = {
65 "cpu-clock-msecs", 63 "cpu-clock-msecs",
66 "task-clock-msecs", 64 "task-clock-msecs",
67 "page-faults", 65 "page-faults",
@@ -73,7 +71,7 @@ static char *sw_event_names[] = {
73 71
74#define MAX_ALIASES 8 72#define MAX_ALIASES 8
75 73
76static char *hw_cache[][MAX_ALIASES] = { 74static const char *hw_cache[][MAX_ALIASES] = {
77 { "L1-dcache", "l1-d", "l1d", "L1-data", }, 75 { "L1-dcache", "l1-d", "l1d", "L1-data", },
78 { "L1-icache", "l1-i", "l1i", "L1-instruction", }, 76 { "L1-icache", "l1-i", "l1i", "L1-instruction", },
79 { "LLC", "L2" }, 77 { "LLC", "L2" },
@@ -82,13 +80,13 @@ static char *hw_cache[][MAX_ALIASES] = {
82 { "branch", "branches", "bpu", "btb", "bpc", }, 80 { "branch", "branches", "bpu", "btb", "bpc", },
83}; 81};
84 82
85static char *hw_cache_op[][MAX_ALIASES] = { 83static const char *hw_cache_op[][MAX_ALIASES] = {
86 { "load", "loads", "read", }, 84 { "load", "loads", "read", },
87 { "store", "stores", "write", }, 85 { "store", "stores", "write", },
88 { "prefetch", "prefetches", "speculative-read", "speculative-load", }, 86 { "prefetch", "prefetches", "speculative-read", "speculative-load", },
89}; 87};
90 88
91static char *hw_cache_result[][MAX_ALIASES] = { 89static const char *hw_cache_result[][MAX_ALIASES] = {
92 { "refs", "Reference", "ops", "access", }, 90 { "refs", "Reference", "ops", "access", },
93 { "misses", "miss", }, 91 { "misses", "miss", },
94}; 92};
@@ -113,11 +111,9 @@ static unsigned long hw_cache_stat[C(MAX)] = {
113 [C(BPU)] = (CACHE_READ), 111 [C(BPU)] = (CACHE_READ),
114}; 112};
115 113
116#define for_each_subsystem(sys_dir, sys_dirent, sys_next, file, st) \ 114#define for_each_subsystem(sys_dir, sys_dirent, sys_next) \
117 while (!readdir_r(sys_dir, &sys_dirent, &sys_next) && sys_next) \ 115 while (!readdir_r(sys_dir, &sys_dirent, &sys_next) && sys_next) \
118 if (snprintf(file, MAXPATHLEN, "%s/%s", debugfs_path, \ 116 if (sys_dirent.d_type == DT_DIR && \
119 sys_dirent.d_name) && \
120 (!stat(file, &st)) && (S_ISDIR(st.st_mode)) && \
121 (strcmp(sys_dirent.d_name, ".")) && \ 117 (strcmp(sys_dirent.d_name, ".")) && \
122 (strcmp(sys_dirent.d_name, ".."))) 118 (strcmp(sys_dirent.d_name, "..")))
123 119
@@ -136,11 +132,9 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
136 return 0; 132 return 0;
137} 133}
138 134
139#define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next, file, st) \ 135#define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) \
140 while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next) \ 136 while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next) \
141 if (snprintf(file, MAXPATHLEN, "%s/%s/%s", debugfs_path, \ 137 if (evt_dirent.d_type == DT_DIR && \
142 sys_dirent.d_name, evt_dirent.d_name) && \
143 (!stat(file, &st)) && (S_ISDIR(st.st_mode)) && \
144 (strcmp(evt_dirent.d_name, ".")) && \ 138 (strcmp(evt_dirent.d_name, ".")) && \
145 (strcmp(evt_dirent.d_name, "..")) && \ 139 (strcmp(evt_dirent.d_name, "..")) && \
146 (!tp_event_has_id(&sys_dirent, &evt_dirent))) 140 (!tp_event_has_id(&sys_dirent, &evt_dirent)))
@@ -158,34 +152,39 @@ int valid_debugfs_mount(const char *debugfs)
158 return 0; 152 return 0;
159} 153}
160 154
161static char *tracepoint_id_to_name(u64 config) 155struct tracepoint_path *tracepoint_id_to_path(u64 config)
162{ 156{
163 static char tracepoint_name[2 * MAX_EVENT_LENGTH]; 157 struct tracepoint_path *path = NULL;
164 DIR *sys_dir, *evt_dir; 158 DIR *sys_dir, *evt_dir;
165 struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; 159 struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
166 struct stat st;
167 char id_buf[4]; 160 char id_buf[4];
168 int fd; 161 int sys_dir_fd, fd;
169 u64 id; 162 u64 id;
170 char evt_path[MAXPATHLEN]; 163 char evt_path[MAXPATHLEN];
171 164
172 if (valid_debugfs_mount(debugfs_path)) 165 if (valid_debugfs_mount(debugfs_path))
173 return "unkown"; 166 return NULL;
174 167
175 sys_dir = opendir(debugfs_path); 168 sys_dir = opendir(debugfs_path);
176 if (!sys_dir) 169 if (!sys_dir)
177 goto cleanup; 170 goto cleanup;
178 171 sys_dir_fd = dirfd(sys_dir);
179 for_each_subsystem(sys_dir, sys_dirent, sys_next, evt_path, st) { 172
180 evt_dir = opendir(evt_path); 173 for_each_subsystem(sys_dir, sys_dirent, sys_next) {
181 if (!evt_dir) 174 int dfd = openat(sys_dir_fd, sys_dirent.d_name,
182 goto cleanup; 175 O_RDONLY|O_DIRECTORY), evt_dir_fd;
183 for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next, 176 if (dfd == -1)
184 evt_path, st) { 177 continue;
185 snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", 178 evt_dir = fdopendir(dfd);
186 debugfs_path, sys_dirent.d_name, 179 if (!evt_dir) {
180 close(dfd);
181 continue;
182 }
183 evt_dir_fd = dirfd(evt_dir);
184 for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
185 snprintf(evt_path, MAXPATHLEN, "%s/id",
187 evt_dirent.d_name); 186 evt_dirent.d_name);
188 fd = open(evt_path, O_RDONLY); 187 fd = openat(evt_dir_fd, evt_path, O_RDONLY);
189 if (fd < 0) 188 if (fd < 0)
190 continue; 189 continue;
191 if (read(fd, id_buf, sizeof(id_buf)) < 0) { 190 if (read(fd, id_buf, sizeof(id_buf)) < 0) {
@@ -197,10 +196,23 @@ static char *tracepoint_id_to_name(u64 config)
197 if (id == config) { 196 if (id == config) {
198 closedir(evt_dir); 197 closedir(evt_dir);
199 closedir(sys_dir); 198 closedir(sys_dir);
200 snprintf(tracepoint_name, 2 * MAX_EVENT_LENGTH, 199 path = calloc(1, sizeof(path));
201 "%s:%s", sys_dirent.d_name, 200 path->system = malloc(MAX_EVENT_LENGTH);
202 evt_dirent.d_name); 201 if (!path->system) {
203 return tracepoint_name; 202 free(path);
203 return NULL;
204 }
205 path->name = malloc(MAX_EVENT_LENGTH);
206 if (!path->name) {
207 free(path->system);
208 free(path);
209 return NULL;
210 }
211 strncpy(path->system, sys_dirent.d_name,
212 MAX_EVENT_LENGTH);
213 strncpy(path->name, evt_dirent.d_name,
214 MAX_EVENT_LENGTH);
215 return path;
204 } 216 }
205 } 217 }
206 closedir(evt_dir); 218 closedir(evt_dir);
@@ -208,7 +220,25 @@ static char *tracepoint_id_to_name(u64 config)
208 220
209cleanup: 221cleanup:
210 closedir(sys_dir); 222 closedir(sys_dir);
211 return "unkown"; 223 return NULL;
224}
225
226#define TP_PATH_LEN (MAX_EVENT_LENGTH * 2 + 1)
227static const char *tracepoint_id_to_name(u64 config)
228{
229 static char buf[TP_PATH_LEN];
230 struct tracepoint_path *path;
231
232 path = tracepoint_id_to_path(config);
233 if (path) {
234 snprintf(buf, TP_PATH_LEN, "%s:%s", path->system, path->name);
235 free(path->name);
236 free(path->system);
237 free(path);
238 } else
239 snprintf(buf, TP_PATH_LEN, "%s:%s", "unknown", "unknown");
240
241 return buf;
212} 242}
213 243
214static int is_cache_op_valid(u8 cache_type, u8 cache_op) 244static int is_cache_op_valid(u8 cache_type, u8 cache_op)
@@ -235,7 +265,7 @@ static char *event_cache_name(u8 cache_type, u8 cache_op, u8 cache_result)
235 return name; 265 return name;
236} 266}
237 267
238char *event_name(int counter) 268const char *event_name(int counter)
239{ 269{
240 u64 config = attrs[counter].config; 270 u64 config = attrs[counter].config;
241 int type = attrs[counter].type; 271 int type = attrs[counter].type;
@@ -243,7 +273,7 @@ char *event_name(int counter)
243 return __event_name(type, config); 273 return __event_name(type, config);
244} 274}
245 275
246char *__event_name(int type, u64 config) 276const char *__event_name(int type, u64 config)
247{ 277{
248 static char buf[32]; 278 static char buf[32];
249 279
@@ -294,7 +324,7 @@ char *__event_name(int type, u64 config)
294 return "unknown"; 324 return "unknown";
295} 325}
296 326
297static int parse_aliases(const char **str, char *names[][MAX_ALIASES], int size) 327static int parse_aliases(const char **str, const char *names[][MAX_ALIASES], int size)
298{ 328{
299 int i, j; 329 int i, j;
300 int n, longest = -1; 330 int n, longest = -1;
@@ -598,7 +628,7 @@ static void print_tracepoint_events(void)
598{ 628{
599 DIR *sys_dir, *evt_dir; 629 DIR *sys_dir, *evt_dir;
600 struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; 630 struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
601 struct stat st; 631 int sys_dir_fd;
602 char evt_path[MAXPATHLEN]; 632 char evt_path[MAXPATHLEN];
603 633
604 if (valid_debugfs_mount(debugfs_path)) 634 if (valid_debugfs_mount(debugfs_path))
@@ -607,16 +637,23 @@ static void print_tracepoint_events(void)
607 sys_dir = opendir(debugfs_path); 637 sys_dir = opendir(debugfs_path);
608 if (!sys_dir) 638 if (!sys_dir)
609 goto cleanup; 639 goto cleanup;
610 640 sys_dir_fd = dirfd(sys_dir);
611 for_each_subsystem(sys_dir, sys_dirent, sys_next, evt_path, st) { 641
612 evt_dir = opendir(evt_path); 642 for_each_subsystem(sys_dir, sys_dirent, sys_next) {
613 if (!evt_dir) 643 int dfd = openat(sys_dir_fd, sys_dirent.d_name,
614 goto cleanup; 644 O_RDONLY|O_DIRECTORY), evt_dir_fd;
615 for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next, 645 if (dfd == -1)
616 evt_path, st) { 646 continue;
647 evt_dir = fdopendir(dfd);
648 if (!evt_dir) {
649 close(dfd);
650 continue;
651 }
652 evt_dir_fd = dirfd(evt_dir);
653 for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
617 snprintf(evt_path, MAXPATHLEN, "%s:%s", 654 snprintf(evt_path, MAXPATHLEN, "%s:%s",
618 sys_dirent.d_name, evt_dirent.d_name); 655 sys_dirent.d_name, evt_dirent.d_name);
619 fprintf(stderr, " %-40s [%s]\n", evt_path, 656 fprintf(stderr, " %-42s [%s]\n", evt_path,
620 event_type_descriptors[PERF_TYPE_TRACEPOINT+1]); 657 event_type_descriptors[PERF_TYPE_TRACEPOINT+1]);
621 } 658 }
622 closedir(evt_dir); 659 closedir(evt_dir);
@@ -650,7 +687,7 @@ void print_events(void)
650 sprintf(name, "%s OR %s", syms->symbol, syms->alias); 687 sprintf(name, "%s OR %s", syms->symbol, syms->alias);
651 else 688 else
652 strcpy(name, syms->symbol); 689 strcpy(name, syms->symbol);
653 fprintf(stderr, " %-40s [%s]\n", name, 690 fprintf(stderr, " %-42s [%s]\n", name,
654 event_type_descriptors[type]); 691 event_type_descriptors[type]);
655 692
656 prev_type = type; 693 prev_type = type;
@@ -664,7 +701,7 @@ void print_events(void)
664 continue; 701 continue;
665 702
666 for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { 703 for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
667 fprintf(stderr, " %-40s [%s]\n", 704 fprintf(stderr, " %-42s [%s]\n",
668 event_cache_name(type, op, i), 705 event_cache_name(type, op, i),
669 event_type_descriptors[4]); 706 event_type_descriptors[4]);
670 } 707 }
@@ -672,7 +709,7 @@ void print_events(void)
672 } 709 }
673 710
674 fprintf(stderr, "\n"); 711 fprintf(stderr, "\n");
675 fprintf(stderr, " %-40s [raw hardware event descriptor]\n", 712 fprintf(stderr, " %-42s [raw hardware event descriptor]\n",
676 "rNNN"); 713 "rNNN");
677 fprintf(stderr, "\n"); 714 fprintf(stderr, "\n");
678 715
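
The for_each_subsystem/for_each_event rework above replaces the stat()-per-entry checks with d_type tests plus openat()/fdopendir() relative to the already-open debugfs directory, so no pathname needs to be pasted together just to descend one level. A standalone sketch of that pattern (list_subdirs() is a hypothetical helper walking an arbitrary directory):

/*
 * Sketch only: walk the immediate subdirectories of 'path' the way the
 * reworked for_each_subsystem loop does, via dirfd() + openat() +
 * fdopendir() instead of building a full pathname and calling stat().
 */
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int list_subdirs(const char *path)
{
    DIR *dir = opendir(path);
    struct dirent *d;
    int dir_fd;

    if (!dir)
        return -1;
    dir_fd = dirfd(dir);

    while ((d = readdir(dir)) != NULL) {
        int dfd;
        DIR *sub;

        /* d_type works on debugfs; DT_UNKNOWN would need a fallback. */
        if (d->d_type != DT_DIR ||
            !strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
            continue;

        /* Open the child relative to the parent, no path pasting. */
        dfd = openat(dir_fd, d->d_name, O_RDONLY | O_DIRECTORY);
        if (dfd == -1)
            continue;
        sub = fdopendir(dfd);       /* on success, owns dfd */
        if (!sub) {
            close(dfd);
            continue;
        }
        printf("%s/%s\n", path, d->d_name);
        closedir(sub);
    }
    closedir(dir);
    return 0;
}
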
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 192a962e3a0f..60704c15961f 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -1,16 +1,25 @@
1 1#ifndef _PARSE_EVENTS_H
2#define _PARSE_EVENTS_H
2/* 3/*
3 * Parse symbolic events/counts passed in as options: 4 * Parse symbolic events/counts passed in as options:
4 */ 5 */
5 6
6struct option; 7struct option;
7 8
9struct tracepoint_path {
10 char *system;
11 char *name;
12 struct tracepoint_path *next;
13};
14
15extern struct tracepoint_path *tracepoint_id_to_path(u64 config);
16
8extern int nr_counters; 17extern int nr_counters;
9 18
10extern struct perf_counter_attr attrs[MAX_COUNTERS]; 19extern struct perf_counter_attr attrs[MAX_COUNTERS];
11 20
12extern char *event_name(int ctr); 21extern const char *event_name(int ctr);
13extern char *__event_name(int type, u64 config); 22extern const char *__event_name(int type, u64 config);
14 23
15extern int parse_events(const struct option *opt, const char *str, int unset); 24extern int parse_events(const struct option *opt, const char *str, int unset);
16 25
@@ -21,3 +30,5 @@ extern void print_events(void);
21extern char debugfs_path[]; 30extern char debugfs_path[];
22extern int valid_debugfs_mount(const char *debugfs); 31extern int valid_debugfs_mount(const char *debugfs);
23 32
33
34#endif /* _PARSE_EVENTS_H */
diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c
index 1bf67190c820..6d8af48c925e 100644
--- a/tools/perf/util/parse-options.c
+++ b/tools/perf/util/parse-options.c
@@ -53,6 +53,12 @@ static int get_value(struct parse_opt_ctx_t *p,
53 case OPTION_SET_INT: 53 case OPTION_SET_INT:
54 case OPTION_SET_PTR: 54 case OPTION_SET_PTR:
55 return opterror(opt, "takes no value", flags); 55 return opterror(opt, "takes no value", flags);
56 case OPTION_END:
57 case OPTION_ARGUMENT:
58 case OPTION_GROUP:
59 case OPTION_STRING:
60 case OPTION_INTEGER:
61 case OPTION_LONG:
56 default: 62 default:
57 break; 63 break;
58 } 64 }
@@ -130,6 +136,9 @@ static int get_value(struct parse_opt_ctx_t *p,
130 return opterror(opt, "expects a numerical value", flags); 136 return opterror(opt, "expects a numerical value", flags);
131 return 0; 137 return 0;
132 138
139 case OPTION_END:
140 case OPTION_ARGUMENT:
141 case OPTION_GROUP:
133 default: 142 default:
134 die("should not happen, someone must be hit on the forehead"); 143 die("should not happen, someone must be hit on the forehead");
135 } 144 }
@@ -296,6 +305,8 @@ int parse_options_step(struct parse_opt_ctx_t *ctx,
296 return parse_options_usage(usagestr, options); 305 return parse_options_usage(usagestr, options);
297 case -2: 306 case -2:
298 goto unknown; 307 goto unknown;
308 default:
309 break;
299 } 310 }
300 if (ctx->opt) 311 if (ctx->opt)
301 check_typos(arg + 1, options); 312 check_typos(arg + 1, options);
@@ -314,6 +325,8 @@ int parse_options_step(struct parse_opt_ctx_t *ctx,
314 ctx->argv[0] = strdup(ctx->opt - 1); 325 ctx->argv[0] = strdup(ctx->opt - 1);
315 *(char *)ctx->argv[0] = '-'; 326 *(char *)ctx->argv[0] = '-';
316 goto unknown; 327 goto unknown;
328 default:
329 break;
317 } 330 }
318 } 331 }
319 continue; 332 continue;
@@ -336,6 +349,8 @@ int parse_options_step(struct parse_opt_ctx_t *ctx,
336 return parse_options_usage(usagestr, options); 349 return parse_options_usage(usagestr, options);
337 case -2: 350 case -2:
338 goto unknown; 351 goto unknown;
352 default:
353 break;
339 } 354 }
340 continue; 355 continue;
341unknown: 356unknown:
@@ -456,6 +471,13 @@ int usage_with_options_internal(const char * const *usagestr,
456 } 471 }
457 break; 472 break;
458 default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */ 473 default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
474 case OPTION_END:
475 case OPTION_GROUP:
476 case OPTION_BIT:
477 case OPTION_BOOLEAN:
478 case OPTION_SET_INT:
479 case OPTION_SET_PTR:
480 case OPTION_LONG:
459 break; 481 break;
460 } 482 }
461 483
diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c
index a501a40dd2cb..fd1f2faaade4 100644
--- a/tools/perf/util/path.c
+++ b/tools/perf/util/path.c
@@ -17,7 +17,7 @@ static char bad_path[] = "/bad-path/";
17 * Two hacks: 17 * Two hacks:
18 */ 18 */
19 19
20static char *get_perf_dir(void) 20static const char *get_perf_dir(void)
21{ 21{
22 return "."; 22 return ".";
23} 23}
@@ -38,8 +38,9 @@ size_t strlcpy(char *dest, const char *src, size_t size)
38static char *get_pathname(void) 38static char *get_pathname(void)
39{ 39{
40 static char pathname_array[4][PATH_MAX]; 40 static char pathname_array[4][PATH_MAX];
41 static int index; 41 static int idx;
42 return pathname_array[3 & ++index]; 42
43 return pathname_array[3 & ++idx];
43} 44}
44 45
45static char *cleanup_path(char *path) 46static char *cleanup_path(char *path)
@@ -161,20 +162,24 @@ int perf_mkstemp(char *path, size_t len, const char *template)
161} 162}
162 163
163 164
164const char *make_relative_path(const char *abs, const char *base) 165const char *make_relative_path(const char *abs_path, const char *base)
165{ 166{
166 static char buf[PATH_MAX + 1]; 167 static char buf[PATH_MAX + 1];
167 int baselen; 168 int baselen;
169
168 if (!base) 170 if (!base)
169 return abs; 171 return abs_path;
172
170 baselen = strlen(base); 173 baselen = strlen(base);
171 if (prefixcmp(abs, base)) 174 if (prefixcmp(abs_path, base))
172 return abs; 175 return abs_path;
173 if (abs[baselen] == '/') 176 if (abs_path[baselen] == '/')
174 baselen++; 177 baselen++;
175 else if (base[baselen - 1] != '/') 178 else if (base[baselen - 1] != '/')
176 return abs; 179 return abs_path;
177 strcpy(buf, abs + baselen); 180
181 strcpy(buf, abs_path + baselen);
182
178 return buf; 183 return buf;
179} 184}
180 185
diff --git a/tools/perf/util/run-command.c b/tools/perf/util/run-command.c
index a3935343091a..2b615acf94d7 100644
--- a/tools/perf/util/run-command.c
+++ b/tools/perf/util/run-command.c
@@ -262,7 +262,7 @@ int run_hook(const char *index_file, const char *name, ...)
262{ 262{
263 struct child_process hook; 263 struct child_process hook;
264 const char **argv = NULL, *env[2]; 264 const char **argv = NULL, *env[2];
265 char index[PATH_MAX]; 265 char idx[PATH_MAX];
266 va_list args; 266 va_list args;
267 int ret; 267 int ret;
268 size_t i = 0, alloc = 0; 268 size_t i = 0, alloc = 0;
@@ -284,8 +284,8 @@ int run_hook(const char *index_file, const char *name, ...)
284 hook.no_stdin = 1; 284 hook.no_stdin = 1;
285 hook.stdout_to_stderr = 1; 285 hook.stdout_to_stderr = 1;
286 if (index_file) { 286 if (index_file) {
287 snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file); 287 snprintf(idx, sizeof(idx), "PERF_INDEX_FILE=%s", index_file);
288 env[0] = index; 288 env[0] = idx;
289 env[1] = NULL; 289 env[1] = NULL;
290 hook.env = env; 290 hook.env = env;
291 } 291 }
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 5c0f42e6b33b..fd3d9c8e90fc 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -3,6 +3,8 @@
3#include "string.h" 3#include "string.h"
4#include "symbol.h" 4#include "symbol.h"
5 5
6#include "debug.h"
7
6#include <libelf.h> 8#include <libelf.h>
7#include <gelf.h> 9#include <gelf.h>
8#include <elf.h> 10#include <elf.h>
@@ -21,7 +23,7 @@ enum dso_origin {
21 23
22static struct symbol *symbol__new(u64 start, u64 len, 24static struct symbol *symbol__new(u64 start, u64 len,
23 const char *name, unsigned int priv_size, 25 const char *name, unsigned int priv_size,
24 u64 obj_start, int verbose) 26 u64 obj_start, int v)
25{ 27{
26 size_t namelen = strlen(name) + 1; 28 size_t namelen = strlen(name) + 1;
27 struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen); 29 struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen);
@@ -29,7 +31,7 @@ static struct symbol *symbol__new(u64 start, u64 len,
29 if (!self) 31 if (!self)
30 return NULL; 32 return NULL;
31 33
32 if (verbose >= 2) 34 if (v >= 2)
33 printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n", 35 printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n",
34 (u64)start, (unsigned long)len, name, self->hist, (void *)(unsigned long)obj_start); 36 (u64)start, (unsigned long)len, name, self->hist, (void *)(unsigned long)obj_start);
35 37
@@ -156,7 +158,7 @@ size_t dso__fprintf(struct dso *self, FILE *fp)
156 return ret; 158 return ret;
157} 159}
158 160
159static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verbose) 161static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int v)
160{ 162{
161 struct rb_node *nd, *prevnd; 163 struct rb_node *nd, *prevnd;
162 char *line = NULL; 164 char *line = NULL;
@@ -198,7 +200,7 @@ static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verb
198 * Well fix up the end later, when we have all sorted. 200 * Well fix up the end later, when we have all sorted.
199 */ 201 */
200 sym = symbol__new(start, 0xdead, line + len + 2, 202 sym = symbol__new(start, 0xdead, line + len + 2,
201 self->sym_priv_size, 0, verbose); 203 self->sym_priv_size, 0, v);
202 204
203 if (sym == NULL) 205 if (sym == NULL)
204 goto out_delete_line; 206 goto out_delete_line;
@@ -239,7 +241,7 @@ out_failure:
239 return -1; 241 return -1;
240} 242}
241 243
242static int dso__load_perf_map(struct dso *self, symbol_filter_t filter, int verbose) 244static int dso__load_perf_map(struct dso *self, symbol_filter_t filter, int v)
243{ 245{
244 char *line = NULL; 246 char *line = NULL;
245 size_t n; 247 size_t n;
@@ -277,7 +279,7 @@ static int dso__load_perf_map(struct dso *self, symbol_filter_t filter, int verb
277 continue; 279 continue;
278 280
279 sym = symbol__new(start, size, line + len, 281 sym = symbol__new(start, size, line + len,
280 self->sym_priv_size, start, verbose); 282 self->sym_priv_size, start, v);
281 283
282 if (sym == NULL) 284 if (sym == NULL)
283 goto out_delete_line; 285 goto out_delete_line;
@@ -305,13 +307,13 @@ out_failure:
305 * elf_symtab__for_each_symbol - iterate thru all the symbols 307 * elf_symtab__for_each_symbol - iterate thru all the symbols
306 * 308 *
307 * @self: struct elf_symtab instance to iterate 309 * @self: struct elf_symtab instance to iterate
308 * @index: uint32_t index 310 * @idx: uint32_t idx
309 * @sym: GElf_Sym iterator 311 * @sym: GElf_Sym iterator
310 */ 312 */
311#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \ 313#define elf_symtab__for_each_symbol(syms, nr_syms, idx, sym) \
312 for (index = 0, gelf_getsym(syms, index, &sym);\ 314 for (idx = 0, gelf_getsym(syms, idx, &sym);\
313 index < nr_syms; \ 315 idx < nr_syms; \
314 index++, gelf_getsym(syms, index, &sym)) 316 idx++, gelf_getsym(syms, idx, &sym))
315 317
316static inline uint8_t elf_sym__type(const GElf_Sym *sym) 318static inline uint8_t elf_sym__type(const GElf_Sym *sym)
317{ 319{
@@ -354,7 +356,7 @@ static inline const char *elf_sym__name(const GElf_Sym *sym,
354 356
355static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep, 357static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
356 GElf_Shdr *shp, const char *name, 358 GElf_Shdr *shp, const char *name,
357 size_t *index) 359 size_t *idx)
358{ 360{
359 Elf_Scn *sec = NULL; 361 Elf_Scn *sec = NULL;
360 size_t cnt = 1; 362 size_t cnt = 1;
@@ -365,8 +367,8 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
365 gelf_getshdr(sec, shp); 367 gelf_getshdr(sec, shp);
366 str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name); 368 str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
367 if (!strcmp(name, str)) { 369 if (!strcmp(name, str)) {
368 if (index) 370 if (idx)
369 *index = cnt; 371 *idx = cnt;
370 break; 372 break;
371 } 373 }
372 ++cnt; 374 ++cnt;
@@ -392,7 +394,7 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
392 * And always look at the original dso, not at debuginfo packages, that 394 * And always look at the original dso, not at debuginfo packages, that
393 * have the PLT data stripped out (shdr_rel_plt.sh_type == SHT_NOBITS). 395 * have the PLT data stripped out (shdr_rel_plt.sh_type == SHT_NOBITS).
394 */ 396 */
395static int dso__synthesize_plt_symbols(struct dso *self, int verbose) 397static int dso__synthesize_plt_symbols(struct dso *self, int v)
396{ 398{
397 uint32_t nr_rel_entries, idx; 399 uint32_t nr_rel_entries, idx;
398 GElf_Sym sym; 400 GElf_Sym sym;
@@ -442,7 +444,7 @@ static int dso__synthesize_plt_symbols(struct dso *self, int verbose)
442 goto out_elf_end; 444 goto out_elf_end;
443 445
444 /* 446 /*
445 * Fetch the relocation section to find the indexes to the GOT 447 * Fetch the relocation section to find the idxes to the GOT
446 * and the symbols in the .dynsym they refer to. 448 * and the symbols in the .dynsym they refer to.
447 */ 449 */
448 reldata = elf_getdata(scn_plt_rel, NULL); 450 reldata = elf_getdata(scn_plt_rel, NULL);
@@ -476,7 +478,7 @@ static int dso__synthesize_plt_symbols(struct dso *self, int verbose)
476 "%s@plt", elf_sym__name(&sym, symstrs)); 478 "%s@plt", elf_sym__name(&sym, symstrs));
477 479
478 f = symbol__new(plt_offset, shdr_plt.sh_entsize, 480 f = symbol__new(plt_offset, shdr_plt.sh_entsize,
479 sympltname, self->sym_priv_size, 0, verbose); 481 sympltname, self->sym_priv_size, 0, v);
480 if (!f) 482 if (!f)
481 goto out_elf_end; 483 goto out_elf_end;
482 484
@@ -494,7 +496,7 @@ static int dso__synthesize_plt_symbols(struct dso *self, int verbose)
494 "%s@plt", elf_sym__name(&sym, symstrs)); 496 "%s@plt", elf_sym__name(&sym, symstrs));
495 497
496 f = symbol__new(plt_offset, shdr_plt.sh_entsize, 498 f = symbol__new(plt_offset, shdr_plt.sh_entsize,
497 sympltname, self->sym_priv_size, 0, verbose); 499 sympltname, self->sym_priv_size, 0, v);
498 if (!f) 500 if (!f)
499 goto out_elf_end; 501 goto out_elf_end;
500 502
@@ -518,12 +520,12 @@ out:
518} 520}
519 521
520static int dso__load_sym(struct dso *self, int fd, const char *name, 522static int dso__load_sym(struct dso *self, int fd, const char *name,
521 symbol_filter_t filter, int verbose, struct module *mod) 523 symbol_filter_t filter, int v, struct module *mod)
522{ 524{
523 Elf_Data *symstrs, *secstrs; 525 Elf_Data *symstrs, *secstrs;
524 uint32_t nr_syms; 526 uint32_t nr_syms;
525 int err = -1; 527 int err = -1;
526 uint32_t index; 528 uint32_t idx;
527 GElf_Ehdr ehdr; 529 GElf_Ehdr ehdr;
528 GElf_Shdr shdr; 530 GElf_Shdr shdr;
529 Elf_Data *syms; 531 Elf_Data *syms;
@@ -534,14 +536,14 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
534 536
535 elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); 537 elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
536 if (elf == NULL) { 538 if (elf == NULL) {
537 if (verbose) 539 if (v)
538 fprintf(stderr, "%s: cannot read %s ELF file.\n", 540 fprintf(stderr, "%s: cannot read %s ELF file.\n",
539 __func__, name); 541 __func__, name);
540 goto out_close; 542 goto out_close;
541 } 543 }
542 544
543 if (gelf_getehdr(elf, &ehdr) == NULL) { 545 if (gelf_getehdr(elf, &ehdr) == NULL) {
544 if (verbose) 546 if (v)
545 fprintf(stderr, "%s: cannot get elf header.\n", __func__); 547 fprintf(stderr, "%s: cannot get elf header.\n", __func__);
546 goto out_elf_end; 548 goto out_elf_end;
547 } 549 }
@@ -583,9 +585,9 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
583 NULL) != NULL); 585 NULL) != NULL);
584 } else self->adjust_symbols = 0; 586 } else self->adjust_symbols = 0;
585 587
586 elf_symtab__for_each_symbol(syms, nr_syms, index, sym) { 588 elf_symtab__for_each_symbol(syms, nr_syms, idx, sym) {
587 struct symbol *f; 589 struct symbol *f;
588 const char *name; 590 const char *elf_name;
589 char *demangled; 591 char *demangled;
590 u64 obj_start; 592 u64 obj_start;
591 struct section *section = NULL; 593 struct section *section = NULL;
@@ -608,7 +610,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
608 obj_start = sym.st_value; 610 obj_start = sym.st_value;
609 611
610 if (self->adjust_symbols) { 612 if (self->adjust_symbols) {
611 if (verbose >= 2) 613 if (v >= 2)
612 printf("adjusting symbol: st_value: %Lx sh_addr: %Lx sh_offset: %Lx\n", 614 printf("adjusting symbol: st_value: %Lx sh_addr: %Lx sh_offset: %Lx\n",
613 (u64)sym.st_value, (u64)shdr.sh_addr, (u64)shdr.sh_offset); 615 (u64)sym.st_value, (u64)shdr.sh_addr, (u64)shdr.sh_offset);
614 616
@@ -630,13 +632,13 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
630 * DWARF DW_compile_unit has this, but we don't always have access 632 * DWARF DW_compile_unit has this, but we don't always have access
631 * to it... 633 * to it...
632 */ 634 */
633 name = elf_sym__name(&sym, symstrs); 635 elf_name = elf_sym__name(&sym, symstrs);
634 demangled = bfd_demangle(NULL, name, DMGL_PARAMS | DMGL_ANSI); 636 demangled = bfd_demangle(NULL, elf_name, DMGL_PARAMS | DMGL_ANSI);
635 if (demangled != NULL) 637 if (demangled != NULL)
636 name = demangled; 638 elf_name = demangled;
637 639
638 f = symbol__new(sym.st_value, sym.st_size, name, 640 f = symbol__new(sym.st_value, sym.st_size, elf_name,
639 self->sym_priv_size, obj_start, verbose); 641 self->sym_priv_size, obj_start, v);
640 free(demangled); 642 free(demangled);
641 if (!f) 643 if (!f)
642 goto out_elf_end; 644 goto out_elf_end;
@@ -659,7 +661,7 @@ out_close:
659 661
660#define BUILD_ID_SIZE 128 662#define BUILD_ID_SIZE 128
661 663
662static char *dso__read_build_id(struct dso *self, int verbose) 664static char *dso__read_build_id(struct dso *self, int v)
663{ 665{
664 int i; 666 int i;
665 GElf_Ehdr ehdr; 667 GElf_Ehdr ehdr;
@@ -676,14 +678,14 @@ static char *dso__read_build_id(struct dso *self, int verbose)
676 678
677 elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); 679 elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
678 if (elf == NULL) { 680 if (elf == NULL) {
679 if (verbose) 681 if (v)
680 fprintf(stderr, "%s: cannot read %s ELF file.\n", 682 fprintf(stderr, "%s: cannot read %s ELF file.\n",
681 __func__, self->name); 683 __func__, self->name);
682 goto out_close; 684 goto out_close;
683 } 685 }
684 686
685 if (gelf_getehdr(elf, &ehdr) == NULL) { 687 if (gelf_getehdr(elf, &ehdr) == NULL) {
686 if (verbose) 688 if (v)
687 fprintf(stderr, "%s: cannot get elf header.\n", __func__); 689 fprintf(stderr, "%s: cannot get elf header.\n", __func__);
688 goto out_elf_end; 690 goto out_elf_end;
689 } 691 }
@@ -706,7 +708,7 @@ static char *dso__read_build_id(struct dso *self, int verbose)
706 ++raw; 708 ++raw;
707 bid += 2; 709 bid += 2;
708 } 710 }
709 if (verbose >= 2) 711 if (v >= 2)
710 printf("%s(%s): %s\n", __func__, self->name, build_id); 712 printf("%s(%s): %s\n", __func__, self->name, build_id);
711out_elf_end: 713out_elf_end:
712 elf_end(elf); 714 elf_end(elf);
@@ -732,7 +734,7 @@ char dso__symtab_origin(const struct dso *self)
732 return origin[self->origin]; 734 return origin[self->origin];
733} 735}
734 736
735int dso__load(struct dso *self, symbol_filter_t filter, int verbose) 737int dso__load(struct dso *self, symbol_filter_t filter, int v)
736{ 738{
737 int size = PATH_MAX; 739 int size = PATH_MAX;
738 char *name = malloc(size), *build_id = NULL; 740 char *name = malloc(size), *build_id = NULL;
@@ -745,7 +747,7 @@ int dso__load(struct dso *self, symbol_filter_t filter, int verbose)
745 self->adjust_symbols = 0; 747 self->adjust_symbols = 0;
746 748
747 if (strncmp(self->name, "/tmp/perf-", 10) == 0) { 749 if (strncmp(self->name, "/tmp/perf-", 10) == 0) {
748 ret = dso__load_perf_map(self, filter, verbose); 750 ret = dso__load_perf_map(self, filter, v);
749 self->origin = ret > 0 ? DSO__ORIG_JAVA_JIT : 751 self->origin = ret > 0 ? DSO__ORIG_JAVA_JIT :
750 DSO__ORIG_NOT_FOUND; 752 DSO__ORIG_NOT_FOUND;
751 return ret; 753 return ret;
@@ -764,7 +766,7 @@ more:
764 snprintf(name, size, "/usr/lib/debug%s", self->name); 766 snprintf(name, size, "/usr/lib/debug%s", self->name);
765 break; 767 break;
766 case DSO__ORIG_BUILDID: 768 case DSO__ORIG_BUILDID:
767 build_id = dso__read_build_id(self, verbose); 769 build_id = dso__read_build_id(self, v);
768 if (build_id != NULL) { 770 if (build_id != NULL) {
769 snprintf(name, size, 771 snprintf(name, size,
770 "/usr/lib/debug/.build-id/%.2s/%s.debug", 772 "/usr/lib/debug/.build-id/%.2s/%s.debug",
@@ -785,7 +787,7 @@ more:
785 fd = open(name, O_RDONLY); 787 fd = open(name, O_RDONLY);
786 } while (fd < 0); 788 } while (fd < 0);
787 789
788 ret = dso__load_sym(self, fd, name, filter, verbose, NULL); 790 ret = dso__load_sym(self, fd, name, filter, v, NULL);
789 close(fd); 791 close(fd);
790 792
791 /* 793 /*
@@ -795,7 +797,7 @@ more:
795 goto more; 797 goto more;
796 798
797 if (ret > 0) { 799 if (ret > 0) {
798 int nr_plt = dso__synthesize_plt_symbols(self, verbose); 800 int nr_plt = dso__synthesize_plt_symbols(self, v);
799 if (nr_plt > 0) 801 if (nr_plt > 0)
800 ret += nr_plt; 802 ret += nr_plt;
801 } 803 }
@@ -807,7 +809,7 @@ out:
807} 809}
808 810
809static int dso__load_module(struct dso *self, struct mod_dso *mods, const char *name, 811static int dso__load_module(struct dso *self, struct mod_dso *mods, const char *name,
810 symbol_filter_t filter, int verbose) 812 symbol_filter_t filter, int v)
811{ 813{
812 struct module *mod = mod_dso__find_module(mods, name); 814 struct module *mod = mod_dso__find_module(mods, name);
813 int err = 0, fd; 815 int err = 0, fd;
@@ -820,13 +822,13 @@ static int dso__load_module(struct dso *self, struct mod_dso *mods, const char *
820 if (fd < 0) 822 if (fd < 0)
821 return err; 823 return err;
822 824
823 err = dso__load_sym(self, fd, name, filter, verbose, mod); 825 err = dso__load_sym(self, fd, name, filter, v, mod);
824 close(fd); 826 close(fd);
825 827
826 return err; 828 return err;
827} 829}
828 830
829int dso__load_modules(struct dso *self, symbol_filter_t filter, int verbose) 831int dso__load_modules(struct dso *self, symbol_filter_t filter, int v)
830{ 832{
831 struct mod_dso *mods = mod_dso__new_dso("modules"); 833 struct mod_dso *mods = mod_dso__new_dso("modules");
832 struct module *pos; 834 struct module *pos;
@@ -844,7 +846,7 @@ int dso__load_modules(struct dso *self, symbol_filter_t filter, int verbose)
844 next = rb_first(&mods->mods); 846 next = rb_first(&mods->mods);
845 while (next) { 847 while (next) {
846 pos = rb_entry(next, struct module, rb_node); 848 pos = rb_entry(next, struct module, rb_node);
847 err = dso__load_module(self, mods, pos->name, filter, verbose); 849 err = dso__load_module(self, mods, pos->name, filter, v);
848 850
849 if (err < 0) 851 if (err < 0)
850 break; 852 break;
@@ -887,14 +889,14 @@ static inline void dso__fill_symbol_holes(struct dso *self)
887} 889}
888 890
889static int dso__load_vmlinux(struct dso *self, const char *vmlinux, 891static int dso__load_vmlinux(struct dso *self, const char *vmlinux,
890 symbol_filter_t filter, int verbose) 892 symbol_filter_t filter, int v)
891{ 893{
892 int err, fd = open(vmlinux, O_RDONLY); 894 int err, fd = open(vmlinux, O_RDONLY);
893 895
894 if (fd < 0) 896 if (fd < 0)
895 return -1; 897 return -1;
896 898
897 err = dso__load_sym(self, fd, vmlinux, filter, verbose, NULL); 899 err = dso__load_sym(self, fd, vmlinux, filter, v, NULL);
898 900
899 if (err > 0) 901 if (err > 0)
900 dso__fill_symbol_holes(self); 902 dso__fill_symbol_holes(self);
@@ -905,18 +907,18 @@ static int dso__load_vmlinux(struct dso *self, const char *vmlinux,
905} 907}
906 908
907int dso__load_kernel(struct dso *self, const char *vmlinux, 909int dso__load_kernel(struct dso *self, const char *vmlinux,
908 symbol_filter_t filter, int verbose, int modules) 910 symbol_filter_t filter, int v, int use_modules)
909{ 911{
910 int err = -1; 912 int err = -1;
911 913
912 if (vmlinux) { 914 if (vmlinux) {
913 err = dso__load_vmlinux(self, vmlinux, filter, verbose); 915 err = dso__load_vmlinux(self, vmlinux, filter, v);
914 if (err > 0 && modules) 916 if (err > 0 && use_modules)
915 err = dso__load_modules(self, filter, verbose); 917 err = dso__load_modules(self, filter, v);
916 } 918 }
917 919
918 if (err <= 0) 920 if (err <= 0)
919 err = dso__load_kallsyms(self, filter, verbose); 921 err = dso__load_kallsyms(self, filter, v);
920 922
921 if (err > 0) 923 if (err > 0)
922 self->origin = DSO__ORIG_KERNEL; 924 self->origin = DSO__ORIG_KERNEL;
@@ -924,6 +926,103 @@ int dso__load_kernel(struct dso *self, const char *vmlinux,
924 return err; 926 return err;
925} 927}
926 928
929LIST_HEAD(dsos);
930struct dso *kernel_dso;
931struct dso *vdso;
932struct dso *hypervisor_dso;
933
934const char *vmlinux_name = "vmlinux";
935int modules;
936
937static void dsos__add(struct dso *dso)
938{
939 list_add_tail(&dso->node, &dsos);
940}
941
942static struct dso *dsos__find(const char *name)
943{
944 struct dso *pos;
945
946 list_for_each_entry(pos, &dsos, node)
947 if (strcmp(pos->name, name) == 0)
948 return pos;
949 return NULL;
950}
951
952struct dso *dsos__findnew(const char *name)
953{
954 struct dso *dso = dsos__find(name);
955 int nr;
956
957 if (dso)
958 return dso;
959
960 dso = dso__new(name, 0);
961 if (!dso)
962 goto out_delete_dso;
963
964 nr = dso__load(dso, NULL, verbose);
965 if (nr < 0) {
966 eprintf("Failed to open: %s\n", name);
967 goto out_delete_dso;
968 }
969 if (!nr)
970 eprintf("No symbols found in: %s, maybe install a debug package?\n", name);
971
972 dsos__add(dso);
973
974 return dso;
975
976out_delete_dso:
977 dso__delete(dso);
978 return NULL;
979}
980
981void dsos__fprintf(FILE *fp)
982{
983 struct dso *pos;
984
985 list_for_each_entry(pos, &dsos, node)
986 dso__fprintf(pos, fp);
987}
988
989static struct symbol *vdso__find_symbol(struct dso *dso, u64 ip)
990{
991 return dso__find_symbol(dso, ip);
992}
993
994int load_kernel(void)
995{
996 int err;
997
998 kernel_dso = dso__new("[kernel]", 0);
999 if (!kernel_dso)
1000 return -1;
1001
1002 err = dso__load_kernel(kernel_dso, vmlinux_name, NULL, verbose, modules);
1003 if (err <= 0) {
1004 dso__delete(kernel_dso);
1005 kernel_dso = NULL;
1006 } else
1007 dsos__add(kernel_dso);
1008
1009 vdso = dso__new("[vdso]", 0);
1010 if (!vdso)
1011 return -1;
1012
1013 vdso->find_symbol = vdso__find_symbol;
1014
1015 dsos__add(vdso);
1016
1017 hypervisor_dso = dso__new("[hypervisor]", 0);
1018 if (!hypervisor_dso)
1019 return -1;
1020 dsos__add(hypervisor_dso);
1021
1022 return err;
1023}
1024
1025
927void symbol__init(void) 1026void symbol__init(void)
928{ 1027{
929 elf_version(EV_CURRENT); 1028 elf_version(EV_CURRENT);
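
The block above moves the shared DSO bookkeeping (the dsos list, kernel_dso, vdso, hypervisor_dso, dsos__findnew() and load_kernel()) out of the individual builtins and into symbol.c so every perf command uses one implementation. A minimal caller-side sketch of the intended use, assuming the declarations added to symbol.h below, the existing dso__find_symbol() API, and compilation inside the perf tree; the helper names resolve_user_ip/setup_kernel_symbols are illustrative, not part of the patch:

/* Illustrative caller of the shared DSO helpers; not part of this patch. */
#include "util/symbol.h"

static struct symbol *resolve_user_ip(const char *dso_name, u64 ip)
{
	/* looks the DSO up in the global list, loading its symbols on first use */
	struct dso *dso = dsos__findnew(dso_name);

	return dso ? dso__find_symbol(dso, ip) : NULL;
}

static int setup_kernel_symbols(void)
{
	/* fills kernel_dso from vmlinux_name (plus modules when requested),
	 * falling back to kallsyms inside dso__load_kernel() */
	return load_kernel() < 0 ? -1 : 0;
}
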
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index b53bf0125c1b..6e8490716408 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -6,6 +6,7 @@
6#include <linux/list.h> 6#include <linux/list.h>
7#include <linux/rbtree.h> 7#include <linux/rbtree.h>
8#include "module.h" 8#include "module.h"
9#include "event.h"
9 10
10#ifdef HAVE_CPLUS_DEMANGLE 11#ifdef HAVE_CPLUS_DEMANGLE
11extern char *cplus_demangle(const char *, int); 12extern char *cplus_demangle(const char *, int);
@@ -54,7 +55,7 @@ struct dso {
54 char name[0]; 55 char name[0];
55}; 56};
56 57
57const char *sym_hist_filter; 58extern const char *sym_hist_filter;
58 59
59typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym); 60typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym);
60 61
@@ -72,9 +73,20 @@ int dso__load_kernel(struct dso *self, const char *vmlinux,
72 symbol_filter_t filter, int verbose, int modules); 73 symbol_filter_t filter, int verbose, int modules);
73int dso__load_modules(struct dso *self, symbol_filter_t filter, int verbose); 74int dso__load_modules(struct dso *self, symbol_filter_t filter, int verbose);
74int dso__load(struct dso *self, symbol_filter_t filter, int verbose); 75int dso__load(struct dso *self, symbol_filter_t filter, int verbose);
76struct dso *dsos__findnew(const char *name);
77void dsos__fprintf(FILE *fp);
75 78
76size_t dso__fprintf(struct dso *self, FILE *fp); 79size_t dso__fprintf(struct dso *self, FILE *fp);
77char dso__symtab_origin(const struct dso *self); 80char dso__symtab_origin(const struct dso *self);
78 81
82int load_kernel(void);
83
79void symbol__init(void); 84void symbol__init(void);
85
86extern struct list_head dsos;
87extern struct dso *kernel_dso;
88extern struct dso *vdso;
89extern struct dso *hypervisor_dso;
90extern const char *vmlinux_name;
91extern int modules;
80#endif /* _PERF_SYMBOL_ */ 92#endif /* _PERF_SYMBOL_ */
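
The sym_hist_filter change above (from a definition to an extern declaration) is required now that symbol.h is included from several translation units; exactly one .c file must keep the actual definition. A minimal illustration of the pattern, with a hypothetical file name (in this series the definition lives in one of the perf builtins):

/* one_builtin.c — the single translation unit that owns the definition;
 * every other file only sees the 'extern' declaration from symbol.h. */
#include "util/symbol.h"

const char *sym_hist_filter;
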
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
new file mode 100644
index 000000000000..7635928ca278
--- /dev/null
+++ b/tools/perf/util/thread.c
@@ -0,0 +1,175 @@
1#include "../perf.h"
2#include <stdlib.h>
3#include <stdio.h>
4#include <string.h>
5#include "thread.h"
6#include "util.h"
7#include "debug.h"
8
9static struct thread *thread__new(pid_t pid)
10{
11 struct thread *self = malloc(sizeof(*self));
12
13 if (self != NULL) {
14 self->pid = pid;
15 self->comm = malloc(32);
16 if (self->comm)
17 snprintf(self->comm, 32, ":%d", self->pid);
18 INIT_LIST_HEAD(&self->maps);
19 }
20
21 return self;
22}
23
24int thread__set_comm(struct thread *self, const char *comm)
25{
26 if (self->comm)
27 free(self->comm);
28 self->comm = strdup(comm);
29 return self->comm ? 0 : -ENOMEM;
30}
31
32static size_t thread__fprintf(struct thread *self, FILE *fp)
33{
34 struct map *pos;
35 size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
36
37 list_for_each_entry(pos, &self->maps, node)
38 ret += map__fprintf(pos, fp);
39
40 return ret;
41}
42
43struct thread *
44threads__findnew(pid_t pid, struct rb_root *threads, struct thread **last_match)
45{
46 struct rb_node **p = &threads->rb_node;
47 struct rb_node *parent = NULL;
48 struct thread *th;
49
50 /*
 51 * Front-end cache - PID lookups come in blocks,
 52 * so most of the time we don't have to look up
53 * the full rbtree:
54 */
55 if (*last_match && (*last_match)->pid == pid)
56 return *last_match;
57
58 while (*p != NULL) {
59 parent = *p;
60 th = rb_entry(parent, struct thread, rb_node);
61
62 if (th->pid == pid) {
63 *last_match = th;
64 return th;
65 }
66
67 if (pid < th->pid)
68 p = &(*p)->rb_left;
69 else
70 p = &(*p)->rb_right;
71 }
72
73 th = thread__new(pid);
74 if (th != NULL) {
75 rb_link_node(&th->rb_node, parent, p);
76 rb_insert_color(&th->rb_node, threads);
77 *last_match = th;
78 }
79
80 return th;
81}
82
83struct thread *
84register_idle_thread(struct rb_root *threads, struct thread **last_match)
85{
86 struct thread *thread = threads__findnew(0, threads, last_match);
87
88 if (!thread || thread__set_comm(thread, "[init]")) {
89 fprintf(stderr, "problem inserting idle task.\n");
90 exit(-1);
91 }
92
93 return thread;
94}
95
96void thread__insert_map(struct thread *self, struct map *map)
97{
98 struct map *pos, *tmp;
99
100 list_for_each_entry_safe(pos, tmp, &self->maps, node) {
101 if (map__overlap(pos, map)) {
102 if (verbose >= 2) {
103 printf("overlapping maps:\n");
104 map__fprintf(map, stdout);
105 map__fprintf(pos, stdout);
106 }
107
108 if (map->start <= pos->start && map->end > pos->start)
109 pos->start = map->end;
110
111 if (map->end >= pos->end && map->start < pos->end)
112 pos->end = map->start;
113
114 if (verbose >= 2) {
115 printf("after collision:\n");
116 map__fprintf(pos, stdout);
117 }
118
119 if (pos->start >= pos->end) {
120 list_del_init(&pos->node);
121 free(pos);
122 }
123 }
124 }
125
126 list_add_tail(&map->node, &self->maps);
127}
128
129int thread__fork(struct thread *self, struct thread *parent)
130{
131 struct map *map;
132
133 if (self->comm)
134 free(self->comm);
135 self->comm = strdup(parent->comm);
136 if (!self->comm)
137 return -ENOMEM;
138
139 list_for_each_entry(map, &parent->maps, node) {
140 struct map *new = map__clone(map);
141 if (!new)
142 return -ENOMEM;
143 thread__insert_map(self, new);
144 }
145
146 return 0;
147}
148
149struct map *thread__find_map(struct thread *self, u64 ip)
150{
151 struct map *pos;
152
153 if (self == NULL)
154 return NULL;
155
156 list_for_each_entry(pos, &self->maps, node)
157 if (ip >= pos->start && ip <= pos->end)
158 return pos;
159
160 return NULL;
161}
162
163size_t threads__fprintf(FILE *fp, struct rb_root *threads)
164{
165 size_t ret = 0;
166 struct rb_node *nd;
167
168 for (nd = rb_first(threads); nd; nd = rb_next(nd)) {
169 struct thread *pos = rb_entry(nd, struct thread, rb_node);
170
171 ret += thread__fprintf(pos, fp);
172 }
173
174 return ret;
175}
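
thread.c above keeps one rbtree of threads keyed by pid plus a single-entry cache (*last_match), since consecutive events usually belong to the same task. A caller-side sketch using only the functions declared in thread.h below; the wrapper names are illustrative and not part of the patch:

/* Illustrative use of the shared thread map; not part of this patch. */
#include "util/thread.h"

static struct rb_root threads = RB_ROOT;
static struct thread *last_match;

static void threads_init(void)
{
	/* pid 0 gets the synthetic "[init]" comm so early samples still resolve */
	register_idle_thread(&threads, &last_match);
}

static struct map *map_for_sample(pid_t pid, u64 ip)
{
	/* cached lookup, falling back to the rbtree, creating the thread if new */
	struct thread *thread = threads__findnew(pid, &threads, &last_match);

	if (thread == NULL)
		return NULL;

	/* linear walk of the thread's mmap list for the range containing ip */
	return thread__find_map(thread, ip);
}
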
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
new file mode 100644
index 000000000000..634f2809a342
--- /dev/null
+++ b/tools/perf/util/thread.h
@@ -0,0 +1,21 @@
1#include <linux/rbtree.h>
2#include <linux/list.h>
3#include <unistd.h>
4#include "symbol.h"
5
6struct thread {
7 struct rb_node rb_node;
8 struct list_head maps;
9 pid_t pid;
10 char *comm;
11};
12
13int thread__set_comm(struct thread *self, const char *comm);
14struct thread *
15threads__findnew(pid_t pid, struct rb_root *threads, struct thread **last_match);
16struct thread *
17register_idle_thread(struct rb_root *threads, struct thread **last_match);
18void thread__insert_map(struct thread *self, struct map *map);
19int thread__fork(struct thread *self, struct thread *parent);
20struct map *thread__find_map(struct thread *self, u64 ip);
21size_t threads__fprintf(FILE *fp, struct rb_root *threads);
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
new file mode 100644
index 000000000000..6c9302a7274c
--- /dev/null
+++ b/tools/perf/util/trace-event-info.c
@@ -0,0 +1,539 @@
1/*
2 * Copyright (C) 2008,2009, Steven Rostedt <srostedt@redhat.com>
3 *
4 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; version 2 of the License (not later!)
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 */
21#define _GNU_SOURCE
22#include <dirent.h>
23#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include <stdarg.h>
27#include <sys/types.h>
28#include <sys/stat.h>
29#include <sys/wait.h>
30#include <pthread.h>
31#include <fcntl.h>
32#include <unistd.h>
33#include <ctype.h>
34#include <errno.h>
35#include <stdbool.h>
36
37#include "../perf.h"
38#include "trace-event.h"
39
40
41#define VERSION "0.5"
42
43#define _STR(x) #x
44#define STR(x) _STR(x)
45#define MAX_PATH 256
46
47#define TRACE_CTRL "tracing_on"
48#define TRACE "trace"
49#define AVAILABLE "available_tracers"
50#define CURRENT "current_tracer"
51#define ITER_CTRL "trace_options"
52#define MAX_LATENCY "tracing_max_latency"
53
54unsigned int page_size;
55
56static const char *output_file = "trace.info";
57static int output_fd;
58
59struct event_list {
60 struct event_list *next;
61 const char *event;
62};
63
64struct events {
65 struct events *sibling;
66 struct events *children;
67 struct events *next;
68 char *name;
69};
70
71
72
73static void die(const char *fmt, ...)
74{
75 va_list ap;
76 int ret = errno;
77
78 if (errno)
79 perror("trace-cmd");
80 else
81 ret = -1;
82
83 va_start(ap, fmt);
84 fprintf(stderr, " ");
85 vfprintf(stderr, fmt, ap);
86 va_end(ap);
87
88 fprintf(stderr, "\n");
89 exit(ret);
90}
91
92void *malloc_or_die(unsigned int size)
93{
94 void *data;
95
96 data = malloc(size);
97 if (!data)
98 die("malloc");
99 return data;
100}
101
102static const char *find_debugfs(void)
103{
104 static char debugfs[MAX_PATH+1];
105 static int debugfs_found;
106 char type[100];
107 FILE *fp;
108
109 if (debugfs_found)
110 return debugfs;
111
112 if ((fp = fopen("/proc/mounts","r")) == NULL)
113 die("Can't open /proc/mounts for read");
114
115 while (fscanf(fp, "%*s %"
116 STR(MAX_PATH)
117 "s %99s %*s %*d %*d\n",
118 debugfs, type) == 2) {
119 if (strcmp(type, "debugfs") == 0)
120 break;
121 }
122 fclose(fp);
123
124 if (strcmp(type, "debugfs") != 0)
125 die("debugfs not mounted, please mount");
126
127 debugfs_found = 1;
128
129 return debugfs;
130}
131
132/*
133 * Finds the path to the debugfs/tracing
134 * Allocates the string and stores it.
135 */
136static const char *find_tracing_dir(void)
137{
138 static char *tracing;
139 static int tracing_found;
140 const char *debugfs;
141
142 if (tracing_found)
143 return tracing;
144
145 debugfs = find_debugfs();
146
147 tracing = malloc_or_die(strlen(debugfs) + 9);
148
149 sprintf(tracing, "%s/tracing", debugfs);
150
151 tracing_found = 1;
152 return tracing;
153}
154
155static char *get_tracing_file(const char *name)
156{
157 const char *tracing;
158 char *file;
159
160 tracing = find_tracing_dir();
161 if (!tracing)
162 return NULL;
163
164 file = malloc_or_die(strlen(tracing) + strlen(name) + 2);
165
166 sprintf(file, "%s/%s", tracing, name);
167 return file;
168}
169
170static void put_tracing_file(char *file)
171{
172 free(file);
173}
174
175static ssize_t write_or_die(const void *buf, size_t len)
176{
177 int ret;
178
179 ret = write(output_fd, buf, len);
180 if (ret < 0)
181 die("writing to '%s'", output_file);
182
183 return ret;
184}
185
186int bigendian(void)
187{
188 unsigned char str[] = { 0x1, 0x2, 0x3, 0x4, 0x0, 0x0, 0x0, 0x0};
189 unsigned int *ptr;
190
191 ptr = (unsigned int *)(void *)str;
192 return *ptr == 0x01020304;
193}
194
195static unsigned long long copy_file_fd(int fd)
196{
197 unsigned long long size = 0;
198 char buf[BUFSIZ];
199 int r;
200
201 do {
202 r = read(fd, buf, BUFSIZ);
203 if (r > 0) {
204 size += r;
205 write_or_die(buf, r);
206 }
207 } while (r > 0);
208
209 return size;
210}
211
212static unsigned long long copy_file(const char *file)
213{
214 unsigned long long size = 0;
215 int fd;
216
217 fd = open(file, O_RDONLY);
218 if (fd < 0)
219 die("Can't read '%s'", file);
220 size = copy_file_fd(fd);
221 close(fd);
222
223 return size;
224}
225
226static unsigned long get_size_fd(int fd)
227{
228 unsigned long long size = 0;
229 char buf[BUFSIZ];
230 int r;
231
232 do {
233 r = read(fd, buf, BUFSIZ);
234 if (r > 0)
235 size += r;
236 } while (r > 0);
237
238 lseek(fd, 0, SEEK_SET);
239
240 return size;
241}
242
243static unsigned long get_size(const char *file)
244{
245 unsigned long long size = 0;
246 int fd;
247
248 fd = open(file, O_RDONLY);
249 if (fd < 0)
250 die("Can't read '%s'", file);
251 size = get_size_fd(fd);
252 close(fd);
253
254 return size;
255}
256
257static void read_header_files(void)
258{
259 unsigned long long size, check_size;
260 char *path;
261 int fd;
262
263 path = get_tracing_file("events/header_page");
264 fd = open(path, O_RDONLY);
265 if (fd < 0)
266 die("can't read '%s'", path);
267
268 /* unfortunately, you can not stat debugfs files for size */
269 size = get_size_fd(fd);
270
271 write_or_die("header_page", 12);
272 write_or_die(&size, 8);
273 check_size = copy_file_fd(fd);
274 if (size != check_size)
275 die("wrong size for '%s' size=%lld read=%lld",
276 path, size, check_size);
277 put_tracing_file(path);
278
279 path = get_tracing_file("events/header_event");
280 fd = open(path, O_RDONLY);
281 if (fd < 0)
282 die("can't read '%s'", path);
283
284 size = get_size_fd(fd);
285
286 write_or_die("header_event", 13);
287 write_or_die(&size, 8);
288 check_size = copy_file_fd(fd);
289 if (size != check_size)
290 die("wrong size for '%s'", path);
291 put_tracing_file(path);
292}
293
294static bool name_in_tp_list(char *sys, struct tracepoint_path *tps)
295{
296 while (tps) {
297 if (!strcmp(sys, tps->name))
298 return true;
299 tps = tps->next;
300 }
301
302 return false;
303}
304
305static void copy_event_system(const char *sys, struct tracepoint_path *tps)
306{
307 unsigned long long size, check_size;
308 struct dirent *dent;
309 struct stat st;
310 char *format;
311 DIR *dir;
312 int count = 0;
313 int ret;
314
315 dir = opendir(sys);
316 if (!dir)
317 die("can't read directory '%s'", sys);
318
319 while ((dent = readdir(dir))) {
320 if (strcmp(dent->d_name, ".") == 0 ||
321 strcmp(dent->d_name, "..") == 0 ||
322 !name_in_tp_list(dent->d_name, tps))
323 continue;
324 format = malloc_or_die(strlen(sys) + strlen(dent->d_name) + 10);
325 sprintf(format, "%s/%s/format", sys, dent->d_name);
326 ret = stat(format, &st);
327 free(format);
328 if (ret < 0)
329 continue;
330 count++;
331 }
332
333 write_or_die(&count, 4);
334
335 rewinddir(dir);
336 while ((dent = readdir(dir))) {
337 if (strcmp(dent->d_name, ".") == 0 ||
338 strcmp(dent->d_name, "..") == 0 ||
339 !name_in_tp_list(dent->d_name, tps))
340 continue;
341 format = malloc_or_die(strlen(sys) + strlen(dent->d_name) + 10);
342 sprintf(format, "%s/%s/format", sys, dent->d_name);
343 ret = stat(format, &st);
344
345 if (ret >= 0) {
346 /* unfortunately, you can not stat debugfs files for size */
347 size = get_size(format);
348 write_or_die(&size, 8);
349 check_size = copy_file(format);
350 if (size != check_size)
351 die("error in size of file '%s'", format);
352 }
353
354 free(format);
355 }
356}
357
358static void read_ftrace_files(struct tracepoint_path *tps)
359{
360 char *path;
361
362 path = get_tracing_file("events/ftrace");
363
364 copy_event_system(path, tps);
365
366 put_tracing_file(path);
367}
368
369static bool system_in_tp_list(char *sys, struct tracepoint_path *tps)
370{
371 while (tps) {
372 if (!strcmp(sys, tps->system))
373 return true;
374 tps = tps->next;
375 }
376
377 return false;
378}
379
380static void read_event_files(struct tracepoint_path *tps)
381{
382 struct dirent *dent;
383 struct stat st;
384 char *path;
385 char *sys;
386 DIR *dir;
387 int count = 0;
388 int ret;
389
390 path = get_tracing_file("events");
391
392 dir = opendir(path);
393 if (!dir)
394 die("can't read directory '%s'", path);
395
396 while ((dent = readdir(dir))) {
397 if (strcmp(dent->d_name, ".") == 0 ||
398 strcmp(dent->d_name, "..") == 0 ||
399 strcmp(dent->d_name, "ftrace") == 0 ||
400 !system_in_tp_list(dent->d_name, tps))
401 continue;
402 sys = malloc_or_die(strlen(path) + strlen(dent->d_name) + 2);
403 sprintf(sys, "%s/%s", path, dent->d_name);
404 ret = stat(sys, &st);
405 free(sys);
406 if (ret < 0)
407 continue;
408 if (S_ISDIR(st.st_mode))
409 count++;
410 }
411
412 write_or_die(&count, 4);
413
414 rewinddir(dir);
415 while ((dent = readdir(dir))) {
416 if (strcmp(dent->d_name, ".") == 0 ||
417 strcmp(dent->d_name, "..") == 0 ||
418 strcmp(dent->d_name, "ftrace") == 0 ||
419 !system_in_tp_list(dent->d_name, tps))
420 continue;
421 sys = malloc_or_die(strlen(path) + strlen(dent->d_name) + 2);
422 sprintf(sys, "%s/%s", path, dent->d_name);
423 ret = stat(sys, &st);
424 if (ret >= 0) {
425 if (S_ISDIR(st.st_mode)) {
426 write_or_die(dent->d_name, strlen(dent->d_name) + 1);
427 copy_event_system(sys, tps);
428 }
429 }
430 free(sys);
431 }
432
433 put_tracing_file(path);
434}
435
436static void read_proc_kallsyms(void)
437{
438 unsigned int size, check_size;
439 const char *path = "/proc/kallsyms";
440 struct stat st;
441 int ret;
442
443 ret = stat(path, &st);
444 if (ret < 0) {
445 /* not found */
446 size = 0;
447 write_or_die(&size, 4);
448 return;
449 }
450 size = get_size(path);
451 write_or_die(&size, 4);
452 check_size = copy_file(path);
453 if (size != check_size)
454 die("error in size of file '%s'", path);
455
456}
457
458static void read_ftrace_printk(void)
459{
460 unsigned int size, check_size;
461 const char *path;
462 struct stat st;
463 int ret;
464
465 path = get_tracing_file("printk_formats");
466 ret = stat(path, &st);
467 if (ret < 0) {
468 /* not found */
469 size = 0;
470 write_or_die(&size, 4);
471 return;
472 }
473 size = get_size(path);
474 write_or_die(&size, 4);
475 check_size = copy_file(path);
476 if (size != check_size)
477 die("error in size of file '%s'", path);
478
479}
480
481static struct tracepoint_path *
482get_tracepoints_path(struct perf_counter_attr *pattrs, int nb_counters)
483{
484 struct tracepoint_path path, *ppath = &path;
485 int i;
486
487 for (i = 0; i < nb_counters; i++) {
488 if (pattrs[i].type != PERF_TYPE_TRACEPOINT)
489 continue;
490 ppath->next = tracepoint_id_to_path(pattrs[i].config);
491 if (!ppath->next)
492 die("%s\n", "No memory to alloc tracepoints list");
493 ppath = ppath->next;
494 }
495
496 return path.next;
497}
498void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters)
499{
500 char buf[BUFSIZ];
501 struct tracepoint_path *tps;
502
503 output_fd = open(output_file, O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE, 0644);
504 if (output_fd < 0)
505 die("creating file '%s'", output_file);
506
507 buf[0] = 23;
508 buf[1] = 8;
509 buf[2] = 68;
510 memcpy(buf + 3, "tracing", 7);
511
512 write_or_die(buf, 10);
513
514 write_or_die(VERSION, strlen(VERSION) + 1);
515
516 /* save endian */
517 if (bigendian())
518 buf[0] = 1;
519 else
520 buf[0] = 0;
521
522 write_or_die(buf, 1);
523
524 /* save size of long */
525 buf[0] = sizeof(long);
526 write_or_die(buf, 1);
527
528 /* save page_size */
529 page_size = getpagesize();
530 write_or_die(&page_size, 4);
531
532 tps = get_tracepoints_path(pattrs, nb_counters);
533
534 read_header_files();
535 read_ftrace_files(tps);
536 read_event_files(tps);
537 read_proc_kallsyms();
538 read_ftrace_printk();
539}
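
read_tracing_data() above lays the trace.info file out as: the three magic bytes 23, 8, 68, the seven characters "tracing", a NUL-terminated version string ("0.5"), one endianness byte, one byte for sizeof(long), a 4-byte page size, and then the header/ftrace/event/kallsyms/printk sections, each prefixed by its size. A hypothetical reader that validates just that fixed prefix (a sketch only; it assumes the reader and writer share byte order instead of honouring the endian flag):

/* Hypothetical reader for the trace.info prefix written above. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int read_trace_info_prefix(int fd)
{
	unsigned int page_size;
	char version[64];
	char flags[2];		/* [0] endianness, [1] sizeof(long) */
	char magic[10];
	unsigned int i;

	/* three magic bytes followed by the 7 characters of "tracing" (no NUL) */
	if (read(fd, magic, 10) != 10 ||
	    magic[0] != 23 || magic[1] != 8 || magic[2] != 68 ||
	    memcmp(magic + 3, "tracing", 7) != 0)
		return -1;

	/* NUL-terminated version string, "0.5" for this writer */
	for (i = 0; i < sizeof(version); i++) {
		if (read(fd, version + i, 1) != 1)
			return -1;
		if (version[i] == '\0')
			break;
	}
	if (i == sizeof(version))
		return -1;

	/* endianness byte, sizeof(long) byte, then the 4-byte page size */
	if (read(fd, flags, 2) != 2 || read(fd, &page_size, 4) != 4)
		return -1;

	printf("trace.info v%s: %s endian, %d-byte long, %u-byte pages\n",
	       version, flags[0] ? "big" : "little", flags[1], page_size);
	return 0;
}
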
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
new file mode 100644
index 000000000000..629e602d9405
--- /dev/null
+++ b/tools/perf/util/trace-event-parse.c
@@ -0,0 +1,2942 @@
1/*
2 * Copyright (C) 2009, Steven Rostedt <srostedt@redhat.com>
3 *
4 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; version 2 of the License (not later!)
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 *
21 * The parts for function graph printing was taken and modified from the
22 * Linux Kernel that were written by Frederic Weisbecker.
23 */
24#define _GNU_SOURCE
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <ctype.h>
29#include <errno.h>
30
31#undef _GNU_SOURCE
32#include "../perf.h"
33#include "util.h"
34#include "trace-event.h"
35
36int header_page_ts_offset;
37int header_page_ts_size;
38int header_page_size_offset;
39int header_page_size_size;
40int header_page_data_offset;
41int header_page_data_size;
42
43static char *input_buf;
44static unsigned long long input_buf_ptr;
45static unsigned long long input_buf_siz;
46
47static int cpus;
48static int long_size;
49
50static void init_input_buf(char *buf, unsigned long long size)
51{
52 input_buf = buf;
53 input_buf_siz = size;
54 input_buf_ptr = 0;
55}
56
57struct cmdline {
58 char *comm;
59 int pid;
60};
61
62static struct cmdline *cmdlines;
63static int cmdline_count;
64
65static int cmdline_cmp(const void *a, const void *b)
66{
67 const struct cmdline *ca = a;
68 const struct cmdline *cb = b;
69
70 if (ca->pid < cb->pid)
71 return -1;
72 if (ca->pid > cb->pid)
73 return 1;
74
75 return 0;
76}
77
78void parse_cmdlines(char *file, int size __unused)
79{
80 struct cmdline_list {
81 struct cmdline_list *next;
82 char *comm;
83 int pid;
84 } *list = NULL, *item;
85 char *line;
86 char *next = NULL;
87 int i;
88
89 line = strtok_r(file, "\n", &next);
90 while (line) {
91 item = malloc_or_die(sizeof(*item));
92 sscanf(line, "%d %as", &item->pid,
93 (float *)(void *)&item->comm); /* workaround gcc warning */
94 item->next = list;
95 list = item;
96 line = strtok_r(NULL, "\n", &next);
97 cmdline_count++;
98 }
99
100 cmdlines = malloc_or_die(sizeof(*cmdlines) * cmdline_count);
101
102 i = 0;
103 while (list) {
104 cmdlines[i].pid = list->pid;
105 cmdlines[i].comm = list->comm;
106 i++;
107 item = list;
108 list = list->next;
109 free(item);
110 }
111
112 qsort(cmdlines, cmdline_count, sizeof(*cmdlines), cmdline_cmp);
113}
114
115static struct func_map {
116 unsigned long long addr;
117 char *func;
118 char *mod;
119} *func_list;
120static unsigned int func_count;
121
122static int func_cmp(const void *a, const void *b)
123{
124 const struct func_map *fa = a;
125 const struct func_map *fb = b;
126
127 if (fa->addr < fb->addr)
128 return -1;
129 if (fa->addr > fb->addr)
130 return 1;
131
132 return 0;
133}
134
135void parse_proc_kallsyms(char *file, unsigned int size __unused)
136{
137 struct func_list {
138 struct func_list *next;
139 unsigned long long addr;
140 char *func;
141 char *mod;
142 } *list = NULL, *item;
143 char *line;
144 char *next = NULL;
145 char *addr_str;
146 char ch;
147 int ret;
148 int i;
149
150 line = strtok_r(file, "\n", &next);
151 while (line) {
152 item = malloc_or_die(sizeof(*item));
153 item->mod = NULL;
154 ret = sscanf(line, "%as %c %as\t[%as",
155 (float *)(void *)&addr_str, /* workaround gcc warning */
156 &ch,
157 (float *)(void *)&item->func,
158 (float *)(void *)&item->mod);
159 item->addr = strtoull(addr_str, NULL, 16);
160 free(addr_str);
161
162 /* truncate the extra ']' */
163 if (item->mod)
164 item->mod[strlen(item->mod) - 1] = 0;
165
166
167 item->next = list;
168 list = item;
169 line = strtok_r(NULL, "\n", &next);
170 func_count++;
171 }
172
 173 func_list = malloc_or_die(sizeof(*func_list) * (func_count + 1));
174
175 i = 0;
176 while (list) {
177 func_list[i].func = list->func;
178 func_list[i].addr = list->addr;
179 func_list[i].mod = list->mod;
180 i++;
181 item = list;
182 list = list->next;
183 free(item);
184 }
185
186 qsort(func_list, func_count, sizeof(*func_list), func_cmp);
187
188 /*
189 * Add a special record at the end.
190 */
191 func_list[func_count].func = NULL;
192 func_list[func_count].addr = 0;
193 func_list[func_count].mod = NULL;
194}
195
196/*
197 * We are searching for a record in between, not an exact
198 * match.
199 */
200static int func_bcmp(const void *a, const void *b)
201{
202 const struct func_map *fa = a;
203 const struct func_map *fb = b;
204
205 if ((fa->addr == fb->addr) ||
206
207 (fa->addr > fb->addr &&
208 fa->addr < (fb+1)->addr))
209 return 0;
210
211 if (fa->addr < fb->addr)
212 return -1;
213
214 return 1;
215}
216
217static struct func_map *find_func(unsigned long long addr)
218{
219 struct func_map *func;
220 struct func_map key;
221
222 key.addr = addr;
223
224 func = bsearch(&key, func_list, func_count, sizeof(*func_list),
225 func_bcmp);
226
227 return func;
228}
229
230void print_funcs(void)
231{
232 int i;
233
234 for (i = 0; i < (int)func_count; i++) {
235 printf("%016llx %s",
236 func_list[i].addr,
237 func_list[i].func);
238 if (func_list[i].mod)
239 printf(" [%s]\n", func_list[i].mod);
240 else
241 printf("\n");
242 }
243}
244
245static struct printk_map {
246 unsigned long long addr;
247 char *printk;
248} *printk_list;
249static unsigned int printk_count;
250
251static int printk_cmp(const void *a, const void *b)
252{
253 const struct func_map *fa = a;
254 const struct func_map *fb = b;
255
256 if (fa->addr < fb->addr)
257 return -1;
258 if (fa->addr > fb->addr)
259 return 1;
260
261 return 0;
262}
263
264static struct printk_map *find_printk(unsigned long long addr)
265{
266 struct printk_map *printk;
267 struct printk_map key;
268
269 key.addr = addr;
270
271 printk = bsearch(&key, printk_list, printk_count, sizeof(*printk_list),
272 printk_cmp);
273
274 return printk;
275}
276
277void parse_ftrace_printk(char *file, unsigned int size __unused)
278{
279 struct printk_list {
280 struct printk_list *next;
281 unsigned long long addr;
282 char *printk;
283 } *list = NULL, *item;
284 char *line;
285 char *next = NULL;
286 char *addr_str;
287 int ret;
288 int i;
289
290 line = strtok_r(file, "\n", &next);
291 while (line) {
292 item = malloc_or_die(sizeof(*item));
293 ret = sscanf(line, "%as : %as",
294 (float *)(void *)&addr_str, /* workaround gcc warning */
295 (float *)(void *)&item->printk);
296 item->addr = strtoull(addr_str, NULL, 16);
297 free(addr_str);
298
299 item->next = list;
300 list = item;
301 line = strtok_r(NULL, "\n", &next);
302 printk_count++;
303 }
304
305 printk_list = malloc_or_die(sizeof(*printk_list) * printk_count + 1);
306
307 i = 0;
308 while (list) {
309 printk_list[i].printk = list->printk;
310 printk_list[i].addr = list->addr;
311 i++;
312 item = list;
313 list = list->next;
314 free(item);
315 }
316
317 qsort(printk_list, printk_count, sizeof(*printk_list), printk_cmp);
318}
319
320void print_printk(void)
321{
322 int i;
323
324 for (i = 0; i < (int)printk_count; i++) {
325 printf("%016llx %s\n",
326 printk_list[i].addr,
327 printk_list[i].printk);
328 }
329}
330
331static struct event *alloc_event(void)
332{
333 struct event *event;
334
335 event = malloc_or_die(sizeof(*event));
336 memset(event, 0, sizeof(*event));
337
338 return event;
339}
340
341enum event_type {
342 EVENT_ERROR,
343 EVENT_NONE,
344 EVENT_SPACE,
345 EVENT_NEWLINE,
346 EVENT_OP,
347 EVENT_DELIM,
348 EVENT_ITEM,
349 EVENT_DQUOTE,
350 EVENT_SQUOTE,
351};
352
353static struct event *event_list;
354
355static void add_event(struct event *event)
356{
357 event->next = event_list;
358 event_list = event;
359}
360
361static int event_item_type(enum event_type type)
362{
363 switch (type) {
364 case EVENT_ITEM ... EVENT_SQUOTE:
365 return 1;
366 case EVENT_ERROR ... EVENT_DELIM:
367 default:
368 return 0;
369 }
370}
371
372static void free_arg(struct print_arg *arg)
373{
374 if (!arg)
375 return;
376
377 switch (arg->type) {
378 case PRINT_ATOM:
379 if (arg->atom.atom)
380 free(arg->atom.atom);
381 break;
382 case PRINT_NULL:
383 case PRINT_FIELD ... PRINT_OP:
384 default:
385 /* todo */
386 break;
387 }
388
389 free(arg);
390}
391
392static enum event_type get_type(int ch)
393{
394 if (ch == '\n')
395 return EVENT_NEWLINE;
396 if (isspace(ch))
397 return EVENT_SPACE;
398 if (isalnum(ch) || ch == '_')
399 return EVENT_ITEM;
400 if (ch == '\'')
401 return EVENT_SQUOTE;
402 if (ch == '"')
403 return EVENT_DQUOTE;
404 if (!isprint(ch))
405 return EVENT_NONE;
406 if (ch == '(' || ch == ')' || ch == ',')
407 return EVENT_DELIM;
408
409 return EVENT_OP;
410}
411
412static int __read_char(void)
413{
414 if (input_buf_ptr >= input_buf_siz)
415 return -1;
416
417 return input_buf[input_buf_ptr++];
418}
419
420static int __peek_char(void)
421{
422 if (input_buf_ptr >= input_buf_siz)
423 return -1;
424
425 return input_buf[input_buf_ptr];
426}
427
428static enum event_type __read_token(char **tok)
429{
430 char buf[BUFSIZ];
431 int ch, last_ch, quote_ch, next_ch;
432 int i = 0;
433 int tok_size = 0;
434 enum event_type type;
435
436 *tok = NULL;
437
438
439 ch = __read_char();
440 if (ch < 0)
441 return EVENT_NONE;
442
443 type = get_type(ch);
444 if (type == EVENT_NONE)
445 return type;
446
447 buf[i++] = ch;
448
449 switch (type) {
450 case EVENT_NEWLINE:
451 case EVENT_DELIM:
452 *tok = malloc_or_die(2);
453 (*tok)[0] = ch;
454 (*tok)[1] = 0;
455 return type;
456
457 case EVENT_OP:
458 switch (ch) {
459 case '-':
460 next_ch = __peek_char();
461 if (next_ch == '>') {
462 buf[i++] = __read_char();
463 break;
464 }
465 /* fall through */
466 case '+':
467 case '|':
468 case '&':
469 case '>':
470 case '<':
471 last_ch = ch;
472 ch = __peek_char();
473 if (ch != last_ch)
474 goto test_equal;
475 buf[i++] = __read_char();
476 switch (last_ch) {
477 case '>':
478 case '<':
479 goto test_equal;
480 default:
481 break;
482 }
483 break;
484 case '!':
485 case '=':
486 goto test_equal;
487 default: /* what should we do instead? */
488 break;
489 }
490 buf[i] = 0;
491 *tok = strdup(buf);
492 return type;
493
494 test_equal:
495 ch = __peek_char();
496 if (ch == '=')
497 buf[i++] = __read_char();
498 break;
499
500 case EVENT_DQUOTE:
501 case EVENT_SQUOTE:
502 /* don't keep quotes */
503 i--;
504 quote_ch = ch;
505 last_ch = 0;
506 do {
507 if (i == (BUFSIZ - 1)) {
508 buf[i] = 0;
509 if (*tok) {
510 *tok = realloc(*tok, tok_size + BUFSIZ);
511 if (!*tok)
512 return EVENT_NONE;
513 strcat(*tok, buf);
514 } else
515 *tok = strdup(buf);
516
517 if (!*tok)
518 return EVENT_NONE;
519 tok_size += BUFSIZ;
520 i = 0;
521 }
522 last_ch = ch;
523 ch = __read_char();
524 buf[i++] = ch;
525 } while (ch != quote_ch && last_ch != '\\');
526 /* remove the last quote */
527 i--;
528 goto out;
529
530 case EVENT_ERROR ... EVENT_SPACE:
531 case EVENT_ITEM:
532 default:
533 break;
534 }
535
536 while (get_type(__peek_char()) == type) {
537 if (i == (BUFSIZ - 1)) {
538 buf[i] = 0;
539 if (*tok) {
540 *tok = realloc(*tok, tok_size + BUFSIZ);
541 if (!*tok)
542 return EVENT_NONE;
543 strcat(*tok, buf);
544 } else
545 *tok = strdup(buf);
546
547 if (!*tok)
548 return EVENT_NONE;
549 tok_size += BUFSIZ;
550 i = 0;
551 }
552 ch = __read_char();
553 buf[i++] = ch;
554 }
555
556 out:
557 buf[i] = 0;
558 if (*tok) {
559 *tok = realloc(*tok, tok_size + i);
560 if (!*tok)
561 return EVENT_NONE;
562 strcat(*tok, buf);
563 } else
564 *tok = strdup(buf);
565 if (!*tok)
566 return EVENT_NONE;
567
568 return type;
569}
570
571static void free_token(char *tok)
572{
573 if (tok)
574 free(tok);
575}
576
577static enum event_type read_token(char **tok)
578{
579 enum event_type type;
580
581 for (;;) {
582 type = __read_token(tok);
583 if (type != EVENT_SPACE)
584 return type;
585
586 free_token(*tok);
587 }
588
589 /* not reached */
590 return EVENT_NONE;
591}
592
593/* no newline */
594static enum event_type read_token_item(char **tok)
595{
596 enum event_type type;
597
598 for (;;) {
599 type = __read_token(tok);
600 if (type != EVENT_SPACE && type != EVENT_NEWLINE)
601 return type;
602
603 free_token(*tok);
604 }
605
606 /* not reached */
607 return EVENT_NONE;
608}
609
610static int test_type(enum event_type type, enum event_type expect)
611{
612 if (type != expect) {
613 die("Error: expected type %d but read %d",
614 expect, type);
615 return -1;
616 }
617 return 0;
618}
619
620static int test_type_token(enum event_type type, char *token,
621 enum event_type expect, char *expect_tok)
622{
623 if (type != expect) {
624 die("Error: expected type %d but read %d",
625 expect, type);
626 return -1;
627 }
628
629 if (strcmp(token, expect_tok) != 0) {
630 die("Error: expected '%s' but read '%s'",
631 expect_tok, token);
632 return -1;
633 }
634 return 0;
635}
636
637static int __read_expect_type(enum event_type expect, char **tok, int newline_ok)
638{
639 enum event_type type;
640
641 if (newline_ok)
642 type = read_token(tok);
643 else
644 type = read_token_item(tok);
645 return test_type(type, expect);
646}
647
648static int read_expect_type(enum event_type expect, char **tok)
649{
650 return __read_expect_type(expect, tok, 1);
651}
652
653static int __read_expected(enum event_type expect, char *str, int newline_ok)
654{
655 enum event_type type;
656 char *token;
657 int ret;
658
659 if (newline_ok)
660 type = read_token(&token);
661 else
662 type = read_token_item(&token);
663
664 ret = test_type_token(type, token, expect, str);
665
666 free_token(token);
667
 668 return ret;
669}
670
671static int read_expected(enum event_type expect, char *str)
672{
673 return __read_expected(expect, str, 1);
674}
675
676static int read_expected_item(enum event_type expect, char *str)
677{
678 return __read_expected(expect, str, 0);
679}
680
681static char *event_read_name(void)
682{
683 char *token;
684
685 if (read_expected(EVENT_ITEM, (char *)"name") < 0)
686 return NULL;
687
688 if (read_expected(EVENT_OP, (char *)":") < 0)
689 return NULL;
690
691 if (read_expect_type(EVENT_ITEM, &token) < 0)
692 goto fail;
693
694 return token;
695
696 fail:
697 free_token(token);
698 return NULL;
699}
700
701static int event_read_id(void)
702{
703 char *token;
704 int id;
705
706 if (read_expected_item(EVENT_ITEM, (char *)"ID") < 0)
707 return -1;
708
709 if (read_expected(EVENT_OP, (char *)":") < 0)
710 return -1;
711
712 if (read_expect_type(EVENT_ITEM, &token) < 0)
713 goto fail;
714
715 id = strtoul(token, NULL, 0);
716 free_token(token);
717 return id;
718
719 fail:
720 free_token(token);
721 return -1;
722}
723
724static int event_read_fields(struct event *event, struct format_field **fields)
725{
726 struct format_field *field = NULL;
727 enum event_type type;
728 char *token;
729 char *last_token;
730 int count = 0;
731
732 do {
733 type = read_token(&token);
734 if (type == EVENT_NEWLINE) {
735 free_token(token);
736 return count;
737 }
738
739 count++;
740
741 if (test_type_token(type, token, EVENT_ITEM, (char *)"field"))
742 goto fail;
743 free_token(token);
744
745 type = read_token(&token);
746 /*
747 * The ftrace fields may still use the "special" name.
748 * Just ignore it.
749 */
750 if (event->flags & EVENT_FL_ISFTRACE &&
751 type == EVENT_ITEM && strcmp(token, "special") == 0) {
752 free_token(token);
753 type = read_token(&token);
754 }
755
756 if (test_type_token(type, token, EVENT_OP, (char *)":") < 0)
757 return -1;
758
759 if (read_expect_type(EVENT_ITEM, &token) < 0)
760 goto fail;
761
762 last_token = token;
763
764 field = malloc_or_die(sizeof(*field));
765 memset(field, 0, sizeof(*field));
766
767 /* read the rest of the type */
768 for (;;) {
769 type = read_token(&token);
770 if (type == EVENT_ITEM ||
771 (type == EVENT_OP && strcmp(token, "*") == 0) ||
772 /*
773 * Some of the ftrace fields are broken and have
774 * an illegal "." in them.
775 */
776 (event->flags & EVENT_FL_ISFTRACE &&
777 type == EVENT_OP && strcmp(token, ".") == 0)) {
778
779 if (strcmp(token, "*") == 0)
780 field->flags |= FIELD_IS_POINTER;
781
782 if (field->type) {
783 field->type = realloc(field->type,
784 strlen(field->type) +
785 strlen(last_token) + 2);
786 strcat(field->type, " ");
787 strcat(field->type, last_token);
788 } else
789 field->type = last_token;
790 last_token = token;
791 continue;
792 }
793
794 break;
795 }
796
797 if (!field->type) {
798 die("no type found");
799 goto fail;
800 }
801 field->name = last_token;
802
803 if (test_type(type, EVENT_OP))
804 goto fail;
805
806 if (strcmp(token, "[") == 0) {
807 enum event_type last_type = type;
808 char *brackets = token;
809 int len;
810
811 field->flags |= FIELD_IS_ARRAY;
812
813 type = read_token(&token);
814 while (strcmp(token, "]") != 0) {
815 if (last_type == EVENT_ITEM &&
816 type == EVENT_ITEM)
817 len = 2;
818 else
819 len = 1;
820 last_type = type;
821
822 brackets = realloc(brackets,
823 strlen(brackets) +
824 strlen(token) + len);
825 if (len == 2)
826 strcat(brackets, " ");
827 strcat(brackets, token);
828 free_token(token);
829 type = read_token(&token);
830 if (type == EVENT_NONE) {
831 die("failed to find token");
832 goto fail;
833 }
834 }
835
836 free_token(token);
837
838 brackets = realloc(brackets, strlen(brackets) + 2);
839 strcat(brackets, "]");
840
841 /* add brackets to type */
842
843 type = read_token(&token);
844 /*
845 * If the next token is not an OP, then it is of
846 * the format: type [] item;
847 */
848 if (type == EVENT_ITEM) {
849 field->type = realloc(field->type,
850 strlen(field->type) +
851 strlen(field->name) +
852 strlen(brackets) + 2);
853 strcat(field->type, " ");
854 strcat(field->type, field->name);
855 free_token(field->name);
856 strcat(field->type, brackets);
857 field->name = token;
858 type = read_token(&token);
859 } else {
860 field->type = realloc(field->type,
861 strlen(field->type) +
862 strlen(brackets) + 1);
863 strcat(field->type, brackets);
864 }
865 free(brackets);
866 }
867
868 if (test_type_token(type, token, EVENT_OP, (char *)";"))
869 goto fail;
870 free_token(token);
871
872 if (read_expected(EVENT_ITEM, (char *)"offset") < 0)
873 goto fail_expect;
874
875 if (read_expected(EVENT_OP, (char *)":") < 0)
876 goto fail_expect;
877
878 if (read_expect_type(EVENT_ITEM, &token))
879 goto fail;
880 field->offset = strtoul(token, NULL, 0);
881 free_token(token);
882
883 if (read_expected(EVENT_OP, (char *)";") < 0)
884 goto fail_expect;
885
886 if (read_expected(EVENT_ITEM, (char *)"size") < 0)
887 goto fail_expect;
888
889 if (read_expected(EVENT_OP, (char *)":") < 0)
890 goto fail_expect;
891
892 if (read_expect_type(EVENT_ITEM, &token))
893 goto fail;
894 field->size = strtoul(token, NULL, 0);
895 free_token(token);
896
897 if (read_expected(EVENT_OP, (char *)";") < 0)
898 goto fail_expect;
899
900 if (read_expect_type(EVENT_NEWLINE, &token) < 0)
901 goto fail;
902 free_token(token);
903
904 *fields = field;
905 fields = &field->next;
906
907 } while (1);
908
909 return 0;
910
911fail:
912 free_token(token);
913fail_expect:
914 if (field)
915 free(field);
916 return -1;
917}
918
919static int event_read_format(struct event *event)
920{
921 char *token;
922 int ret;
923
924 if (read_expected_item(EVENT_ITEM, (char *)"format") < 0)
925 return -1;
926
927 if (read_expected(EVENT_OP, (char *)":") < 0)
928 return -1;
929
930 if (read_expect_type(EVENT_NEWLINE, &token))
931 goto fail;
932 free_token(token);
933
934 ret = event_read_fields(event, &event->format.common_fields);
935 if (ret < 0)
936 return ret;
937 event->format.nr_common = ret;
938
939 ret = event_read_fields(event, &event->format.fields);
940 if (ret < 0)
941 return ret;
942 event->format.nr_fields = ret;
943
944 return 0;
945
946 fail:
947 free_token(token);
948 return -1;
949}
950
951enum event_type
952process_arg_token(struct event *event, struct print_arg *arg,
953 char **tok, enum event_type type);
954
955static enum event_type
956process_arg(struct event *event, struct print_arg *arg, char **tok)
957{
958 enum event_type type;
959 char *token;
960
961 type = read_token(&token);
962 *tok = token;
963
964 return process_arg_token(event, arg, tok, type);
965}
966
967static enum event_type
968process_cond(struct event *event, struct print_arg *top, char **tok)
969{
970 struct print_arg *arg, *left, *right;
971 enum event_type type;
972 char *token = NULL;
973
974 arg = malloc_or_die(sizeof(*arg));
975 memset(arg, 0, sizeof(*arg));
976
977 left = malloc_or_die(sizeof(*left));
978
979 right = malloc_or_die(sizeof(*right));
980
981 arg->type = PRINT_OP;
982 arg->op.left = left;
983 arg->op.right = right;
984
985 *tok = NULL;
986 type = process_arg(event, left, &token);
987 if (test_type_token(type, token, EVENT_OP, (char *)":"))
988 goto out_free;
989
990 arg->op.op = token;
991
992 type = process_arg(event, right, &token);
993
994 top->op.right = arg;
995
996 *tok = token;
997 return type;
998
999out_free:
1000 free_token(*tok);
1001 free(right);
1002 free(left);
1003 free_arg(arg);
1004 return EVENT_ERROR;
1005}
1006
1007static int get_op_prio(char *op)
1008{
1009 if (!op[1]) {
1010 switch (op[0]) {
1011 case '*':
1012 case '/':
1013 case '%':
1014 return 6;
1015 case '+':
1016 case '-':
1017 return 7;
1018 /* '>>' and '<<' are 8 */
1019 case '<':
1020 case '>':
1021 return 9;
1022 /* '==' and '!=' are 10 */
1023 case '&':
1024 return 11;
1025 case '^':
1026 return 12;
1027 case '|':
1028 return 13;
1029 case '?':
1030 return 16;
1031 default:
1032 die("unknown op '%c'", op[0]);
1033 return -1;
1034 }
1035 } else {
1036 if (strcmp(op, "++") == 0 ||
1037 strcmp(op, "--") == 0) {
1038 return 3;
1039 } else if (strcmp(op, ">>") == 0 ||
1040 strcmp(op, "<<") == 0) {
1041 return 8;
1042 } else if (strcmp(op, ">=") == 0 ||
1043 strcmp(op, "<=") == 0) {
1044 return 9;
1045 } else if (strcmp(op, "==") == 0 ||
1046 strcmp(op, "!=") == 0) {
1047 return 10;
1048 } else if (strcmp(op, "&&") == 0) {
1049 return 14;
1050 } else if (strcmp(op, "||") == 0) {
1051 return 15;
1052 } else {
1053 die("unknown op '%s'", op);
1054 return -1;
1055 }
1056 }
1057}
1058
1059static void set_op_prio(struct print_arg *arg)
1060{
1061
1062 /* single ops are the greatest */
1063 if (!arg->op.left || arg->op.left->type == PRINT_NULL) {
1064 arg->op.prio = 0;
1065 return;
1066 }
1067
1068 arg->op.prio = get_op_prio(arg->op.op);
1069}
1070
1071static enum event_type
1072process_op(struct event *event, struct print_arg *arg, char **tok)
1073{
1074 struct print_arg *left, *right = NULL;
1075 enum event_type type;
1076 char *token;
1077
1078 /* the op is passed in via tok */
1079 token = *tok;
1080
1081 if (arg->type == PRINT_OP && !arg->op.left) {
1082 /* handle single op */
1083 if (token[1]) {
1084 die("bad op token %s", token);
1085 return EVENT_ERROR;
1086 }
1087 switch (token[0]) {
1088 case '!':
1089 case '+':
1090 case '-':
1091 break;
1092 default:
1093 die("bad op token %s", token);
1094 return EVENT_ERROR;
1095 }
1096
1097 /* make an empty left */
1098 left = malloc_or_die(sizeof(*left));
1099 left->type = PRINT_NULL;
1100 arg->op.left = left;
1101
1102 right = malloc_or_die(sizeof(*right));
1103 arg->op.right = right;
1104
1105 type = process_arg(event, right, tok);
1106
1107 } else if (strcmp(token, "?") == 0) {
1108
1109 left = malloc_or_die(sizeof(*left));
1110 /* copy the top arg to the left */
1111 *left = *arg;
1112
1113 arg->type = PRINT_OP;
1114 arg->op.op = token;
1115 arg->op.left = left;
1116 arg->op.prio = 0;
1117
1118 type = process_cond(event, arg, tok);
1119
1120 } else if (strcmp(token, ">>") == 0 ||
1121 strcmp(token, "<<") == 0 ||
1122 strcmp(token, "&") == 0 ||
1123 strcmp(token, "|") == 0 ||
1124 strcmp(token, "&&") == 0 ||
1125 strcmp(token, "||") == 0 ||
1126 strcmp(token, "-") == 0 ||
1127 strcmp(token, "+") == 0 ||
1128 strcmp(token, "*") == 0 ||
1129 strcmp(token, "^") == 0 ||
1130 strcmp(token, "/") == 0 ||
1131 strcmp(token, "==") == 0 ||
1132 strcmp(token, "!=") == 0) {
1133
1134 left = malloc_or_die(sizeof(*left));
1135
1136 /* copy the top arg to the left */
1137 *left = *arg;
1138
1139 arg->type = PRINT_OP;
1140 arg->op.op = token;
1141 arg->op.left = left;
1142
1143 set_op_prio(arg);
1144
1145 right = malloc_or_die(sizeof(*right));
1146
1147 type = process_arg(event, right, tok);
1148
1149 arg->op.right = right;
1150
1151 } else {
1152 die("unknown op '%s'", token);
1153 /* the arg is now the left side */
1154 return EVENT_NONE;
1155 }
1156
1157
1158 if (type == EVENT_OP) {
1159 int prio;
1160
1161 /* higher prio values (lower precedence) need to be closer to the root */
1162 prio = get_op_prio(*tok);
1163
1164 if (prio > arg->op.prio)
1165 return process_op(event, arg, tok);
1166
1167 return process_op(event, right, tok);
1168 }
1169
1170 return type;
1171}
1172
1173static enum event_type
1174process_entry(struct event *event __unused, struct print_arg *arg,
1175 char **tok)
1176{
1177 enum event_type type;
1178 char *field;
1179 char *token;
1180
1181 if (read_expected(EVENT_OP, (char *)"->") < 0)
1182 return EVENT_ERROR;
1183
1184 if (read_expect_type(EVENT_ITEM, &token) < 0)
1185 goto fail;
1186 field = token;
1187
1188 arg->type = PRINT_FIELD;
1189 arg->field.name = field;
1190
1191 type = read_token(&token);
1192 *tok = token;
1193
1194 return type;
1195
1196fail:
1197 free_token(token);
1198 return EVENT_ERROR;
1199}
1200
1201static char *arg_eval (struct print_arg *arg);
1202
1203static long long arg_num_eval(struct print_arg *arg)
1204{
1205 long long left, right;
1206 long long val = 0;
1207
1208 switch (arg->type) {
1209 case PRINT_ATOM:
1210 val = strtoll(arg->atom.atom, NULL, 0);
1211 break;
1212 case PRINT_TYPE:
1213 val = arg_num_eval(arg->typecast.item);
1214 break;
1215 case PRINT_OP:
1216 switch (arg->op.op[0]) {
1217 case '|':
1218 left = arg_num_eval(arg->op.left);
1219 right = arg_num_eval(arg->op.right);
1220 if (arg->op.op[1])
1221 val = left || right;
1222 else
1223 val = left | right;
1224 break;
1225 case '&':
1226 left = arg_num_eval(arg->op.left);
1227 right = arg_num_eval(arg->op.right);
1228 if (arg->op.op[1])
1229 val = left && right;
1230 else
1231 val = left & right;
1232 break;
1233 case '<':
1234 left = arg_num_eval(arg->op.left);
1235 right = arg_num_eval(arg->op.right);
1236 switch (arg->op.op[1]) {
1237 case 0:
1238 val = left < right;
1239 break;
1240 case '<':
1241 val = left << right;
1242 break;
1243 case '=':
1244 val = left <= right;
1245 break;
1246 default:
1247 die("unknown op '%s'", arg->op.op);
1248 }
1249 break;
1250 case '>':
1251 left = arg_num_eval(arg->op.left);
1252 right = arg_num_eval(arg->op.right);
1253 switch (arg->op.op[1]) {
1254 case 0:
1255 val = left > right;
1256 break;
1257 case '>':
1258 val = left >> right;
1259 break;
1260 case '=':
1261 val = left >= right;
1262 break;
1263 default:
1264 die("unknown op '%s'", arg->op.op);
1265 }
1266 break;
1267 case '=':
1268 left = arg_num_eval(arg->op.left);
1269 right = arg_num_eval(arg->op.right);
1270
1271 if (arg->op.op[1] != '=')
1272 die("unknown op '%s'", arg->op.op);
1273
1274 val = left == right;
1275 break;
1276 case '!':
1277 left = arg_num_eval(arg->op.left);
1278 right = arg_num_eval(arg->op.right);
1279
1280 switch (arg->op.op[1]) {
1281 case '=':
1282 val = left != right;
1283 break;
1284 default:
1285 die("unknown op '%s'", arg->op.op);
1286 }
1287 break;
1288 default:
1289 die("unknown op '%s'", arg->op.op);
1290 }
1291 break;
1292
1293 case PRINT_NULL:
1294 case PRINT_FIELD ... PRINT_SYMBOL:
1295 case PRINT_STRING:
1296 default:
1297 die("invalid eval type %d", arg->type);
1298
1299 }
1300 return val;
1301}
1302
1303static char *arg_eval (struct print_arg *arg)
1304{
1305 long long val;
1306 static char buf[20];
1307
1308 switch (arg->type) {
1309 case PRINT_ATOM:
1310 return arg->atom.atom;
1311 case PRINT_TYPE:
1312 return arg_eval(arg->typecast.item);
1313 case PRINT_OP:
1314 val = arg_num_eval(arg);
1315 sprintf(buf, "%lld", val);
1316 return buf;
1317
1318 case PRINT_NULL:
1319 case PRINT_FIELD ... PRINT_SYMBOL:
1320 case PRINT_STRING:
1321 default:
1322 die("invalid eval type %d", arg->type);
1323 break;
1324 }
1325
1326 return NULL;
1327}
1328
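/*
 * process_fields() parses the trailing "{ value, "string" }, ..." list
 * that __print_flags() and __print_symbolic() take in a TP_printk().
 */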
1329static enum event_type
1330process_fields(struct event *event, struct print_flag_sym **list, char **tok)
1331{
1332 enum event_type type;
1333 struct print_arg *arg = NULL;
1334 struct print_flag_sym *field;
1335 char *token = NULL;
1336 char *value;
1337
1338 do {
1339 free_token(token);
1340 type = read_token_item(&token);
1341 if (test_type_token(type, token, EVENT_OP, (char *)"{"))
1342 break;
1343
1344 arg = malloc_or_die(sizeof(*arg));
1345
1346 free_token(token);
1347 type = process_arg(event, arg, &token);
1348 if (test_type_token(type, token, EVENT_DELIM, (char *)","))
1349 goto out_free;
1350
1351 field = malloc_or_die(sizeof(*field));
1352 memset(field, 0, sizeof(*field));
1353
1354 value = arg_eval(arg);
1355 field->value = strdup(value);
1356
1357 free_token(token);
1358 type = process_arg(event, arg, &token);
1359 if (test_type_token(type, token, EVENT_OP, (char *)"}"))
1360 goto out_free;
1361
1362 value = arg_eval(arg);
1363 field->str = strdup(value);
1364 free_arg(arg);
1365 arg = NULL;
1366
1367 *list = field;
1368 list = &field->next;
1369
1370 free_token(token);
1371 type = read_token_item(&token);
1372 } while (type == EVENT_DELIM && strcmp(token, ",") == 0);
1373
1374 *tok = token;
1375 return type;
1376
1377out_free:
1378 free_arg(arg);
1379 free_token(token);
1380
1381 return EVENT_ERROR;
1382}
1383
1384static enum event_type
1385process_flags(struct event *event, struct print_arg *arg, char **tok)
1386{
1387 struct print_arg *field;
1388 enum event_type type;
1389 char *token;
1390
1391 memset(arg, 0, sizeof(*arg));
1392 arg->type = PRINT_FLAGS;
1393
1394 if (read_expected_item(EVENT_DELIM, (char *)"(") < 0)
1395 return EVENT_ERROR;
1396
1397 field = malloc_or_die(sizeof(*field));
1398
1399 type = process_arg(event, field, &token);
1400 if (test_type_token(type, token, EVENT_DELIM, (char *)","))
1401 goto out_free;
1402
1403 arg->flags.field = field;
1404
1405 type = read_token_item(&token);
1406 if (event_item_type(type)) {
1407 arg->flags.delim = token;
1408 type = read_token_item(&token);
1409 }
1410
1411 if (test_type_token(type, token, EVENT_DELIM, (char *)","))
1412 goto out_free;
1413
1414 type = process_fields(event, &arg->flags.flags, &token);
1415 if (test_type_token(type, token, EVENT_DELIM, (char *)")"))
1416 goto out_free;
1417
1418 free_token(token);
1419 type = read_token_item(tok);
1420 return type;
1421
1422out_free:
1423 free_token(token);
1424 return EVENT_ERROR;
1425}
1426
1427static enum event_type
1428process_symbols(struct event *event, struct print_arg *arg, char **tok)
1429{
1430 struct print_arg *field;
1431 enum event_type type;
1432 char *token;
1433
1434 memset(arg, 0, sizeof(*arg));
1435 arg->type = PRINT_SYMBOL;
1436
1437 if (read_expected_item(EVENT_DELIM, (char *)"(") < 0)
1438 return EVENT_ERROR;
1439
1440 field = malloc_or_die(sizeof(*field));
1441
1442 type = process_arg(event, field, &token);
1443 if (test_type_token(type, token, EVENT_DELIM, (char *)","))
1444 goto out_free;
1445
1446 arg->symbol.field = field;
1447
1448 type = process_fields(event, &arg->symbol.symbols, &token);
1449 if (test_type_token(type, token, EVENT_DELIM, (char *)")"))
1450 goto out_free;
1451
1452 free_token(token);
1453 type = read_token_item(tok);
1454 return type;
1455
1456out_free:
1457 free_token(token);
1458 return EVENT_ERROR;
1459}
1460
1461static enum event_type
1462process_paren(struct event *event, struct print_arg *arg, char **tok)
1463{
1464 struct print_arg *item_arg;
1465 enum event_type type;
1466 int ptr_cast = 0;
1467 char *token;
1468
1469 type = process_arg(event, arg, &token);
1470
1471 if (type == EVENT_ERROR)
1472 return EVENT_ERROR;
1473
1474 if (type == EVENT_OP) {
1475 /* handle the ptr casts */
1476 if (!strcmp(token, "*")) {
1477 /*
1478 * FIXME: should we zap whitespace before ')' ?
1479 * (may require a peek_token_item())
1480 */
1481 if (__peek_char() == ')') {
1482 ptr_cast = 1;
1483 free_token(token);
1484 type = read_token_item(&token);
1485 }
1486 }
1487 if (!ptr_cast) {
1488 type = process_op(event, arg, &token);
1489
1490 if (type == EVENT_ERROR)
1491 return EVENT_ERROR;
1492 }
1493 }
1494
1495 if (test_type_token(type, token, EVENT_DELIM, (char *)")")) {
1496 free_token(token);
1497 return EVENT_ERROR;
1498 }
1499
1500 free_token(token);
1501 type = read_token_item(&token);
1502
1503 /*
1504 * If the next token is an item or another open paren, then
1505 * this was a typecast.
1506 */
1507 if (event_item_type(type) ||
1508 (type == EVENT_DELIM && strcmp(token, "(") == 0)) {
1509
1510 /* make this a typecast and continue */
1511
1512 /* previous must be an atom */
1513 if (arg->type != PRINT_ATOM)
1514 die("previous needed to be PRINT_ATOM");
1515
1516 item_arg = malloc_or_die(sizeof(*item_arg));
1517
1518 arg->type = PRINT_TYPE;
1519 if (ptr_cast) {
1520 char *old = arg->atom.atom;
1521
1522 arg->atom.atom = malloc_or_die(strlen(old) + 3);
1523 sprintf(arg->atom.atom, "%s *", old);
1524 free(old);
1525 }
1526 arg->typecast.type = arg->atom.atom;
1527 arg->typecast.item = item_arg;
1528 type = process_arg_token(event, item_arg, &token, type);
1529
1530 }
1531
1532 *tok = token;
1533 return type;
1534}
1535
1536
1537static enum event_type
1538process_str(struct event *event __unused, struct print_arg *arg, char **tok)
1539{
1540 enum event_type type;
1541 char *token;
1542
1543 if (read_expected(EVENT_DELIM, (char *)"(") < 0)
1544 return EVENT_ERROR;
1545
1546 if (read_expect_type(EVENT_ITEM, &token) < 0)
1547 goto fail;
1548
1549 arg->type = PRINT_STRING;
1550 arg->string.string = token;
1551 arg->string.offset = -1;
1552
1553 if (read_expected(EVENT_DELIM, (char *)")") < 0)
1554 return EVENT_ERROR;
1555
1556 type = read_token(&token);
1557 *tok = token;
1558
1559 return type;
1560fail:
1561 free_token(token);
1562 return EVENT_ERROR;
1563}
1564
1565enum event_type
1566process_arg_token(struct event *event, struct print_arg *arg,
1567 char **tok, enum event_type type)
1568{
1569 char *token;
1570 char *atom;
1571
1572 token = *tok;
1573
1574 switch (type) {
1575 case EVENT_ITEM:
1576 if (strcmp(token, "REC") == 0) {
1577 free_token(token);
1578 type = process_entry(event, arg, &token);
1579 } else if (strcmp(token, "__print_flags") == 0) {
1580 free_token(token);
1581 type = process_flags(event, arg, &token);
1582 } else if (strcmp(token, "__print_symbolic") == 0) {
1583 free_token(token);
1584 type = process_symbols(event, arg, &token);
1585 } else if (strcmp(token, "__get_str") == 0) {
1586 free_token(token);
1587 type = process_str(event, arg, &token);
1588 } else {
1589 atom = token;
1590 /* test the next token */
1591 type = read_token_item(&token);
1592
1593 /* atoms can be more than one token long */
1594 while (type == EVENT_ITEM) {
1595 atom = realloc(atom, strlen(atom) + strlen(token) + 2);
1596 strcat(atom, " ");
1597 strcat(atom, token);
1598 free_token(token);
1599 type = read_token_item(&token);
1600 }
1601
1602 /* todo, test for function */
1603
1604 arg->type = PRINT_ATOM;
1605 arg->atom.atom = atom;
1606 }
1607 break;
1608 case EVENT_DQUOTE:
1609 case EVENT_SQUOTE:
1610 arg->type = PRINT_ATOM;
1611 arg->atom.atom = token;
1612 type = read_token_item(&token);
1613 break;
1614 case EVENT_DELIM:
1615 if (strcmp(token, "(") == 0) {
1616 free_token(token);
1617 type = process_paren(event, arg, &token);
1618 break;
1619 }
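 /* any other delimiter falls through and is handled as an op */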
1620 case EVENT_OP:
1621 /* handle single ops */
1622 arg->type = PRINT_OP;
1623 arg->op.op = token;
1624 arg->op.left = NULL;
1625 type = process_op(event, arg, &token);
1626
1627 break;
1628
1629 case EVENT_ERROR ... EVENT_NEWLINE:
1630 default:
1631 die("unexpected type %d", type);
1632 }
1633 *tok = token;
1634
1635 return type;
1636}
1637
1638static int event_read_print_args(struct event *event, struct print_arg **list)
1639{
1640 enum event_type type;
1641 struct print_arg *arg;
1642 char *token;
1643 int args = 0;
1644
1645 do {
1646 arg = malloc_or_die(sizeof(*arg));
1647 memset(arg, 0, sizeof(*arg));
1648
1649 type = process_arg(event, arg, &token);
1650
1651 if (type == EVENT_ERROR) {
1652 free_arg(arg);
1653 return -1;
1654 }
1655
1656 *list = arg;
1657 args++;
1658
1659 if (type == EVENT_OP) {
1660 type = process_op(event, arg, &token);
1661 list = &arg->next;
1662 continue;
1663 }
1664
1665 if (type == EVENT_DELIM && strcmp(token, ",") == 0) {
1666 free_token(token);
1667 *list = arg;
1668 list = &arg->next;
1669 continue;
1670 }
1671 break;
1672 } while (type != EVENT_NONE);
1673
1674 if (type != EVENT_NONE)
1675 free_token(token);
1676
1677 return args;
1678}
1679
1680static int event_read_print(struct event *event)
1681{
1682 enum event_type type;
1683 char *token;
1684 int ret;
1685
1686 if (read_expected_item(EVENT_ITEM, (char *)"print") < 0)
1687 return -1;
1688
1689 if (read_expected(EVENT_ITEM, (char *)"fmt") < 0)
1690 return -1;
1691
1692 if (read_expected(EVENT_OP, (char *)":") < 0)
1693 return -1;
1694
1695 if (read_expect_type(EVENT_DQUOTE, &token) < 0)
1696 goto fail;
1697
1698 event->print_fmt.format = token;
1699 event->print_fmt.args = NULL;
1700
1701 /* ok to have no arg */
1702 type = read_token_item(&token);
1703
1704 if (type == EVENT_NONE)
1705 return 0;
1706
1707 if (test_type_token(type, token, EVENT_DELIM, (char *)","))
1708 goto fail;
1709
1710 free_token(token);
1711
1712 ret = event_read_print_args(event, &event->print_fmt.args);
1713 if (ret < 0)
1714 return -1;
1715
1716 return 0;
1717
1718 fail:
1719 free_token(token);
1720 return -1;
1721}
1722
1723static struct format_field *
1724find_common_field(struct event *event, const char *name)
1725{
1726 struct format_field *format;
1727
1728 for (format = event->format.common_fields;
1729 format; format = format->next) {
1730 if (strcmp(format->name, name) == 0)
1731 break;
1732 }
1733
1734 return format;
1735}
1736
1737static struct format_field *
1738find_field(struct event *event, const char *name)
1739{
1740 struct format_field *format;
1741
1742 for (format = event->format.fields;
1743 format; format = format->next) {
1744 if (strcmp(format->name, name) == 0)
1745 break;
1746 }
1747
1748 return format;
1749}
1750
1751static struct format_field *
1752find_any_field(struct event *event, const char *name)
1753{
1754 struct format_field *format;
1755
1756 format = find_common_field(event, name);
1757 if (format)
1758 return format;
1759 return find_field(event, name);
1760}
1761
1762static unsigned long long read_size(void *ptr, int size)
1763{
1764 switch (size) {
1765 case 1:
1766 return *(unsigned char *)ptr;
1767 case 2:
1768 return data2host2(ptr);
1769 case 4:
1770 return data2host4(ptr);
1771 case 8:
1772 return data2host8(ptr);
1773 default:
1774 /* BUG! */
1775 return 0;
1776 }
1777}
1778
1779static int get_common_info(const char *type, int *offset, int *size)
1780{
1781 struct event *event;
1782 struct format_field *field;
1783
1784 /*
1785 * All events should have the same common elements.
1786 * Pick any event to find where the type is;
1787 */
1788 if (!event_list)
1789 die("no event_list!");
1790
1791 event = event_list;
1792 field = find_common_field(event, type);
1793 if (!field)
1794 die("field '%s' not found", type);
1795
1796 *offset = field->offset;
1797 *size = field->size;
1798
1799 return 0;
1800}
1801
1802static int parse_common_type(void *data)
1803{
1804 static int type_offset;
1805 static int type_size;
1806 int ret;
1807
1808 if (!type_size) {
1809 ret = get_common_info("common_type",
1810 &type_offset,
1811 &type_size);
1812 if (ret < 0)
1813 return ret;
1814 }
1815 return read_size(data + type_offset, type_size);
1816}
1817
1818static int parse_common_pid(void *data)
1819{
1820 static int pid_offset;
1821 static int pid_size;
1822 int ret;
1823
1824 if (!pid_size) {
1825 ret = get_common_info("common_pid",
1826 &pid_offset,
1827 &pid_size);
1828 if (ret < 0)
1829 return ret;
1830 }
1831
1832 return read_size(data + pid_offset, pid_size);
1833}
1834
1835static struct event *find_event(int id)
1836{
1837 struct event *event;
1838
1839 for (event = event_list; event; event = event->next) {
1840 if (event->id == id)
1841 break;
1842 }
1843 return event;
1844}
1845
1846static unsigned long long eval_num_arg(void *data, int size,
1847 struct event *event, struct print_arg *arg)
1848{
1849 unsigned long long val = 0;
1850 unsigned long long left, right;
1851
1852 switch (arg->type) {
1853 case PRINT_NULL:
1854 /* PRINT_NULL carries no value; treat it as zero */
1855 return 0;
1856 case PRINT_ATOM:
1857 return strtoull(arg->atom.atom, NULL, 0);
1858 case PRINT_FIELD:
1859 if (!arg->field.field) {
1860 arg->field.field = find_any_field(event, arg->field.name);
1861 if (!arg->field.field)
1862 die("field %s not found", arg->field.name);
1863 }
1864 /* must be a number */
1865 val = read_size(data + arg->field.field->offset,
1866 arg->field.field->size);
1867 break;
1868 case PRINT_FLAGS:
1869 case PRINT_SYMBOL:
1870 break;
1871 case PRINT_TYPE:
1872 return eval_num_arg(data, size, event, arg->typecast.item);
1873 case PRINT_STRING:
1874 return 0;
1875 break;
1876 case PRINT_OP:
1877 left = eval_num_arg(data, size, event, arg->op.left);
1878 right = eval_num_arg(data, size, event, arg->op.right);
1879 switch (arg->op.op[0]) {
1880 case '|':
1881 if (arg->op.op[1])
1882 val = left || right;
1883 else
1884 val = left | right;
1885 break;
1886 case '&':
1887 if (arg->op.op[1])
1888 val = left && right;
1889 else
1890 val = left & right;
1891 break;
1892 case '<':
1893 switch (arg->op.op[1]) {
1894 case 0:
1895 val = left < right;
1896 break;
1897 case '<':
1898 val = left << right;
1899 break;
1900 case '=':
1901 val = left <= right;
1902 break;
1903 default:
1904 die("unknown op '%s'", arg->op.op);
1905 }
1906 break;
1907 case '>':
1908 switch (arg->op.op[1]) {
1909 case 0:
1910 val = left > right;
1911 break;
1912 case '>':
1913 val = left >> right;
1914 break;
1915 case '=':
1916 val = left >= right;
1917 break;
1918 default:
1919 die("unknown op '%s'", arg->op.op);
1920 }
1921 break;
1922 case '=':
1923 if (arg->op.op[1] != '=')
1924 die("unknown op '%s'", arg->op.op);
1925 val = left == right;
1926 break;
1927 default:
1928 die("unknown op '%s'", arg->op.op);
1929 }
1930 break;
1931 default: /* not sure what to do there */
1932 return 0;
1933 }
1934 return val;
1935}
1936
1937struct flag {
1938 const char *name;
1939 unsigned long long value;
1940};
1941
1942static const struct flag flags[] = {
1943 { "HI_SOFTIRQ", 0 },
1944 { "TIMER_SOFTIRQ", 1 },
1945 { "NET_TX_SOFTIRQ", 2 },
1946 { "NET_RX_SOFTIRQ", 3 },
1947 { "BLOCK_SOFTIRQ", 4 },
1948 { "TASKLET_SOFTIRQ", 5 },
1949 { "SCHED_SOFTIRQ", 6 },
1950 { "HRTIMER_SOFTIRQ", 7 },
1951 { "RCU_SOFTIRQ", 8 },
1952
1953 { "HRTIMER_NORESTART", 0 },
1954 { "HRTIMER_RESTART", 1 },
1955};
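/*
 * These mirror the kernel's softirq vector numbers and hrtimer return
 * codes, for format files that reference the symbolic names instead of
 * numeric values (see eval_flag() below).
 */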
1956
1957static unsigned long long eval_flag(const char *flag)
1958{
1959 int i;
1960
1961 /*
1962 * Some flags in the format files do not get converted.
1963 * If the flag is not numeric, see if it is something that
1964 * we already know about.
1965 */
1966 if (isdigit(flag[0]))
1967 return strtoull(flag, NULL, 0);
1968
1969 for (i = 0; i < (int)(sizeof(flags)/sizeof(flags[0])); i++)
1970 if (strcmp(flags[i].name, flag) == 0)
1971 return flags[i].value;
1972
1973 return 0;
1974}
1975
1976static void print_str_arg(void *data, int size,
1977 struct event *event, struct print_arg *arg)
1978{
1979 struct print_flag_sym *flag;
1980 unsigned long long val, fval;
1981 char *str;
1982 int print;
1983
1984 switch (arg->type) {
1985 case PRINT_NULL:
1986 /* nothing to print for PRINT_NULL */
1987 return;
1988 case PRINT_ATOM:
1989 printf("%s", arg->atom.atom);
1990 return;
1991 case PRINT_FIELD:
1992 if (!arg->field.field) {
1993 arg->field.field = find_any_field(event, arg->field.name);
1994 if (!arg->field.field)
1995 die("field %s not found", arg->field.name);
1996 }
1997 str = malloc_or_die(arg->field.field->size + 1);
1998 memcpy(str, data + arg->field.field->offset,
1999 arg->field.field->size);
2000 str[arg->field.field->size] = 0;
2001 printf("%s", str);
2002 free(str);
2003 break;
2004 case PRINT_FLAGS:
2005 val = eval_num_arg(data, size, event, arg->flags.field);
2006 print = 0;
2007 for (flag = arg->flags.flags; flag; flag = flag->next) {
2008 fval = eval_flag(flag->value);
2009 if (!val && !fval) {
2010 printf("%s", flag->str);
2011 break;
2012 }
2013 if (fval && (val & fval) == fval) {
2014 if (print && arg->flags.delim)
2015 printf("%s", arg->flags.delim);
2016 printf("%s", flag->str);
2017 print = 1;
2018 val &= ~fval;
2019 }
2020 }
2021 break;
2022 case PRINT_SYMBOL:
2023 val = eval_num_arg(data, size, event, arg->symbol.field);
2024 for (flag = arg->symbol.symbols; flag; flag = flag->next) {
2025 fval = eval_flag(flag->value);
2026 if (val == fval) {
2027 printf("%s", flag->str);
2028 break;
2029 }
2030 }
2031 break;
2032
2033 case PRINT_TYPE:
2034 break;
2035 case PRINT_STRING: {
2036 int str_offset;
2037
2038 if (arg->string.offset == -1) {
2039 struct format_field *f;
2040
2041 f = find_any_field(event, arg->string.string);
2042 arg->string.offset = f->offset;
2043 }
2044 str_offset = *(int *)(data + arg->string.offset);
2045 str_offset &= 0xffff;
2046 printf("%s", ((char *)data) + str_offset);
2047 break;
2048 }
2049 case PRINT_OP:
2050 /*
2051 * The only op for string should be ? :
2052 */
2053 if (arg->op.op[0] != '?')
2054 return;
2055 val = eval_num_arg(data, size, event, arg->op.left);
2056 if (val)
2057 print_str_arg(data, size, event, arg->op.right->op.left);
2058 else
2059 print_str_arg(data, size, event, arg->op.right->op.right);
2060 break;
2061 default:
2062 /* well... */
2063 break;
2064 }
2065}
2066
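/*
 * make_bprint_args() walks a binary printk format string and pulls the
 * matching raw arguments out of the event's "buf" field, aligning each
 * numeric argument to the kernel's long size before decoding it.
 */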
2067static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struct event *event)
2068{
2069 static struct format_field *field, *ip_field;
2070 struct print_arg *args, *arg, **next;
2071 unsigned long long ip, val;
2072 char *ptr;
2073 void *bptr;
2074
2075 if (!field) {
2076 field = find_field(event, "buf");
2077 if (!field)
2078 die("can't find buffer field for binary printk");
2079 ip_field = find_field(event, "ip");
2080 if (!ip_field)
2081 die("can't find ip field for binary printk");
2082 }
2083
2084 ip = read_size(data + ip_field->offset, ip_field->size);
2085
2086 /*
2087 * The first arg is the IP pointer.
2088 */
2089 args = malloc_or_die(sizeof(*args));
2090 arg = args;
2091 arg->next = NULL;
2092 next = &arg->next;
2093
2094 arg->type = PRINT_ATOM;
2095 arg->atom.atom = malloc_or_die(32);
2096 sprintf(arg->atom.atom, "%lld", ip);
2097
2098 /* skip the first "%pf : " */
2099 for (ptr = fmt + 6, bptr = data + field->offset;
2100 bptr < data + size && *ptr; ptr++) {
2101 int ls = 0;
2102
2103 if (*ptr == '%') {
2104 process_again:
2105 ptr++;
2106 switch (*ptr) {
2107 case '%':
2108 break;
2109 case 'l':
2110 ls++;
2111 goto process_again;
2112 case 'L':
2113 ls = 2;
2114 goto process_again;
2115 case '0' ... '9':
2116 goto process_again;
2117 case 'p':
2118 ls = 1;
2119 /* fall through */
2120 case 'd':
2121 case 'u':
2122 case 'x':
2123 case 'i':
2124 bptr = (void *)(((unsigned long)bptr + (long_size - 1)) &
2125 ~(long_size - 1));
2126 switch (ls) {
2127 case 0:
2128 case 1:
2129 ls = long_size;
2130 break;
2131 case 2:
2132 ls = 8;
2133 default:
2134 break;
2135 }
2136 val = read_size(bptr, ls);
2137 bptr += ls;
2138 arg = malloc_or_die(sizeof(*arg));
2139 arg->next = NULL;
2140 arg->type = PRINT_ATOM;
2141 arg->atom.atom = malloc_or_die(32);
2142 sprintf(arg->atom.atom, "%lld", val);
2143 *next = arg;
2144 next = &arg->next;
2145 break;
2146 case 's':
2147 arg = malloc_or_die(sizeof(*arg));
2148 arg->next = NULL;
2149 arg->type = PRINT_STRING;
2150 arg->string.string = strdup(bptr);
2151 bptr += strlen(bptr) + 1;
2152 *next = arg;
2153 next = &arg->next;
2154 default:
2155 break;
2156 }
2157 }
2158 }
2159
2160 return args;
2161}
2162
2163static void free_args(struct print_arg *args)
2164{
2165 struct print_arg *next;
2166
2167 while (args) {
2168 next = args->next;
2169
2170 if (args->type == PRINT_ATOM)
2171 free(args->atom.atom);
2172 else
2173 free(args->string.string);
2174 free(args);
2175 args = next;
2176 }
2177}
2178
2179static char *get_bprint_format(void *data, int size __unused, struct event *event)
2180{
2181 unsigned long long addr;
2182 static struct format_field *field;
2183 struct printk_map *printk;
2184 char *format;
2185 char *p;
2186
2187 if (!field) {
2188 field = find_field(event, "fmt");
2189 if (!field)
2190 die("can't find format field for binary printk");
2191 printf("field->offset = %d size=%d\n", field->offset, field->size);
2192 }
2193
2194 addr = read_size(data + field->offset, field->size);
2195
2196 printk = find_printk(addr);
2197 if (!printk) {
2198 format = malloc_or_die(45);
2199 sprintf(format, "%%pf : (NO FORMAT FOUND at %llx)\n",
2200 addr);
2201 return format;
2202 }
2203
2204 p = printk->printk;
2205 /* Remove any quotes. */
2206 if (*p == '"')
2207 p++;
2208 format = malloc_or_die(strlen(p) + 10);
2209 sprintf(format, "%s : %s", "%pf", p);
2210 /* remove ending quotes and new line since we will add one too */
2211 p = format + strlen(format) - 1;
2212 if (*p == '"')
2213 *p = 0;
2214
2215 p -= 2;
2216 if (strcmp(p, "\\n") == 0)
2217 *p = 0;
2218
2219 return format;
2220}
2221
2222static void pretty_print(void *data, int size, struct event *event)
2223{
2224 struct print_fmt *print_fmt = &event->print_fmt;
2225 struct print_arg *arg = print_fmt->args;
2226 struct print_arg *args = NULL;
2227 const char *ptr = print_fmt->format;
2228 unsigned long long val;
2229 struct func_map *func;
2230 const char *saveptr;
2231 char *bprint_fmt = NULL;
2232 char format[32];
2233 int show_func;
2234 int len;
2235 int ls;
2236
2237 if (event->flags & EVENT_FL_ISFUNC)
2238 ptr = " %pF <-- %pF";
2239
2240 if (event->flags & EVENT_FL_ISBPRINT) {
2241 bprint_fmt = get_bprint_format(data, size, event);
2242 args = make_bprint_args(bprint_fmt, data, size, event);
2243 arg = args;
2244 ptr = bprint_fmt;
2245 }
2246
2247 for (; *ptr; ptr++) {
2248 ls = 0;
2249 if (*ptr == '%') {
2250 saveptr = ptr;
2251 show_func = 0;
2252 cont_process:
2253 ptr++;
2254 switch (*ptr) {
2255 case '%':
2256 printf("%%");
2257 break;
2258 case 'l':
2259 ls++;
2260 goto cont_process;
2261 case 'L':
2262 ls = 2;
2263 goto cont_process;
2264 case 'z':
2265 case 'Z':
2266 case '0' ... '9':
2267 goto cont_process;
2268 case 'p':
2269 if (long_size == 4)
2270 ls = 1;
2271 else
2272 ls = 2;
2273
2274 if (*(ptr+1) == 'F' ||
2275 *(ptr+1) == 'f') {
2276 ptr++;
2277 show_func = *ptr;
2278 }
2279
2280 /* fall through */
2281 case 'd':
2282 case 'i':
2283 case 'x':
2284 case 'X':
2285 case 'u':
2286 if (!arg)
2287 die("no argument match");
2288
2289 len = ((unsigned long)ptr + 1) -
2290 (unsigned long)saveptr;
2291
2292 /* should never happen */
2293 if (len > 31)
2294 die("bad format!");
2295
2296 memcpy(format, saveptr, len);
2297 format[len] = 0;
2298
2299 val = eval_num_arg(data, size, event, arg);
2300 arg = arg->next;
2301
2302 if (show_func) {
2303 func = find_func(val);
2304 if (func) {
2305 printf("%s", func->func);
2306 if (show_func == 'F')
2307 printf("+0x%llx",
2308 val - func->addr);
2309 break;
2310 }
2311 }
2312 switch (ls) {
2313 case 0:
2314 printf(format, (int)val);
2315 break;
2316 case 1:
2317 printf(format, (long)val);
2318 break;
2319 case 2:
2320 printf(format, (long long)val);
2321 break;
2322 default:
2323 die("bad count (%d)", ls);
2324 }
2325 break;
2326 case 's':
2327 if (!arg)
2328 die("no matching argument");
2329
2330 print_str_arg(data, size, event, arg);
2331 arg = arg->next;
2332 break;
2333 default:
2334 printf(">%c<", *ptr);
2335
2336 }
2337 } else
2338 printf("%c", *ptr);
2339 }
2340
2341 if (args) {
2342 free_args(args);
2343 free(bprint_fmt);
2344 }
2345}
2346
2347static inline int log10_cpu(int nb)
2348{
2349 if (nb / 100)
2350 return 3;
2351 if (nb / 10)
2352 return 2;
2353 return 1;
2354}
2355
2356/* taken from Linux, written by Frederic Weisbecker */
2357static void print_graph_cpu(int cpu)
2358{
2359 int i;
2360 int log10_this = log10_cpu(cpu);
2361 int log10_all = log10_cpu(cpus);
2362
2363
2364 /*
2365 * Start with a space character - to make it stand out
2366 * to the right a bit when trace output is pasted into
2367 * email:
2368 */
2369 printf(" ");
2370
2371 /*
2372 * Tricky - we space the CPU field according to the max
2373 * number of online CPUs. On a 2-cpu system it would take
2374 * a maximum of 1 digit - on a 128 cpu system it would
2375 * take up to 3 digits:
2376 */
2377 for (i = 0; i < log10_all - log10_this; i++)
2378 printf(" ");
2379
2380 printf("%d) ", cpu);
2381}
2382
2383#define TRACE_GRAPH_PROCINFO_LENGTH 14
2384#define TRACE_GRAPH_INDENT 2
2385
2386static void print_graph_proc(int pid, const char *comm)
2387{
2388 /* sign + log10(MAX_INT) + '\0' */
2389 char pid_str[11];
2390 int spaces = 0;
2391 int len;
2392 int i;
2393
2394 sprintf(pid_str, "%d", pid);
2395
2396 /* 1 stands for the "-" character */
2397 len = strlen(comm) + strlen(pid_str) + 1;
2398
2399 if (len < TRACE_GRAPH_PROCINFO_LENGTH)
2400 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
2401
2402 /* First spaces to align center */
2403 for (i = 0; i < spaces / 2; i++)
2404 printf(" ");
2405
2406 printf("%s-%s", comm, pid_str);
2407
2408 /* Last spaces to align center */
2409 for (i = 0; i < spaces - (spaces / 2); i++)
2410 printf(" ");
2411}
2412
2413static struct record *
2414get_return_for_leaf(int cpu, int cur_pid, unsigned long long cur_func,
2415 struct record *next)
2416{
2417 struct format_field *field;
2418 struct event *event;
2419 unsigned long val;
2420 int type;
2421 int pid;
2422
2423 type = parse_common_type(next->data);
2424 event = find_event(type);
2425 if (!event)
2426 return NULL;
2427
2428 if (!(event->flags & EVENT_FL_ISFUNCRET))
2429 return NULL;
2430
2431 pid = parse_common_pid(next->data);
2432 field = find_field(event, "func");
2433 if (!field)
2434 die("function return does not have field func");
2435
2436 val = read_size(next->data + field->offset, field->size);
2437
2438 if (cur_pid != pid || cur_func != val)
2439 return NULL;
2440
2441 /* this is a leaf, now advance the iterator */
2442 return trace_read_data(cpu);
2443}
2444
2445/* Signal an execution-time overhead marker to the output */
2446static void print_graph_overhead(unsigned long long duration)
2447{
2448 /* Non-nested entry or return */
2449 if (duration == ~0ULL)
2450 return (void)printf(" ");
2451
2452 /* Duration exceeded 100 usecs */
2453 if (duration > 100000ULL)
2454 return (void)printf("! ");
2455
2456 /* Duration exceeded 10 usecs */
2457 if (duration > 10000ULL)
2458 return (void)printf("+ ");
2459
2460 printf(" ");
2461}
2462
2463static void print_graph_duration(unsigned long long duration)
2464{
2465 unsigned long usecs = duration / 1000;
2466 unsigned long nsecs_rem = duration % 1000;
2467 /* log10(ULONG_MAX) + '\0' */
2468 char msecs_str[21];
2469 char nsecs_str[5];
2470 int len;
2471 int i;
2472
2473 sprintf(msecs_str, "%lu", usecs);
2474
2475 /* Print usecs */
2476 len = printf("%lu", usecs);
2477
2478 /* Print nsecs (we don't want to exceed 7 digits) */
2479 if (len < 7) {
2480 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem);
2481 len += printf(".%s", nsecs_str);
2482 }
2483
2484 printf(" us ");
2485
2486 /* Print remaining spaces to fit the row's width */
2487 for (i = len; i < 7; i++)
2488 printf(" ");
2489
2490 printf("| ");
2491}
2492
2493static void
2494print_graph_entry_leaf(struct event *event, void *data, struct record *ret_rec)
2495{
2496 unsigned long long rettime, calltime;
2497 unsigned long long duration, depth;
2498 unsigned long long val;
2499 struct format_field *field;
2500 struct func_map *func;
2501 struct event *ret_event;
2502 int type;
2503 int i;
2504
2505 type = parse_common_type(ret_rec->data);
2506 ret_event = find_event(type);
2507
2508 field = find_field(ret_event, "rettime");
2509 if (!field)
2510 die("can't find rettime in return graph");
2511 rettime = read_size(ret_rec->data + field->offset, field->size);
2512
2513 field = find_field(ret_event, "calltime");
2514 if (!field)
2515 die("can't find calltime in return graph");
2516 calltime = read_size(ret_rec->data + field->offset, field->size);
2517
2518 duration = rettime - calltime;
2519
2520 /* Overhead */
2521 print_graph_overhead(duration);
2522
2523 /* Duration */
2524 print_graph_duration(duration);
2525
2526 field = find_field(event, "depth");
2527 if (!field)
2528 die("can't find depth in entry graph");
2529 depth = read_size(data + field->offset, field->size);
2530
2531 /* Function */
2532 for (i = 0; i < (int)(depth * TRACE_GRAPH_INDENT); i++)
2533 printf(" ");
2534
2535 field = find_field(event, "func");
2536 if (!field)
2537 die("can't find func in entry graph");
2538 val = read_size(data + field->offset, field->size);
2539 func = find_func(val);
2540
2541 if (func)
2542 printf("%s();", func->func);
2543 else
2544 printf("%llx();", val);
2545}
2546
2547static void print_graph_nested(struct event *event, void *data)
2548{
2549 struct format_field *field;
2550 unsigned long long depth;
2551 unsigned long long val;
2552 struct func_map *func;
2553 int i;
2554
2555 /* No overhead */
2556 print_graph_overhead(-1);
2557
2558 /* No time */
2559 printf(" | ");
2560
2561 field = find_field(event, "depth");
2562 if (!field)
2563 die("can't find depth in entry graph");
2564 depth = read_size(data + field->offset, field->size);
2565
2566 /* Function */
2567 for (i = 0; i < (int)(depth * TRACE_GRAPH_INDENT); i++)
2568 printf(" ");
2569
2570 field = find_field(event, "func");
2571 if (!field)
2572 die("can't find func in entry graph");
2573 val = read_size(data + field->offset, field->size);
2574 func = find_func(val);
2575
2576 if (func)
2577 printf("%s() {", func->func);
2578 else
2579 printf("%llx() {", val);
2580}
2581
2582static void
2583pretty_print_func_ent(void *data, int size, struct event *event,
2584 int cpu, int pid, const char *comm,
2585 unsigned long secs, unsigned long usecs)
2586{
2587 struct format_field *field;
2588 struct record *rec;
2589 void *copy_data;
2590 unsigned long val;
2591
2592 printf("%5lu.%06lu | ", secs, usecs);
2593
2594 print_graph_cpu(cpu);
2595 print_graph_proc(pid, comm);
2596
2597 printf(" | ");
2598
2599 field = find_field(event, "func");
2600 if (!field)
2601 die("function entry does not have func field");
2602
2603 val = read_size(data + field->offset, field->size);
2604
2605 /*
2606 * peek_data may unmap the data pointer. Copy it first.
2607 */
2608 copy_data = malloc_or_die(size);
2609 memcpy(copy_data, data, size);
2610 data = copy_data;
2611
2612 rec = trace_peek_data(cpu);
2613 if (rec) {
2614 rec = get_return_for_leaf(cpu, pid, val, rec);
2615 if (rec) {
2616 print_graph_entry_leaf(event, data, rec);
2617 goto out_free;
2618 }
2619 }
2620 print_graph_nested(event, data);
2621out_free:
2622 free(data);
2623}
2624
2625static void
2626pretty_print_func_ret(void *data, int size __unused, struct event *event,
2627 int cpu, int pid, const char *comm,
2628 unsigned long secs, unsigned long usecs)
2629{
2630 unsigned long long rettime, calltime;
2631 unsigned long long duration, depth;
2632 struct format_field *field;
2633 int i;
2634
2635 printf("%5lu.%06lu | ", secs, usecs);
2636
2637 print_graph_cpu(cpu);
2638 print_graph_proc(pid, comm);
2639
2640 printf(" | ");
2641
2642 field = find_field(event, "rettime");
2643 if (!field)
2644 die("can't find rettime in return graph");
2645 rettime = read_size(data + field->offset, field->size);
2646
2647 field = find_field(event, "calltime");
2648 if (!field)
2649 die("can't find calltime in return graph");
2650 calltime = read_size(data + field->offset, field->size);
2651
2652 duration = rettime - calltime;
2653
2654 /* Overhead */
2655 print_graph_overhead(duration);
2656
2657 /* Duration */
2658 print_graph_duration(duration);
2659
2660 field = find_field(event, "depth");
2661 if (!field)
2662 die("can't find depth in return graph");
2663 depth = read_size(data + field->offset, field->size);
2664
2665 /* Function */
2666 for (i = 0; i < (int)(depth * TRACE_GRAPH_INDENT); i++)
2667 printf(" ");
2668
2669 printf("}");
2670}
2671
2672static void
2673pretty_print_func_graph(void *data, int size, struct event *event,
2674 int cpu, int pid, const char *comm,
2675 unsigned long secs, unsigned long usecs)
2676{
2677 if (event->flags & EVENT_FL_ISFUNCENT)
2678 pretty_print_func_ent(data, size, event,
2679 cpu, pid, comm, secs, usecs);
2680 else if (event->flags & EVENT_FL_ISFUNCRET)
2681 pretty_print_func_ret(data, size, event,
2682 cpu, pid, comm, secs, usecs);
2683 printf("\n");
2684}
2685
2686void print_event(int cpu, void *data, int size, unsigned long long nsecs,
2687 char *comm)
2688{
2689 struct event *event;
2690 unsigned long secs;
2691 unsigned long usecs;
2692 int type;
2693 int pid;
2694
2695 secs = nsecs / NSECS_PER_SEC;
2696 nsecs -= secs * NSECS_PER_SEC;
2697 usecs = nsecs / NSECS_PER_USEC;
2698
2699 type = parse_common_type(data);
2700
2701 event = find_event(type);
2702 if (!event)
2703 die("ug! no event found for type %d", type);
2704
2705 pid = parse_common_pid(data);
2706
2707 if (event->flags & (EVENT_FL_ISFUNCENT | EVENT_FL_ISFUNCRET))
2708 return pretty_print_func_graph(data, size, event, cpu,
2709 pid, comm, secs, usecs);
2710
2711 printf("%16s-%-5d [%03d] %5lu.%09Lu: %s: ",
2712 comm, pid, cpu,
2713 secs, nsecs, event->name);
2714
2715 pretty_print(data, size, event);
2716 printf("\n");
2717}
2718
2719static void print_fields(struct print_flag_sym *field)
2720{
2721 printf("{ %s, %s }", field->value, field->str);
2722 if (field->next) {
2723 printf(", ");
2724 print_fields(field->next);
2725 }
2726}
2727
2728static void print_args(struct print_arg *args)
2729{
2730 int print_paren = 1;
2731
2732 switch (args->type) {
2733 case PRINT_NULL:
2734 printf("null");
2735 break;
2736 case PRINT_ATOM:
2737 printf("%s", args->atom.atom);
2738 break;
2739 case PRINT_FIELD:
2740 printf("REC->%s", args->field.name);
2741 break;
2742 case PRINT_FLAGS:
2743 printf("__print_flags(");
2744 print_args(args->flags.field);
2745 printf(", %s, ", args->flags.delim);
2746 print_fields(args->flags.flags);
2747 printf(")");
2748 break;
2749 case PRINT_SYMBOL:
2750 printf("__print_symbolic(");
2751 print_args(args->symbol.field);
2752 printf(", ");
2753 print_fields(args->symbol.symbols);
2754 printf(")");
2755 break;
2756 case PRINT_STRING:
2757 printf("__get_str(%s)", args->string.string);
2758 break;
2759 case PRINT_TYPE:
2760 printf("(%s)", args->typecast.type);
2761 print_args(args->typecast.item);
2762 break;
2763 case PRINT_OP:
2764 if (strcmp(args->op.op, ":") == 0)
2765 print_paren = 0;
2766 if (print_paren)
2767 printf("(");
2768 print_args(args->op.left);
2769 printf(" %s ", args->op.op);
2770 print_args(args->op.right);
2771 if (print_paren)
2772 printf(")");
2773 break;
2774 default:
2775 /* we should warn... */
2776 return;
2777 }
2778 if (args->next) {
2779 printf("\n");
2780 print_args(args->next);
2781 }
2782}
2783
2784static void parse_header_field(char *type,
2785 int *offset, int *size)
2786{
2787 char *token;
2788
2789 if (read_expected(EVENT_ITEM, (char *)"field") < 0)
2790 return;
2791 if (read_expected(EVENT_OP, (char *)":") < 0)
2792 return;
2793 /* type */
2794 if (read_expect_type(EVENT_ITEM, &token) < 0)
2795 return;
2796 free_token(token);
2797
2798 if (read_expected(EVENT_ITEM, type) < 0)
2799 return;
2800 if (read_expected(EVENT_OP, (char *)";") < 0)
2801 return;
2802 if (read_expected(EVENT_ITEM, (char *)"offset") < 0)
2803 return;
2804 if (read_expected(EVENT_OP, (char *)":") < 0)
2805 return;
2806 if (read_expect_type(EVENT_ITEM, &token) < 0)
2807 return;
2808 *offset = atoi(token);
2809 free_token(token);
2810 if (read_expected(EVENT_OP, (char *)";") < 0)
2811 return;
2812 if (read_expected(EVENT_ITEM, (char *)"size") < 0)
2813 return;
2814 if (read_expected(EVENT_OP, (char *)":") < 0)
2815 return;
2816 if (read_expect_type(EVENT_ITEM, &token) < 0)
2817 return;
2818 *size = atoi(token);
2819 free_token(token);
2820 if (read_expected(EVENT_OP, (char *)";") < 0)
2821 return;
2822 if (read_expect_type(EVENT_NEWLINE, &token) < 0)
2823 return;
2824 free_token(token);
2825}
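/*
 * Rough sketch of the lines parse_header_field() expects to consume from
 * the tracing header_page file (exact types and sizes vary by kernel):
 *
 *   field: u64 timestamp;   offset:0;   size:8;
 *   field: local_t commit;  offset:8;   size:8;
 *   field: char data;       offset:16;  size:4080;
 */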
2826
2827int parse_header_page(char *buf, unsigned long size)
2828{
2829 init_input_buf(buf, size);
2830
2831 parse_header_field((char *)"timestamp", &header_page_ts_offset,
2832 &header_page_ts_size);
2833 parse_header_field((char *)"commit", &header_page_size_offset,
2834 &header_page_size_size);
2835 parse_header_field((char *)"data", &header_page_data_offset,
2836 &header_page_data_size);
2837
2838 return 0;
2839}
2840
2841int parse_ftrace_file(char *buf, unsigned long size)
2842{
2843 struct format_field *field;
2844 struct print_arg *arg, **list;
2845 struct event *event;
2846 int ret;
2847
2848 init_input_buf(buf, size);
2849
2850 event = alloc_event();
2851 if (!event)
2852 return -ENOMEM;
2853
2854 event->flags |= EVENT_FL_ISFTRACE;
2855
2856 event->name = event_read_name();
2857 if (!event->name)
2858 die("failed to read ftrace event name");
2859
2860 if (strcmp(event->name, "function") == 0)
2861 event->flags |= EVENT_FL_ISFUNC;
2862
2863 else if (strcmp(event->name, "funcgraph_entry") == 0)
2864 event->flags |= EVENT_FL_ISFUNCENT;
2865
2866 else if (strcmp(event->name, "funcgraph_exit") == 0)
2867 event->flags |= EVENT_FL_ISFUNCRET;
2868
2869 else if (strcmp(event->name, "bprint") == 0)
2870 event->flags |= EVENT_FL_ISBPRINT;
2871
2872 event->id = event_read_id();
2873 if (event->id < 0)
2874 die("failed to read ftrace event id");
2875
2876 add_event(event);
2877
2878 ret = event_read_format(event);
2879 if (ret < 0)
2880 die("failed to read ftrace event format");
2881
2882 ret = event_read_print(event);
2883 if (ret < 0)
2884 die("failed to read ftrace event print fmt");
2885
2886 /*
2887 * The arguments for ftrace files are parsed by the fields.
2888 * Set up the fields as their arguments.
2889 */
2890 list = &event->print_fmt.args;
2891 for (field = event->format.fields; field; field = field->next) {
2892 arg = malloc_or_die(sizeof(*arg));
2893 memset(arg, 0, sizeof(*arg));
2894 *list = arg;
2895 list = &arg->next;
2896 arg->type = PRINT_FIELD;
2897 arg->field.name = field->name;
2898 arg->field.field = field;
2899 }
2900 return 0;
2901}
2902
2903int parse_event_file(char *buf, unsigned long size, char *system__unused __unused)
2904{
2905 struct event *event;
2906 int ret;
2907
2908 init_input_buf(buf, size);
2909
2910 event = alloc_event();
2911 if (!event)
2912 return -ENOMEM;
2913
2914 event->name = event_read_name();
2915 if (!event->name)
2916 die("failed to read event name");
2917
2918 event->id = event_read_id();
2919 if (event->id < 0)
2920 die("failed to read event id");
2921
2922 ret = event_read_format(event);
2923 if (ret < 0)
2924 die("failed to read event format");
2925
2926 ret = event_read_print(event);
2927 if (ret < 0)
2928 die("failed to read event print fmt");
2929
2930#define PRINT_ARGS 0
2931 if (PRINT_ARGS && event->print_fmt.args)
2932 print_args(event->print_fmt.args);
2933
2934 add_event(event);
2935 return 0;
2936}
2937
2938void parse_set_info(int nr_cpus, int long_sz)
2939{
2940 cpus = nr_cpus;
2941 long_size = long_sz;
2942}
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
new file mode 100644
index 000000000000..a1217a10632f
--- /dev/null
+++ b/tools/perf/util/trace-event-read.c
@@ -0,0 +1,512 @@
1/*
2 * Copyright (C) 2009, Steven Rostedt <srostedt@redhat.com>
3 *
4 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; version 2 of the License (not later!)
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 */
21#define _LARGEFILE64_SOURCE
22
23#include <dirent.h>
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <getopt.h>
28#include <stdarg.h>
29#include <sys/types.h>
30#include <sys/stat.h>
31#include <sys/wait.h>
32#include <sys/mman.h>
33#include <pthread.h>
34#include <fcntl.h>
35#include <unistd.h>
36#include <ctype.h>
37#include <errno.h>
38
39#include "../perf.h"
40#include "util.h"
41#include "trace-event.h"
42
43static int input_fd;
44
45static int read_page;
46
47int file_bigendian;
48int host_bigendian;
49static int long_size;
50
51static unsigned long page_size;
52
53static int read_or_die(void *data, int size)
54{
55 int r;
56
57 r = read(input_fd, data, size);
58 if (r != size)
59 die("reading input file (size expected=%d received=%d)",
60 size, r);
61 return r;
62}
63
64static unsigned int read4(void)
65{
66 unsigned int data;
67
68 read_or_die(&data, 4);
69 return __data2host4(data);
70}
71
72static unsigned long long read8(void)
73{
74 unsigned long long data;
75
76 read_or_die(&data, 8);
77 return __data2host8(data);
78}
79
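/*
 * read_string() reads a NUL-terminated string of unknown length by
 * buffering BUFSIZ chunks, then seeks the file descriptor back to just
 * past the terminating NUL.
 */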
80static char *read_string(void)
81{
82 char buf[BUFSIZ];
83 char *str = NULL;
84 int size = 0;
85 int i;
86 int r;
87
88 for (;;) {
89 r = read(input_fd, buf, BUFSIZ);
90 if (r < 0)
91 die("reading input file");
92
93 if (!r)
94 die("no data");
95
96 for (i = 0; i < r; i++) {
97 if (!buf[i])
98 break;
99 }
100 if (i < r)
101 break;
102
103 if (str) {
104 size += BUFSIZ;
105 str = realloc(str, size);
106 if (!str)
107 die("malloc of size %d", size);
108 memcpy(str + (size - BUFSIZ), buf, BUFSIZ);
109 } else {
110 size = BUFSIZ;
111 str = malloc_or_die(size);
112 memcpy(str, buf, size);
113 }
114 }
115
116 /* trailing \0: */
117 i++;
118
119 /* move the file descriptor to the end of the string */
120 r = lseek(input_fd, -(r - i), SEEK_CUR);
121 if (r < 0)
122 die("lseek");
123
124 if (str) {
125 size += i;
126 str = realloc(str, size);
127 if (!str)
128 die("malloc of size %d", size);
129 memcpy(str + (size - i), buf, i);
130 } else {
131 size = i;
132 str = malloc_or_die(i);
133 memcpy(str, buf, i);
134 }
135
136 return str;
137}
138
139static void read_proc_kallsyms(void)
140{
141 unsigned int size;
142 char *buf;
143
144 size = read4();
145 if (!size)
146 return;
147
148 buf = malloc_or_die(size);
149 read_or_die(buf, size);
150
151 parse_proc_kallsyms(buf, size);
152
153 free(buf);
154}
155
156static void read_ftrace_printk(void)
157{
158 unsigned int size;
159 char *buf;
160
161 size = read4();
162 if (!size)
163 return;
164
165 buf = malloc_or_die(size);
166 read_or_die(buf, size);
167
168 parse_ftrace_printk(buf, size);
169
170 free(buf);
171}
172
173static void read_header_files(void)
174{
175 unsigned long long size;
176 char *header_page;
177 char *header_event;
178 char buf[BUFSIZ];
179
180 read_or_die(buf, 12);
181
182 if (memcmp(buf, "header_page", 12) != 0)
183 die("did not read header page");
184
185 size = read8();
186 header_page = malloc_or_die(size);
187 read_or_die(header_page, size);
188 parse_header_page(header_page, size);
189 free(header_page);
190
191 /*
192 * The size field in the page is of type long,
193 * use that instead, since it represents the kernel.
194 */
195 long_size = header_page_size_size;
196
197 read_or_die(buf, 13);
198 if (memcmp(buf, "header_event", 13) != 0)
199 die("did not read header event");
200
201 size = read8();
202 header_event = malloc_or_die(size);
203 read_or_die(header_event, size);
204 free(header_event);
205}
206
207static void read_ftrace_file(unsigned long long size)
208{
209 char *buf;
210
211 buf = malloc_or_die(size);
212 read_or_die(buf, size);
213 parse_ftrace_file(buf, size);
214 free(buf);
215}
216
217static void read_event_file(char *sys, unsigned long long size)
218{
219 char *buf;
220
221 buf = malloc_or_die(size);
222 read_or_die(buf, size);
223 parse_event_file(buf, size, sys);
224 free(buf);
225}
226
227static void read_ftrace_files(void)
228{
229 unsigned long long size;
230 int count;
231 int i;
232
233 count = read4();
234
235 for (i = 0; i < count; i++) {
236 size = read8();
237 read_ftrace_file(size);
238 }
239}
240
241static void read_event_files(void)
242{
243 unsigned long long size;
244 char *sys;
245 int systems;
246 int count;
247 int i, x;
248
249 systems = read4();
250
251 for (i = 0; i < systems; i++) {
252 sys = read_string();
253
254 count = read4();
255 for (x = 0; x < count; x++) {
256 size = read8();
257 read_event_file(sys, size);
258 }
259 }
260}
261
262struct cpu_data {
263 unsigned long long offset;
264 unsigned long long size;
265 unsigned long long timestamp;
266 struct record *next;
267 char *page;
268 int cpu;
269 int index;
270 int page_size;
271};
272
273static struct cpu_data *cpu_data;
274
275static void update_cpu_data_index(int cpu)
276{
277 cpu_data[cpu].offset += page_size;
278 cpu_data[cpu].size -= page_size;
279 cpu_data[cpu].index = 0;
280}
281
282static void get_next_page(int cpu)
283{
284 off64_t save_seek;
285 off64_t ret;
286
287 if (!cpu_data[cpu].page)
288 return;
289
290 if (read_page) {
291 if (cpu_data[cpu].size <= page_size) {
292 free(cpu_data[cpu].page);
293 cpu_data[cpu].page = NULL;
294 return;
295 }
296
297 update_cpu_data_index(cpu);
298
299 /* other parts of the code may expect the pointer to not move */
300 save_seek = lseek64(input_fd, 0, SEEK_CUR);
301
302 ret = lseek64(input_fd, cpu_data[cpu].offset, SEEK_SET);
303 if (ret < 0)
304 die("failed to lseek");
305 ret = read(input_fd, cpu_data[cpu].page, page_size);
306 if (ret < 0)
307 die("failed to read page");
308
309 /* reset the file pointer back */
310 lseek64(input_fd, save_seek, SEEK_SET);
311
312 return;
313 }
314
315 munmap(cpu_data[cpu].page, page_size);
316 cpu_data[cpu].page = NULL;
317
318 if (cpu_data[cpu].size <= page_size)
319 return;
320
321 update_cpu_data_index(cpu);
322
323 cpu_data[cpu].page = mmap(NULL, page_size, PROT_READ, MAP_PRIVATE,
324 input_fd, cpu_data[cpu].offset);
325 if (cpu_data[cpu].page == MAP_FAILED)
326 die("failed to mmap cpu %d at offset 0x%llx",
327 cpu, cpu_data[cpu].offset);
328}
329
330static unsigned int type_len4host(unsigned int type_len_ts)
331{
332 if (file_bigendian)
333 return (type_len_ts >> 27) & ((1 << 5) - 1);
334 else
335 return type_len_ts & ((1 << 5) - 1);
336}
337
338static unsigned int ts4host(unsigned int type_len_ts)
339{
340 if (file_bigendian)
341 return type_len_ts & ((1 << 27) - 1);
342 else
343 return type_len_ts >> 5;
344}
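/*
 * Each ring-buffer event begins with a 32-bit word packing a 5-bit
 * type_len and a 27-bit timestamp delta (TS_SHIFT); the two helpers
 * above unpack those fields according to the file's endianness.
 */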
345
346static int calc_index(void *ptr, int cpu)
347{
348 return (unsigned long)ptr - (unsigned long)cpu_data[cpu].page;
349}
350
351struct record *trace_peek_data(int cpu)
352{
353 struct record *data;
354 void *page = cpu_data[cpu].page;
355 int idx = cpu_data[cpu].index;
356 void *ptr = page + idx;
357 unsigned long long extend;
358 unsigned int type_len_ts;
359 unsigned int type_len;
360 unsigned int delta;
361 unsigned int length = 0;
362
363 if (cpu_data[cpu].next)
364 return cpu_data[cpu].next;
365
366 if (!page)
367 return NULL;
368
369 if (!idx) {
370 /* FIXME: handle header page */
371 if (header_page_ts_size != 8)
372 die("expected a long long type for timestamp");
373 cpu_data[cpu].timestamp = data2host8(ptr);
374 ptr += 8;
375 switch (header_page_size_size) {
376 case 4:
377 cpu_data[cpu].page_size = data2host4(ptr);
378 ptr += 4;
379 break;
380 case 8:
381 cpu_data[cpu].page_size = data2host8(ptr);
382 ptr += 8;
383 break;
384 default:
385 die("bad long size");
386 }
387 ptr = cpu_data[cpu].page + header_page_data_offset;
388 }
389
390read_again:
391 idx = calc_index(ptr, cpu);
392
393 if (idx >= cpu_data[cpu].page_size) {
394 get_next_page(cpu);
395 return trace_peek_data(cpu);
396 }
397
398 type_len_ts = data2host4(ptr);
399 ptr += 4;
400
401 type_len = type_len4host(type_len_ts);
402 delta = ts4host(type_len_ts);
403
404 switch (type_len) {
405 case RINGBUF_TYPE_PADDING:
406 if (!delta)
407 die("error, hit unexpected end of page");
408 length = data2host4(ptr);
409 ptr += 4;
410 length *= 4;
411 ptr += length;
412 goto read_again;
413
414 case RINGBUF_TYPE_TIME_EXTEND:
415 extend = data2host4(ptr);
416 ptr += 4;
417 extend <<= TS_SHIFT;
418 extend += delta;
419 cpu_data[cpu].timestamp += extend;
420 goto read_again;
421
422 case RINGBUF_TYPE_TIME_STAMP:
423 ptr += 12;
424 break;
425 case 0:
426 length = data2host4(ptr);
427 ptr += 4;
428 die("here! length=%d", length);
429 break;
430 default:
431 length = type_len * 4;
432 break;
433 }
434
435 cpu_data[cpu].timestamp += delta;
436
437 data = malloc_or_die(sizeof(*data));
438 memset(data, 0, sizeof(*data));
439
440 data->ts = cpu_data[cpu].timestamp;
441 data->size = length;
442 data->data = ptr;
443 ptr += length;
444
445 cpu_data[cpu].index = calc_index(ptr, cpu);
446 cpu_data[cpu].next = data;
447
448 return data;
449}
450
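/*
 * trace_peek_data() caches the decoded record in cpu_data[cpu].next so
 * repeated peeks are cheap; trace_read_data() below consumes that cached
 * record and lets the iterator advance to the next one.
 */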
451struct record *trace_read_data(int cpu)
452{
453 struct record *data;
454
455 data = trace_peek_data(cpu);
456 cpu_data[cpu].next = NULL;
457
458 return data;
459}
460
461void trace_report (void)
462{
463 const char *input_file = "trace.info";
464 char buf[BUFSIZ];
465 char test[] = { 23, 8, 68 };
466 char *version;
467 int show_funcs = 0;
468 int show_printk = 0;
469
470 input_fd = open(input_file, O_RDONLY);
471 if (input_fd < 0)
472 die("opening '%s'\n", input_file);
473
474 read_or_die(buf, 3);
475 if (memcmp(buf, test, 3) != 0)
476 die("not a trace data file");
477
478 read_or_die(buf, 7);
479 if (memcmp(buf, "tracing", 7) != 0)
480 die("not a trace file (missing tracing)");
481
482 version = read_string();
483 printf("version = %s\n", version);
484 free(version);
485
486 read_or_die(buf, 1);
487 file_bigendian = buf[0];
488 host_bigendian = bigendian();
489
490 read_or_die(buf, 1);
491 long_size = buf[0];
492
493 page_size = read4();
494
495 read_header_files();
496
497 read_ftrace_files();
498 read_event_files();
499 read_proc_kallsyms();
500 read_ftrace_printk();
501
502 if (show_funcs) {
503 print_funcs();
504 return;
505 }
506 if (show_printk) {
507 print_printk();
508 return;
509 }
510
511 return;
512}
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
new file mode 100644
index 000000000000..420294a5773e
--- /dev/null
+++ b/tools/perf/util/trace-event.h
@@ -0,0 +1,240 @@
1#ifndef _TRACE_EVENTS_H
2#define _TRACE_EVENTS_H
3
4#include "parse-events.h"
5
6#define __unused __attribute__((unused))
7
8
9#ifndef PAGE_MASK
10#define PAGE_MASK (page_size - 1)
11#endif
12
13enum {
14 RINGBUF_TYPE_PADDING = 29,
15 RINGBUF_TYPE_TIME_EXTEND = 30,
16 RINGBUF_TYPE_TIME_STAMP = 31,
17};
18
19#ifndef TS_SHIFT
20#define TS_SHIFT 27
21#endif
22
23#define NSECS_PER_SEC 1000000000ULL
24#define NSECS_PER_USEC 1000ULL
25
26enum format_flags {
27 FIELD_IS_ARRAY = 1,
28 FIELD_IS_POINTER = 2,
29};
30
31struct format_field {
32 struct format_field *next;
33 char *type;
34 char *name;
35 int offset;
36 int size;
37 unsigned long flags;
38};
39
40struct format {
41 int nr_common;
42 int nr_fields;
43 struct format_field *common_fields;
44 struct format_field *fields;
45};
46
47struct print_arg_atom {
48 char *atom;
49};
50
51struct print_arg_string {
52 char *string;
53 int offset;
54};
55
56struct print_arg_field {
57 char *name;
58 struct format_field *field;
59};
60
61struct print_flag_sym {
62 struct print_flag_sym *next;
63 char *value;
64 char *str;
65};
66
67struct print_arg_typecast {
68 char *type;
69 struct print_arg *item;
70};
71
72struct print_arg_flags {
73 struct print_arg *field;
74 char *delim;
75 struct print_flag_sym *flags;
76};
77
78struct print_arg_symbol {
79 struct print_arg *field;
80 struct print_flag_sym *symbols;
81};
82
83struct print_arg;
84
85struct print_arg_op {
86 char *op;
87 int prio;
88 struct print_arg *left;
89 struct print_arg *right;
90};
91
92struct print_arg_func {
93 char *name;
94 struct print_arg *args;
95};
96
97enum print_arg_type {
98 PRINT_NULL,
99 PRINT_ATOM,
100 PRINT_FIELD,
101 PRINT_FLAGS,
102 PRINT_SYMBOL,
103 PRINT_TYPE,
104 PRINT_STRING,
105 PRINT_OP,
106};
107
108struct print_arg {
109 struct print_arg *next;
110 enum print_arg_type type;
111 union {
112 struct print_arg_atom atom;
113 struct print_arg_field field;
114 struct print_arg_typecast typecast;
115 struct print_arg_flags flags;
116 struct print_arg_symbol symbol;
117 struct print_arg_func func;
118 struct print_arg_string string;
119 struct print_arg_op op;
120 };
121};
122
123struct print_fmt {
124 char *format;
125 struct print_arg *args;
126};
127
128struct event {
129 struct event *next;
130 char *name;
131 int id;
132 int flags;
133 struct format format;
134 struct print_fmt print_fmt;
135};
136
137enum {
138 EVENT_FL_ISFTRACE = 1,
139 EVENT_FL_ISPRINT = 2,
140 EVENT_FL_ISBPRINT = 4,
141 EVENT_FL_ISFUNC = 8,
142 EVENT_FL_ISFUNCENT = 16,
143 EVENT_FL_ISFUNCRET = 32,
144};
145
146struct record {
147 unsigned long long ts;
148 int size;
149 void *data;
150};
151
152struct record *trace_peek_data(int cpu);
153struct record *trace_read_data(int cpu);
154
155void parse_set_info(int nr_cpus, int long_sz);
156
157void trace_report(void);
158
159void *malloc_or_die(unsigned int size);
160
161void parse_cmdlines(char *file, int size);
162void parse_proc_kallsyms(char *file, unsigned int size);
163void parse_ftrace_printk(char *file, unsigned int size);
164
165void print_funcs(void);
166void print_printk(void);
167
168int parse_ftrace_file(char *buf, unsigned long size);
169int parse_event_file(char *buf, unsigned long size, char *system);
170void print_event(int cpu, void *data, int size, unsigned long long nsecs,
171 char *comm);
172
173extern int file_bigendian;
174extern int host_bigendian;
175
176int bigendian(void);
177
178static inline unsigned short __data2host2(unsigned short data)
179{
180 unsigned short swap;
181
182 if (host_bigendian == file_bigendian)
183 return data;
184
185 swap = ((data & 0xffULL) << 8) |
186 ((data & (0xffULL << 8)) >> 8);
187
188 return swap;
189}
190
191static inline unsigned int __data2host4(unsigned int data)
192{
193 unsigned int swap;
194
195 if (host_bigendian == file_bigendian)
196 return data;
197
198 swap = ((data & 0xffULL) << 24) |
199 ((data & (0xffULL << 8)) << 8) |
200 ((data & (0xffULL << 16)) >> 8) |
201 ((data & (0xffULL << 24)) >> 24);
202
203 return swap;
204}
205
206static inline unsigned long long __data2host8(unsigned long long data)
207{
208 unsigned long long swap;
209
210 if (host_bigendian == file_bigendian)
211 return data;
212
213 swap = ((data & 0xffULL) << 56) |
214 ((data & (0xffULL << 8)) << 40) |
215 ((data & (0xffULL << 16)) << 24) |
216 ((data & (0xffULL << 24)) << 8) |
217 ((data & (0xffULL << 32)) >> 8) |
218 ((data & (0xffULL << 40)) >> 24) |
219 ((data & (0xffULL << 48)) >> 40) |
220 ((data & (0xffULL << 56)) >> 56);
221
222 return swap;
223}
224
225#define data2host2(ptr) __data2host2(*(unsigned short *)ptr)
226#define data2host4(ptr) __data2host4(*(unsigned int *)ptr)
227#define data2host8(ptr) __data2host8(*(unsigned long long *)ptr)
228
229extern int header_page_ts_offset;
230extern int header_page_ts_size;
231extern int header_page_size_offset;
232extern int header_page_size_size;
233extern int header_page_data_offset;
234extern int header_page_data_size;
235
236int parse_header_page(char *buf, unsigned long size);
237
238void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters);
239
240#endif /* _TRACE_EVENTS_H */
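
The __data2host*() helpers at the end of this header convert values that were written in the trace file's byte order into host byte order, swapping only when file_bigendian and host_bigendian disagree; the data2host*() macros apply the same conversion directly to a pointer into the raw data. The self-contained sketch below mirrors that logic for the 4-byte case; the detection and conversion functions here are local to the example, not the perf implementation.

#include <stdio.h>

/* local stand-in for bigendian(), not the perf implementation */
static int is_host_bigendian(void)
{
	unsigned int probe = 0x01020304;
	unsigned char *p = (unsigned char *)&probe;

	return p[0] == 0x01;	/* most significant byte stored first */
}

static unsigned int to_host4(unsigned int data, int file_bigendian)
{
	if (is_host_bigendian() == file_bigendian)
		return data;

	return ((data & 0x000000ffU) << 24) |
	       ((data & 0x0000ff00U) <<  8) |
	       ((data & 0x00ff0000U) >>  8) |
	       ((data & 0xff000000U) >> 24);
}

int main(void)
{
	unsigned int raw = 0x11223344;	/* as read from the trace data */

	/* pretend the file header recorded a little endian writer (0) */
	printf("raw 0x%08x -> host 0x%08x\n", raw, to_host4(raw, 0));
	return 0;
}
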
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 68fe157d72fb..9de2329dd44d 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -39,10 +39,6 @@
 /* Approximation of the length of the decimal representation of this type. */
 #define decimal_length(x) ((int)(sizeof(x) * 2.56 + 0.5) + 1)
 
-#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__USLC__) && !defined(_M_UNIX)
-#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
-#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
-#endif
 #define _ALL_SOURCE 1
 #define _GNU_SOURCE 1
 #define _BSD_SOURCE 1
@@ -83,6 +79,7 @@
 #include <inttypes.h>
 #include "../../../include/linux/magic.h"
 
+
 #ifndef NO_ICONV
 #include <iconv.h>
 #endif
@@ -310,6 +307,7 @@ static inline int has_extension(const char *filename, const char *ext)
 #undef isspace
 #undef isdigit
 #undef isalpha
+#undef isprint
 #undef isalnum
 #undef tolower
 #undef toupper
diff --git a/tools/perf/util/values.c b/tools/perf/util/values.c
new file mode 100644
index 000000000000..1c15e39f99e3
--- /dev/null
+++ b/tools/perf/util/values.c
@@ -0,0 +1,230 @@
1#include <stdlib.h>
2
3#include "util.h"
4#include "values.h"
5
6void perf_read_values_init(struct perf_read_values *values)
7{
8 values->threads_max = 16;
9 values->pid = malloc(values->threads_max * sizeof(*values->pid));
10 values->tid = malloc(values->threads_max * sizeof(*values->tid));
11 values->value = malloc(values->threads_max * sizeof(*values->value));
12 if (!values->pid || !values->tid || !values->value)
13 die("failed to allocate read_values threads arrays");
14 values->threads = 0;
15
16 values->counters_max = 16;
17 values->counterrawid = malloc(values->counters_max
18 * sizeof(*values->counterrawid));
19 values->countername = malloc(values->counters_max
20 * sizeof(*values->countername));
21 if (!values->counterrawid || !values->countername)
22 die("failed to allocate read_values counters arrays");
23 values->counters = 0;
24}
25
26void perf_read_values_destroy(struct perf_read_values *values)
27{
28 int i;
29
30 if (!values->threads_max || !values->counters_max)
31 return;
32
33 for (i = 0; i < values->threads; i++)
34 free(values->value[i]);
35 free(values->pid);
36 free(values->tid);
37 free(values->counterrawid);
38 for (i = 0; i < values->counters; i++)
39 free(values->countername[i]);
40 free(values->countername);
41}
42
43static void perf_read_values__enlarge_threads(struct perf_read_values *values)
44{
45 values->threads_max *= 2;
46 values->pid = realloc(values->pid,
47 values->threads_max * sizeof(*values->pid));
48 values->tid = realloc(values->tid,
49 values->threads_max * sizeof(*values->tid));
50 values->value = realloc(values->value,
51 values->threads_max * sizeof(*values->value));
52 if (!values->pid || !values->tid || !values->value)
53 die("failed to enlarge read_values threads arrays");
54}
55
56static int perf_read_values__findnew_thread(struct perf_read_values *values,
57 u32 pid, u32 tid)
58{
59 int i;
60
61 for (i = 0; i < values->threads; i++)
62 if (values->pid[i] == pid && values->tid[i] == tid)
63 return i;
64
65 if (values->threads == values->threads_max)
66 perf_read_values__enlarge_threads(values);
67
68 i = values->threads++;
69 values->pid[i] = pid;
70 values->tid[i] = tid;
71 values->value[i] = malloc(values->counters_max * sizeof(**values->value));
72 if (!values->value[i])
73 die("failed to allocate read_values counters array");
74
75 return i;
76}
77
78static void perf_read_values__enlarge_counters(struct perf_read_values *values)
79{
80 int i;
81
82 values->counters_max *= 2;
83 values->counterrawid = realloc(values->counterrawid,
84 values->counters_max * sizeof(*values->counterrawid));
85 values->countername = realloc(values->countername,
86 values->counters_max * sizeof(*values->countername));
87 if (!values->counterrawid || !values->countername)
88 die("failed to enlarge read_values counters arrays");
89
90 for (i = 0; i < values->threads; i++) {
91 values->value[i] = realloc(values->value[i],
92 values->counters_max * sizeof(**values->value));
93 if (!values->value[i])
94 die("failed to enlarge read_values counters arrays");
95 }
96}
97
98static int perf_read_values__findnew_counter(struct perf_read_values *values,
99 u64 rawid, const char *name)
100{
101 int i;
102
103 for (i = 0; i < values->counters; i++)
104 if (values->counterrawid[i] == rawid)
105 return i;
106
107 if (values->counters == values->counters_max)
108 perf_read_values__enlarge_counters(values);
109
110 i = values->counters++;
111 values->counterrawid[i] = rawid;
112 values->countername[i] = strdup(name);
113
114 return i;
115}
116
117void perf_read_values_add_value(struct perf_read_values *values,
118 u32 pid, u32 tid,
119 u64 rawid, const char *name, u64 value)
120{
121 int tindex, cindex;
122
123 tindex = perf_read_values__findnew_thread(values, pid, tid);
124 cindex = perf_read_values__findnew_counter(values, rawid, name);
125
126 values->value[tindex][cindex] = value;
127}
128
129static void perf_read_values__display_pretty(FILE *fp,
130 struct perf_read_values *values)
131{
132 int i, j;
133 int pidwidth, tidwidth;
134 int *counterwidth;
135
136 counterwidth = malloc(values->counters * sizeof(*counterwidth));
137 if (!counterwidth)
138 die("failed to allocate counterwidth array");
139 tidwidth = 3;
140 pidwidth = 3;
141 for (j = 0; j < values->counters; j++)
142 counterwidth[j] = strlen(values->countername[j]);
143 for (i = 0; i < values->threads; i++) {
144 int width;
145
146 width = snprintf(NULL, 0, "%d", values->pid[i]);
147 if (width > pidwidth)
148 pidwidth = width;
149 width = snprintf(NULL, 0, "%d", values->tid[i]);
150 if (width > tidwidth)
151 tidwidth = width;
152 for (j = 0; j < values->counters; j++) {
153 width = snprintf(NULL, 0, "%Lu", values->value[i][j]);
154 if (width > counterwidth[j])
155 counterwidth[j] = width;
156 }
157 }
158
159 fprintf(fp, "# %*s %*s", pidwidth, "PID", tidwidth, "TID");
160 for (j = 0; j < values->counters; j++)
161 fprintf(fp, " %*s", counterwidth[j], values->countername[j]);
162 fprintf(fp, "\n");
163
164 for (i = 0; i < values->threads; i++) {
165 fprintf(fp, " %*d %*d", pidwidth, values->pid[i],
166 tidwidth, values->tid[i]);
167 for (j = 0; j < values->counters; j++)
168 fprintf(fp, " %*Lu",
169 counterwidth[j], values->value[i][j]);
170 fprintf(fp, "\n");
171 }
172}
173
174static void perf_read_values__display_raw(FILE *fp,
175 struct perf_read_values *values)
176{
177 int width, pidwidth, tidwidth, namewidth, rawwidth, countwidth;
178 int i, j;
179
180 tidwidth = 3; /* TID */
181 pidwidth = 3; /* PID */
182 namewidth = 4; /* "Name" */
183 rawwidth = 3; /* "Raw" */
184 countwidth = 5; /* "Count" */
185
186 for (i = 0; i < values->threads; i++) {
187 width = snprintf(NULL, 0, "%d", values->pid[i]);
188 if (width > pidwidth)
189 pidwidth = width;
190 width = snprintf(NULL, 0, "%d", values->tid[i]);
191 if (width > tidwidth)
192 tidwidth = width;
193 }
194 for (j = 0; j < values->counters; j++) {
195 width = strlen(values->countername[j]);
196 if (width > namewidth)
197 namewidth = width;
198 width = snprintf(NULL, 0, "%llx", values->counterrawid[j]);
199 if (width > rawwidth)
200 rawwidth = width;
201 }
202 for (i = 0; i < values->threads; i++) {
203 for (j = 0; j < values->counters; j++) {
204 width = snprintf(NULL, 0, "%Lu", values->value[i][j]);
205 if (width > countwidth)
206 countwidth = width;
207 }
208 }
209
210 fprintf(fp, "# %*s %*s %*s %*s %*s\n",
211 pidwidth, "PID", tidwidth, "TID",
212 namewidth, "Name", rawwidth, "Raw",
213 countwidth, "Count");
214 for (i = 0; i < values->threads; i++)
215 for (j = 0; j < values->counters; j++)
216 fprintf(fp, " %*d %*d %*s %*llx %*Lu\n",
217 pidwidth, values->pid[i],
218 tidwidth, values->tid[i],
219 namewidth, values->countername[j],
220 rawwidth, values->counterrawid[j],
221 countwidth, values->value[i][j]);
222}
223
224void perf_read_values_display(FILE *fp, struct perf_read_values *values, int raw)
225{
226 if (raw)
227 perf_read_values__display_raw(fp, values);
228 else
229 perf_read_values__display_pretty(fp, values);
230}
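
Both display routines above size their columns with snprintf(NULL, 0, ...), which returns the number of characters a formatted value would occupy without writing anything. The short standalone example below shows the idiom in isolation; the sample data is made up.

#include <stdio.h>

int main(void)
{
	unsigned long long counts[] = { 7, 123456789ULL, 42 };	/* made-up data */
	int i, width = 5;			/* never narrower than "Count" */

	/* measure: snprintf with a NULL buffer only reports the length */
	for (i = 0; i < 3; i++) {
		int w = snprintf(NULL, 0, "%llu", counts[i]);

		if (w > width)
			width = w;
	}

	/* print: every row uses the width of the widest entry */
	printf("# %*s\n", width, "Count");
	for (i = 0; i < 3; i++)
		printf("  %*llu\n", width, counts[i]);
	return 0;
}
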
diff --git a/tools/perf/util/values.h b/tools/perf/util/values.h
new file mode 100644
index 000000000000..cadf8cf2a590
--- /dev/null
+++ b/tools/perf/util/values.h
@@ -0,0 +1,27 @@
1#ifndef _PERF_VALUES_H
2#define _PERF_VALUES_H
3
4#include "types.h"
5
6struct perf_read_values {
7 int threads;
8 int threads_max;
9 u32 *pid, *tid;
10 int counters;
11 int counters_max;
12 u64 *counterrawid;
13 char **countername;
14 u64 **value;
15};
16
17void perf_read_values_init(struct perf_read_values *values);
18void perf_read_values_destroy(struct perf_read_values *values);
19
20void perf_read_values_add_value(struct perf_read_values *values,
21 u32 pid, u32 tid,
22 u64 rawid, const char *name, u64 value);
23
24void perf_read_values_display(FILE *fp, struct perf_read_values *values,
25 int raw);
26
27#endif /* _PERF_VALUES_H */
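
Taken together with util/values.c, this header gives a small accumulate-then-print interface: initialize once, record one value per (thread, counter) pair, then print either the pretty or the raw table. The sketch below is a usage example under the assumption that it is built inside tools/perf (so the u32/u64 typedefs from types.h and die() from util.h are available); the pids, tids, raw ids and counts are invented for illustration.

#include <stdio.h>
#include "util/values.h"

int main(void)
{
	struct perf_read_values values;

	perf_read_values_init(&values);

	/* one slot per (thread, counter) pair; adding again overwrites it */
	perf_read_values_add_value(&values, 1234, 1234, 0x10, "cycles", 100000);
	perf_read_values_add_value(&values, 1234, 1234, 0x11, "instructions", 45000);
	perf_read_values_add_value(&values, 1234, 1235, 0x10, "cycles", 90000);
	perf_read_values_add_value(&values, 1234, 1235, 0x11, "instructions", 41000);

	perf_read_values_display(stdout, &values, 0);	/* pretty table */
	perf_read_values_display(stdout, &values, 1);	/* raw table */

	perf_read_values_destroy(&values);
	return 0;
}

Both the per-thread and per-counter arrays grow by doubling (perf_read_values__enlarge_threads() and _counters() in values.c), so the initial sizes of 16 are only starting points.
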