-rw-r--r--  Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt | 309
-rw-r--r--  Documentation/powerpc/00-INDEX | 2
-rw-r--r--  Documentation/powerpc/pmu-ebb.txt | 137
-rw-r--r--  Documentation/vfio.txt | 63
-rw-r--r--  MAINTAINERS | 8
-rw-r--r--  arch/powerpc/Kconfig | 17
-rw-r--r--  arch/powerpc/Kconfig.debug | 7
-rw-r--r--  arch/powerpc/boot/dts/currituck.dts | 5
-rw-r--r--  arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi | 156
-rw-r--r--  arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi | 45
-rw-r--r--  arch/powerpc/configs/c2k_defconfig | 2
-rw-r--r--  arch/powerpc/configs/g5_defconfig | 2
-rw-r--r--  arch/powerpc/configs/maple_defconfig | 2
-rw-r--r--  arch/powerpc/configs/mpc512x_defconfig | 27
-rw-r--r--  arch/powerpc/configs/mpc85xx_smp_defconfig | 1
-rw-r--r--  arch/powerpc/configs/pmac32_defconfig | 2
-rw-r--r--  arch/powerpc/configs/ppc64_defconfig | 2
-rw-r--r--  arch/powerpc/configs/ppc6xx_defconfig | 2
-rw-r--r--  arch/powerpc/configs/pseries_defconfig | 1
-rw-r--r--  arch/powerpc/include/asm/eeh.h | 36
-rw-r--r--  arch/powerpc/include/asm/eeh_event.h | 2
-rw-r--r--  arch/powerpc/include/asm/exception-64s.h | 8
-rw-r--r--  arch/powerpc/include/asm/hugetlb.h | 8
-rw-r--r--  arch/powerpc/include/asm/iommu.h | 33
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h | 58
-rw-r--r--  arch/powerpc/include/asm/lppaca.h | 3
-rw-r--r--  arch/powerpc/include/asm/machdep.h | 11
-rw-r--r--  arch/powerpc/include/asm/mmu-hash64.h | 14
-rw-r--r--  arch/powerpc/include/asm/mpc5121.h | 1
-rw-r--r--  arch/powerpc/include/asm/mpic.h | 5
-rw-r--r--  arch/powerpc/include/asm/mpic_timer.h | 46
-rw-r--r--  arch/powerpc/include/asm/opal.h | 140
-rw-r--r--  arch/powerpc/include/asm/perf_event_server.h | 6
-rw-r--r--  arch/powerpc/include/asm/pgalloc-64.h | 6
-rw-r--r--  arch/powerpc/include/asm/pgtable-ppc64-64k.h | 3
-rw-r--r--  arch/powerpc/include/asm/pgtable-ppc64.h | 241
-rw-r--r--  arch/powerpc/include/asm/pgtable.h | 6
-rw-r--r--  arch/powerpc/include/asm/probes.h | 25
-rw-r--r--  arch/powerpc/include/asm/processor.h | 16
-rw-r--r--  arch/powerpc/include/asm/reg.h | 9
-rw-r--r--  arch/powerpc/include/asm/rtas.h | 4
-rw-r--r--  arch/powerpc/include/asm/switch_to.h | 14
-rw-r--r--  arch/powerpc/include/asm/tlbflush.h | 3
-rw-r--r--  arch/powerpc/include/asm/vdso.h | 2
-rw-r--r--  arch/powerpc/kernel/Makefile | 4
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 7
-rw-r--r--  arch/powerpc/kernel/cacheinfo.c | 36
-rw-r--r--  arch/powerpc/kernel/eeh.c (renamed from arch/powerpc/platforms/pseries/eeh.c) | 196
-rw-r--r--  arch/powerpc/kernel/eeh_cache.c (renamed from arch/powerpc/platforms/pseries/eeh_cache.c) | 5
-rw-r--r--  arch/powerpc/kernel/eeh_dev.c (renamed from arch/powerpc/platforms/pseries/eeh_dev.c) | 0
-rw-r--r--  arch/powerpc/kernel/eeh_driver.c (renamed from arch/powerpc/platforms/pseries/eeh_driver.c) | 169
-rw-r--r--  arch/powerpc/kernel/eeh_event.c (renamed from arch/powerpc/platforms/pseries/eeh_event.c) | 134
-rw-r--r--  arch/powerpc/kernel/eeh_pe.c (renamed from arch/powerpc/platforms/pseries/eeh_pe.c) | 237
-rw-r--r--  arch/powerpc/kernel/eeh_sysfs.c (renamed from arch/powerpc/platforms/pseries/eeh_sysfs.c) | 1
-rw-r--r--  arch/powerpc/kernel/entry_64.S | 30
-rw-r--r--  arch/powerpc/kernel/exceptions-64s.S | 56
-rw-r--r--  arch/powerpc/kernel/hw_breakpoint.c | 3
-rw-r--r--  arch/powerpc/kernel/idle.c | 4
-rw-r--r--  arch/powerpc/kernel/io-workarounds.c | 11
-rw-r--r--  arch/powerpc/kernel/iommu.c | 323
-rw-r--r--  arch/powerpc/kernel/irq.c | 2
-rw-r--r--  arch/powerpc/kernel/kprobes.c | 20
-rw-r--r--  arch/powerpc/kernel/nvram_64.c | 20
-rw-r--r--  arch/powerpc/kernel/pci-hotplug.c | 111
-rw-r--r--  arch/powerpc/kernel/process.c | 4
-rw-r--r--  arch/powerpc/kernel/prom.c | 42
-rw-r--r--  arch/powerpc/kernel/ptrace.c | 4
-rw-r--r--  arch/powerpc/kernel/reloc_32.S | 3
-rw-r--r--  arch/powerpc/kernel/rtas.c | 4
-rw-r--r--  arch/powerpc/kernel/setup_64.c | 2
-rw-r--r--  arch/powerpc/kernel/signal_32.c | 70
-rw-r--r--  arch/powerpc/kernel/signal_64.c | 8
-rw-r--r--  arch/powerpc/kernel/smp.c | 12
-rw-r--r--  arch/powerpc/kernel/sysfs.c | 6
-rw-r--r--  arch/powerpc/kernel/time.c | 1
-rw-r--r--  arch/powerpc/kernel/tm.S | 18
-rw-r--r--  arch/powerpc/kernel/traps.c | 79
-rw-r--r--  arch/powerpc/kernel/udbg.c | 2
-rw-r--r--  arch/powerpc/kernel/vdso.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_host.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 8
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 14
-rw-r--r--  arch/powerpc/lib/sstep.c | 2
-rw-r--r--  arch/powerpc/math-emu/Makefile | 3
-rw-r--r--  arch/powerpc/math-emu/fre.c | 11
-rw-r--r--  arch/powerpc/math-emu/frsqrtes.c | 11
-rw-r--r--  arch/powerpc/math-emu/math.c | 14
-rw-r--r--  arch/powerpc/mm/44x_mmu.c | 6
-rw-r--r--  arch/powerpc/mm/Makefile | 8
-rw-r--r--  arch/powerpc/mm/gup.c | 18
-rw-r--r--  arch/powerpc/mm/hash_low_64.S | 21
-rw-r--r--  arch/powerpc/mm/hash_native_64.c | 195
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c | 67
-rw-r--r--  arch/powerpc/mm/hugepage-hash64.c | 175
-rw-r--r--  arch/powerpc/mm/hugetlbpage-hash64.c | 2
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c | 299
-rw-r--r--  arch/powerpc/mm/init_64.c | 9
-rw-r--r--  arch/powerpc/mm/mem.c | 4
-rw-r--r--  arch/powerpc/mm/mmap.c (renamed from arch/powerpc/mm/mmap_64.c) | 0
-rw-r--r--  arch/powerpc/mm/mmu_context_nohash.c | 15
-rw-r--r--  arch/powerpc/mm/numa.c | 12
-rw-r--r--  arch/powerpc/mm/pgtable.c | 8
-rw-r--r--  arch/powerpc/mm/pgtable_64.c | 414
-rw-r--r--  arch/powerpc/mm/subpage-prot.c | 48
-rw-r--r--  arch/powerpc/mm/tlb_hash64.c | 36
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c | 2
-rw-r--r--  arch/powerpc/perf/core-book3s.c | 201
-rw-r--r--  arch/powerpc/perf/power8-pmu.c | 62
-rw-r--r--  arch/powerpc/platforms/44x/currituck.c | 43
-rw-r--r--  arch/powerpc/platforms/44x/iss4xx.c | 4
-rw-r--r--  arch/powerpc/platforms/512x/mpc5121_ads.c | 6
-rw-r--r--  arch/powerpc/platforms/512x/mpc512x.h | 12
-rw-r--r--  arch/powerpc/platforms/512x/mpc512x_generic.c | 4
-rw-r--r--  arch/powerpc/platforms/512x/mpc512x_shared.c | 31
-rw-r--r--  arch/powerpc/platforms/512x/pdm360ng.c | 4
-rw-r--r--  arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c | 12
-rw-r--r--  arch/powerpc/platforms/85xx/p5020_ds.c | 5
-rw-r--r--  arch/powerpc/platforms/85xx/p5040_ds.c | 5
-rw-r--r--  arch/powerpc/platforms/85xx/smp.c | 6
-rw-r--r--  arch/powerpc/platforms/85xx/t4240_qds.c | 5
-rw-r--r--  arch/powerpc/platforms/8xx/m8xx_setup.c | 14
-rw-r--r--  arch/powerpc/platforms/Kconfig | 26
-rw-r--r--  arch/powerpc/platforms/Kconfig.cputype | 1
-rw-r--r--  arch/powerpc/platforms/cell/beat_htab.c | 16
-rw-r--r--  arch/powerpc/platforms/cell/smp.c | 2
-rw-r--r--  arch/powerpc/platforms/powermac/smp.c | 2
-rw-r--r--  arch/powerpc/platforms/powernv/Makefile | 1
-rw-r--r--  arch/powerpc/platforms/powernv/eeh-ioda.c | 916
-rw-r--r--  arch/powerpc/platforms/powernv/eeh-powernv.c | 379
-rw-r--r--  arch/powerpc/platforms/powernv/opal-wrappers.S | 3
-rw-r--r--  arch/powerpc/platforms/powernv/opal.c | 69
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda.c | 62
-rw-r--r--  arch/powerpc/platforms/powernv/pci-p5ioc2.c | 11
-rw-r--r--  arch/powerpc/platforms/powernv/pci.c | 139
-rw-r--r--  arch/powerpc/platforms/powernv/pci.h | 35
-rw-r--r--  arch/powerpc/platforms/powernv/setup.c | 4
-rw-r--r--  arch/powerpc/platforms/powernv/smp.c | 4
-rw-r--r--  arch/powerpc/platforms/ps3/htab.c | 5
-rw-r--r--  arch/powerpc/platforms/pseries/Kconfig | 5
-rw-r--r--  arch/powerpc/platforms/pseries/Makefile | 4
-rw-r--r--  arch/powerpc/platforms/pseries/io_event_irq.c | 2
-rw-r--r--  arch/powerpc/platforms/pseries/iommu.c | 4
-rw-r--r--  arch/powerpc/platforms/pseries/lpar.c | 142
-rw-r--r--  arch/powerpc/platforms/pseries/nvram.c | 554
-rw-r--r--  arch/powerpc/platforms/pseries/pci_dlpar.c | 85
-rw-r--r--  arch/powerpc/platforms/pseries/ras.c | 8
-rw-r--r--  arch/powerpc/platforms/pseries/smp.c | 2
-rw-r--r--  arch/powerpc/sysdev/Makefile | 2
-rw-r--r--  arch/powerpc/sysdev/cpm1.c | 1
-rw-r--r--  arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c | 161
-rw-r--r--  arch/powerpc/sysdev/mpic.c | 58
-rw-r--r--  arch/powerpc/sysdev/mpic_timer.c | 593
-rw-r--r--  arch/s390/include/asm/pgtable.h | 5
-rw-r--r--  arch/s390/mm/pgtable.c | 5
-rw-r--r--  arch/sparc/include/asm/pgtable_64.h | 5
-rw-r--r--  arch/sparc/mm/tlb.c | 5
-rw-r--r--  drivers/acpi/apei/erst.c | 4
-rw-r--r--  drivers/firmware/efi/efi-pstore.c | 2
-rw-r--r--  drivers/i2c/busses/i2c-cpm.c | 8
-rw-r--r--  drivers/iommu/Kconfig | 8
-rw-r--r--  drivers/macintosh/adb.c | 2
-rw-r--r--  drivers/macintosh/mac_hid.c | 8
-rw-r--r--  drivers/macintosh/via-cuda.c | 2
-rw-r--r--  drivers/macintosh/windfarm_pm121.c | 6
-rw-r--r--  drivers/macintosh/windfarm_pm81.c | 6
-rw-r--r--  drivers/macintosh/windfarm_pm91.c | 6
-rw-r--r--  drivers/macintosh/windfarm_smu_sat.c | 1
-rw-r--r--  drivers/vfio/Kconfig | 6
-rw-r--r--  drivers/vfio/Makefile | 1
-rw-r--r--  drivers/vfio/vfio.c | 1
-rw-r--r--  drivers/vfio/vfio_iommu_spapr_tce.c | 377
-rw-r--r--  drivers/watchdog/booke_wdt.c | 8
-rw-r--r--  fs/pstore/ftrace.c | 2
-rw-r--r--  fs/pstore/inode.c | 9
-rw-r--r--  fs/pstore/platform.c | 10
-rw-r--r--  fs/pstore/ram.c | 3
-rw-r--r--  include/asm-generic/pgtable.h | 5
-rw-r--r--  include/linux/huge_mm.h | 6
-rw-r--r--  include/linux/pstore.h | 12
-rw-r--r--  include/uapi/linux/vfio.h | 34
-rw-r--r--  mm/huge_memory.c | 28
-rw-r--r--  mm/pgtable-generic.c | 5
182 files changed, 7633 insertions(+), 1177 deletions(-)
diff --git a/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt b/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt
new file mode 100644
index 000000000000..641bc13983e1
--- /dev/null
+++ b/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt
@@ -0,0 +1,309 @@
+===============================================================================
+Freescale Interlaken Look-Aside Controller Device Bindings
+Copyright 2012 Freescale Semiconductor Inc.
+
+CONTENTS
+ - Interlaken Look-Aside Controller (LAC) Node
+ - Example LAC Node
+ - Interlaken Look-Aside Controller (LAC) Software Portal Node
+ - Interlaken Look-Aside Controller (LAC) Software Portal Child Nodes
+ - Example LAC SWP Node with Child Nodes
+
+===============================================================================
+Interlaken Look-Aside Controller (LAC) Node
+
+DESCRIPTION
+
+Interlaken is a narrow, high-speed, channelized chip-to-chip interface. To
+facilitate interoperability between a data path device and a look-aside
+co-processor, the Interlaken Look-Aside protocol is defined for short
+transaction-related transfers. Although based on the Interlaken protocol,
+Interlaken Look-Aside is not directly compatible with Interlaken and can be
+considered a different operation mode.
+
+The Interlaken LA controller connects the internal platform to the Interlaken
+serial interface. It accepts LA commands through software portals, which are
+system memory mapped 4KB spaces. The LA commands are then translated into
+Interlaken control words and data words, which are sent on the TX side to the
+TCAM through SerDes lanes.
+
+There are two 4KB spaces defined within the LAC global register memory map.
+There is a full register set at 0x0000-0x0FFF (also known as the "hypervisor"
+version), and a subset at 0x1000-0x1FFF. The former is a superset of the
+latter, and includes certain registers that should not be accessible to
+partitioned software. Separate nodes are used for each region, with a phandle
+linking the hypervisor node to the normal operating node.
+
+PROPERTIES
+
+ - compatible
+	Usage: required
+	Value type: <string>
+	Definition: Must include "fsl,interlaken-lac". This represents only
+		those LAC CCSR registers not protected in partitioned
+		software. The version of the device is determined by the LAC
+		IP Block Revision Register (IPBRR0) at offset 0x0BF8.
+
+		Table of correspondences between IPBRR0 values and example
+		chips:
+			Value		Device
+			-----------	-------
+			0x02000100	T4240
+
+		The hypervisor node has a different compatible. It must include
+		"fsl,interlaken-lac-hv". This node represents the protected
+		LAC register space and is required except inside a partition
+		where access to the hypervisor node is to be denied.
+
+ - fsl,non-hv-node
+	Usage: required in "fsl,interlaken-lac-hv"
+	Value type: <phandle>
+	Definition: Points to the non-protected LAC CCSR mapped register space
+		node.
+
+ - reg
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: A standard property. The first resource represents the
+		Interlaken LAC configuration registers.
+
+ - interrupts
+	Usage: required in non-hv node only
+	Value type: <prop-encoded-array>
+	Definition: Interrupt mapping for the Interlaken LAC error IRQ.
+
+EXAMPLE
+	lac: lac@229000 {
+		compatible = "fsl,interlaken-lac";
+		reg = <0x229000 0x1000>;
+		interrupts = <16 2 1 18>;
+	};
+
+	lac-hv@228000 {
+		compatible = "fsl,interlaken-lac-hv";
+		reg = <0x228000 0x1000>;
+		fsl,non-hv-node = <&lac>;
+	};
+
+===============================================================================
+Interlaken Look-Aside Controller (LAC) Software Portal Container Node
+
+DESCRIPTION
+The Interlaken Look-Aside Controller (LAC) utilizes Software Portals to accept
+Interlaken Look-Aside (ILA) commands. The Interlaken LAC software portal
+memory map occupies 128KB of memory space. The software portal memory space is
+intended to be cache-enabled. WIMG for each software space is required to be
+0010 if stashing is enabled; otherwise, WIMG can be 0000 or 0010.
+
+PROPERTIES
+
+ - #address-cells
+	Usage: required
+	Value type: <u32>
+	Definition: A standard property. Must have a value of 1.
+
+ - #size-cells
+	Usage: required
+	Value type: <u32>
+	Definition: A standard property. Must have a value of 1.
+
+ - compatible
+	Usage: required
+	Value type: <string>
+	Definition: Must include "fsl,interlaken-lac-portals"
+
+ - ranges
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: A standard property. Specifies the address and length
+		of the LAC portal memory space.
+
+===============================================================================
+Interlaken Look-Aside Controller (LAC) Software Portals Child Nodes
+
+DESCRIPTION
+There are up to 24 available software portals with each software portal
+requiring 4KB of consecutive memory within the software portal memory mapped
+space.
+
+PROPERTIES
+
+ - compatible
+	Usage: required
+	Value type: <string>
+	Definition: Must include "fsl,interlaken-lac-portal-vX.Y" where X is
+		the Major version (IP_MJ) found in the LAC IP Block Revision
+		Register (IPBRR0), at offset 0x0BF8, and Y is the Minor version
+		(IP_MN).
+
+		Table of correspondences between version values and example chips:
+			Value	Device
+			------	-------
+			1.0	T4240
+
+ - reg
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: A standard property. The first resource represents the
+		Interlaken LAC software portal registers.
+
+ - fsl,liodn
+	Value type: <u32>
+	Definition: The logical I/O device number (LIODN) for this device. The
+		LIODN is a number expressed by this device and used to perform
+		look-ups in the IOMMU (PAMU) address table when performing
+		DMAs. This property is automatically added by u-boot.
+
+===============================================================================
+EXAMPLE
+
+lac-portals {
+	#address-cells = <0x1>;
+	#size-cells = <0x1>;
+	compatible = "fsl,interlaken-lac-portals";
+	ranges = <0x0 0xf 0xf4400000 0x20000>;
+
+	lportal0: lac-portal@0 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x204>;
+		reg = <0x0 0x1000>;
+	};
+
+	lportal1: lac-portal@1000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x205>;
+		reg = <0x1000 0x1000>;
+	};
+
+	lportal2: lac-portal@2000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x206>;
+		reg = <0x2000 0x1000>;
+	};
+
+	lportal3: lac-portal@3000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x207>;
+		reg = <0x3000 0x1000>;
+	};
+
+	lportal4: lac-portal@4000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x208>;
+		reg = <0x4000 0x1000>;
+	};
+
+	lportal5: lac-portal@5000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x209>;
+		reg = <0x5000 0x1000>;
+	};
+
+	lportal6: lac-portal@6000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20A>;
+		reg = <0x6000 0x1000>;
+	};
+
+	lportal7: lac-portal@7000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20B>;
+		reg = <0x7000 0x1000>;
+	};
+
+	lportal8: lac-portal@8000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20C>;
+		reg = <0x8000 0x1000>;
+	};
+
+	lportal9: lac-portal@9000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20D>;
+		reg = <0x9000 0x1000>;
+	};
+
+	lportal10: lac-portal@A000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20E>;
+		reg = <0xA000 0x1000>;
+	};
+
+	lportal11: lac-portal@B000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20F>;
+		reg = <0xB000 0x1000>;
+	};
+
+	lportal12: lac-portal@C000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x210>;
+		reg = <0xC000 0x1000>;
+	};
+
+	lportal13: lac-portal@D000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x211>;
+		reg = <0xD000 0x1000>;
+	};
+
+	lportal14: lac-portal@E000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x212>;
+		reg = <0xE000 0x1000>;
+	};
+
+	lportal15: lac-portal@F000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x213>;
+		reg = <0xF000 0x1000>;
+	};
+
+	lportal16: lac-portal@10000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x214>;
+		reg = <0x10000 0x1000>;
+	};
+
+	lportal17: lac-portal@11000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x215>;
+		reg = <0x11000 0x1000>;
+	};
+
+	lportal18: lac-portal@12000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x216>;
+		reg = <0x12000 0x1000>;
+	};
+
+	lportal19: lac-portal@13000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x217>;
+		reg = <0x13000 0x1000>;
+	};
+
+	lportal20: lac-portal@14000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x218>;
+		reg = <0x14000 0x1000>;
+	};
+
+	lportal21: lac-portal@15000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x219>;
+		reg = <0x15000 0x1000>;
+	};
+
+	lportal22: lac-portal@16000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x21A>;
+		reg = <0x16000 0x1000>;
+	};
+
+	lportal23: lac-portal@17000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x21B>;
+		reg = <0x17000 0x1000>;
+	};
+};
diff --git a/Documentation/powerpc/00-INDEX b/Documentation/powerpc/00-INDEX
index dd9e92802ec0..05026ce1875e 100644
--- a/Documentation/powerpc/00-INDEX
+++ b/Documentation/powerpc/00-INDEX
@@ -14,6 +14,8 @@ hvcs.txt
 	- IBM "Hypervisor Virtual Console Server" Installation Guide
 mpc52xx.txt
 	- Linux 2.6.x on MPC52xx family
+pmu-ebb.txt
+	- Description of the API for using the PMU with Event Based Branches.
 qe_firmware.txt
 	- describes the layout of firmware binaries for the Freescale QUICC
 	  Engine and the code that parses and uploads the microcode therein.
diff --git a/Documentation/powerpc/pmu-ebb.txt b/Documentation/powerpc/pmu-ebb.txt
new file mode 100644
index 000000000000..73cd163dbfb8
--- /dev/null
+++ b/Documentation/powerpc/pmu-ebb.txt
@@ -0,0 +1,137 @@
+PMU Event Based Branches
+========================
+
+Event Based Branches (EBBs) are a feature which allows the hardware to
+branch directly to a specified user space address when certain events occur.
+
+The full specification is available in Power ISA v2.07:
+
+  https://www.power.org/documentation/power-isa-version-2-07/
+
+One type of event for which EBBs can be configured is PMU exceptions. This
+document describes the API for configuring the Power PMU to generate EBBs,
+using the Linux perf_events API.
+
+
+Terminology
+-----------
+
+Throughout this document we will refer to an "EBB event" or "EBB events". This
+just refers to a struct perf_event which has set the "EBB" flag in its
+attr.config. All events which can be configured on the hardware PMU are
+possible "EBB events".
+
+
+Background
+----------
+
+When a PMU EBB occurs it is delivered to the currently running process. As
+such, EBBs can only sensibly be used by programs for self-monitoring.
+
+It is a feature of the perf_events API that events can be created on other
+processes, subject to standard permission checks. This is also true of EBB
+events; however, unless the target process enables EBBs (via mtspr(BESCR)) no
+EBBs will ever be delivered.
+
+This makes it possible for a process to enable EBBs for itself, but not
+actually configure any events. At a later time another process can come along
+and attach an EBB event to the process, which will then cause EBBs to be
+delivered to the first process. It's not clear if this is actually useful.
+
+
+When the PMU is configured for EBBs, all PMU interrupts are delivered to the
+user process. This means once an EBB event is scheduled on the PMU, no non-EBB
+events can be configured, and so EBB events can not run concurrently with
+regular 'perf' commands, or any other perf events.
+
+It is however safe to run 'perf' commands on a process which is using EBBs. The
+kernel will in general schedule the EBB event, and perf will be notified that
+its events could not run.
+
+The exclusion between EBB events and regular events is implemented using the
+existing "pinned" and "exclusive" attributes of perf_events. This means EBB
+events will be given priority over other events, unless they are also pinned.
+If an EBB event and a regular event are both pinned, then whichever is enabled
+first will be scheduled and the other will be put in error state. See the
+section below titled "Enabling an EBB event" for more information.
+
+
+Creating an EBB event
+---------------------
+
+To request that an event is counted using EBB, the event code should have bit
+63 set.
+
+EBB events must be created with a particular, and restrictive, set of
+attributes - this is so that they interoperate correctly with the rest of the
+perf_events subsystem.
+
+An EBB event must be created with the "pinned" and "exclusive" attributes set.
+Note that if you are creating a group of EBB events, only the leader can have
+these attributes set.
+
+An EBB event must NOT set any of the "inherit", "sample_period", "freq" or
+"enable_on_exec" attributes.
+
+An EBB event must be attached to a task. This is specified to perf_event_open()
+by passing a pid value, typically 0 indicating the current task.
+
+All events in a group must agree on whether they want EBB. That is, all events
+must request EBB, or none may request EBB.
+
+EBB events must specify the PMC they are to be counted on. This ensures
+userspace is able to reliably determine which PMC the event is scheduled on.
+
+
+Enabling an EBB event
+---------------------
+
+Once an EBB event has been successfully opened, it must be enabled with the
+perf_events API. This can be achieved either via the ioctl() interface, or the
+prctl() interface.
+
+However, due to the design of the perf_events API, enabling an event does not
+guarantee that it has been scheduled on the PMU. To ensure that the EBB event
+has been scheduled on the PMU, you must perform a read() on the event. If the
+read() returns EOF, then the event has not been scheduled and EBBs are not
+enabled.
+
+This behaviour occurs because the EBB event is pinned and exclusive. When the
+EBB event is enabled it will force all other non-pinned events off the PMU. In
+this case the enable will be successful. However if there is already an event
+pinned on the PMU then the enable will not be successful.
+
+
+Reading an EBB event
+--------------------
+
+It is possible to read() from an EBB event. However the results are
+meaningless. Because interrupts are being delivered to the user process the
+kernel is not able to count the event, and so will return a junk value.
+
+
+Closing an EBB event
+--------------------
+
+When an EBB event is finished with, you can close it using close() as for any
+regular event. If this is the last EBB event the PMU will be deconfigured and
+no further PMU EBBs will be delivered.
+
+
+EBB Handler
+-----------
+
+The EBB handler is just regular userspace code, however it must be written in
+the style of an interrupt handler. When the handler is entered all registers
+are live (possibly) and so must be saved somehow before the handler can invoke
+other code.
+
+It's up to the program how to handle this. For C programs a relatively simple
+option is to create an interrupt frame on the stack and save registers there.
+
+Fork
+----
+
+EBB events are not inherited across fork. If the child process wishes to use
+EBBs it should open a new event for itself. Similarly the EBB state in
+BESCR/EBBHR/EBBRR is cleared across fork().
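
A minimal userspace sketch of the event-creation rules above (pinned and
exclusive set, attached to the current task via pid 0, scheduling confirmed
with a read()). The raw event code 0x1001e is a hypothetical placeholder; the
exact config encoding, beyond the bit-63 EBB flag the document describes, is
PMU-specific:

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	int main(void)
	{
		struct perf_event_attr attr;
		unsigned long long count;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_RAW;
		/* hypothetical event code; bit 63 requests EBB counting */
		attr.config = 0x1001eULL | (1ULL << 63);
		attr.pinned = 1;	/* required for EBB events */
		attr.exclusive = 1;	/* required for EBB events */
		/* inherit, sample_period, freq, enable_on_exec stay 0 */

		/* pid = 0: attach to the current task, as required */
		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0)
			return 1;

		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

		/* read() returning 0 (EOF) means the event was not
		 * scheduled on the PMU, so EBBs are not enabled */
		if (read(fd, &count, sizeof(count)) <= 0)
			fprintf(stderr, "EBB event not scheduled\n");
		return 0;
	}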
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 8eda3635a17d..c55533c0adb3 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap
 interfaces implement the device region access defined by the device's
 own VFIO_DEVICE_GET_REGION_INFO ioctl.
 
+
+PPC64 sPAPR implementation note
+-------------------------------------------------------------------------------
+
+This implementation has some specifics:
+
+1) Only one IOMMU group per container is supported, as an IOMMU group
+represents the minimal entity for which isolation can be guaranteed, and
+groups are allocated statically, one per Partitionable Endpoint (PE)
+(a PE is often a PCI domain, but not always).
+
+2) The hardware supports so-called DMA windows - the PCI address range
+within which DMA transfer is allowed; any attempt to access address space
+outside the window leads to isolation of the whole PE.
+
+3) PPC64 guests are paravirtualized but not fully emulated. There is an API
+to map/unmap pages for DMA, and it normally maps 1..32 pages per call and
+currently there is no way to reduce the number of calls. To make things
+faster, the map/unmap handling has been implemented in real mode, which
+provides excellent performance but has limitations, such as the inability
+to do locked pages accounting in real time.
+
+So 3 additional ioctls have been added:
+
+	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
+		of the DMA window on the PCI bus.
+
+	VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting
+		is done at this point. This lets the user learn what the
+		DMA window is first and adjust rlimit before doing any real job.
+
+	VFIO_IOMMU_DISABLE - disables the container.
+
+
+The code flow from the example above should be slightly changed:
+
+	.....
+	/* Add the group to the container */
+	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
+
+	/* Enable the IOMMU model we want */
+	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
+
+	/* Get additional sPAPR IOMMU info */
+	struct vfio_iommu_spapr_tce_info spapr_iommu_info;
+	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info);
+
+	if (ioctl(container, VFIO_IOMMU_ENABLE))
+		/* Cannot enable container, maybe low rlimit */
+
+	/* Allocate some space and setup a DMA mapping */
+	dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+
+	dma_map.size = 1024 * 1024;
+	dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
+	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+	/* Check here if .iova/.size are within the DMA window from spapr_iommu_info */
+
+	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
+	.....
+
 -------------------------------------------------------------------------------
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
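
Continuing the fragment above, the window check before mapping might look like
this sketch (the struct layout follows the uapi additions in this series;
treat the exact field names as an assumption when working against a different
kernel):

	struct vfio_iommu_spapr_tce_info info;

	memset(&info, 0, sizeof(info));
	info.argsz = sizeof(info);
	if (ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info) == 0)
		printf("DMA window: start=0x%x size=0x%x\n",
		       info.dma32_window_start, info.dma32_window_size);

	/* dma_map.iova and dma_map.size must then stay within
	 * [dma32_window_start, dma32_window_start + dma32_window_size) */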
diff --git a/MAINTAINERS b/MAINTAINERS
index 3f7710151a75..65730838dca4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3123,6 +3123,13 @@ M: Maxim Levitsky <maximlevitsky@gmail.com>
 S:	Maintained
 F:	drivers/media/rc/ene_ir.*
 
+ENHANCED ERROR HANDLING (EEH)
+M:	Gavin Shan <shangw@linux.vnet.ibm.com>
+L:	linuxppc-dev@lists.ozlabs.org
+S:	Supported
+F:	Documentation/powerpc/eeh-pci-error-recovery.txt
+F:	arch/powerpc/kernel/eeh*.c
+
 EPSON S1D13XXX FRAMEBUFFER DRIVER
 M:	Kristoffer Ericson <kristoffer.ericson@gmail.com>
 S:	Maintained
@@ -6192,7 +6199,6 @@ M: Linas Vepstas <linasvepstas@gmail.com>
 L:	linux-pci@vger.kernel.org
 S:	Supported
 F:	Documentation/PCI/pci-error-recovery.txt
-F:	Documentation/powerpc/eeh-pci-error-recovery.txt
 
 PCI SUBSYSTEM
 M:	Bjorn Helgaas <bhelgaas@google.com>
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 508e3fe934d2..1022e7b675c2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -298,7 +298,7 @@ config HUGETLB_PAGE_SIZE_VARIABLE
 
 config MATH_EMULATION
 	bool "Math emulation"
-	depends on 4xx || 8xx || E200 || PPC_MPC832x || E500
+	depends on 4xx || 8xx || PPC_MPC832x || BOOKE
 	---help---
 	  Some PowerPC chips designed for embedded applications do not have
 	  a floating-point unit and therefore do not implement the
@@ -307,6 +307,10 @@ config MATH_EMULATION
 	  unit, which will allow programs that use floating-point
 	  instructions to run.
 
+	  This is also useful to emulate missing (optional) instructions
+	  such as fsqrt on cores that do have an FPU but do not implement
+	  them (such as Freescale BookE).
+
 config PPC_TRANSACTIONAL_MEM
 	bool "Transactional Memory support for POWERPC"
 	depends on PPC_BOOK3S_64
@@ -315,17 +319,6 @@ config PPC_TRANSACTIONAL_MEM
 	---help---
 	  Support user-mode Transactional Memory on POWERPC.
 
-config 8XX_MINIMAL_FPEMU
-	bool "Minimal math emulation for 8xx"
-	depends on 8xx && !MATH_EMULATION
-	help
-	  Older arch/ppc kernels still emulated a few floating point
-	  instructions such as load and store, even when full math
-	  emulation is disabled. Say "Y" here if you want to preserve
-	  this behavior.
-
-	  It is recommended that you build a soft-float userspace instead.
-
 config IOMMU_HELPER
 	def_bool PPC64
 
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 863d877e0b5f..d86875f3e17e 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -147,6 +147,13 @@ choice
 	  enable debugging for the wrong type of machine your kernel
 	  _will not boot_.
 
+config PPC_EARLY_DEBUG_BOOTX
+	bool "BootX or OpenFirmware"
+	depends on BOOTX_TEXT
+	help
+	  Select this to enable early debugging for a machine using BootX
+	  or OpenFirmware.
+
 config PPC_EARLY_DEBUG_LPAR
 	bool "LPAR HV Console"
 	depends on PPC_PSERIES
diff --git a/arch/powerpc/boot/dts/currituck.dts b/arch/powerpc/boot/dts/currituck.dts
index b801dd06e573..d2c8a872308e 100644
--- a/arch/powerpc/boot/dts/currituck.dts
+++ b/arch/powerpc/boot/dts/currituck.dts
@@ -103,6 +103,11 @@
 		interrupts = <34 2>;
 	};
 
+	FPGA0: fpga@50000000 {
+		compatible = "ibm,currituck-fpga";
+		reg = <0x50000000 0x4>;
+	};
+
 	IIC0: i2c@00000000 {
 		compatible = "ibm,iic-currituck", "ibm,iic";
 		reg = <0x0 0x00000014>;
diff --git a/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi b/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi
new file mode 100644
index 000000000000..9cffccf4e07e
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi
@@ -0,0 +1,156 @@
+/* T4240 Interlaken LAC Portal device tree stub with 24 portals.
+ *
+ * Copyright 2012 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#address-cells = <0x1>;
+#size-cells = <0x1>;
+compatible = "fsl,interlaken-lac-portals";
+
+lportal0: lac-portal@0 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x0 0x1000>;
+};
+
+lportal1: lac-portal@1000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x1000 0x1000>;
+};
+
+lportal2: lac-portal@2000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x2000 0x1000>;
+};
+
+lportal3: lac-portal@3000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x3000 0x1000>;
+};
+
+lportal4: lac-portal@4000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x4000 0x1000>;
+};
+
+lportal5: lac-portal@5000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x5000 0x1000>;
+};
+
+lportal6: lac-portal@6000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x6000 0x1000>;
+};
+
+lportal7: lac-portal@7000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x7000 0x1000>;
+};
+
+lportal8: lac-portal@8000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x8000 0x1000>;
+};
+
+lportal9: lac-portal@9000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x9000 0x1000>;
+};
+
+lportal10: lac-portal@A000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xA000 0x1000>;
+};
+
+lportal11: lac-portal@B000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xB000 0x1000>;
+};
+
+lportal12: lac-portal@C000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xC000 0x1000>;
+};
+
+lportal13: lac-portal@D000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xD000 0x1000>;
+};
+
+lportal14: lac-portal@E000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xE000 0x1000>;
+};
+
+lportal15: lac-portal@F000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xF000 0x1000>;
+};
+
+lportal16: lac-portal@10000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x10000 0x1000>;
+};
+
+lportal17: lac-portal@11000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x11000 0x1000>;
+};
+
+lportal18: lac-portal@12000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x12000 0x1000>;
+};
+
+lportal19: lac-portal@13000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x13000 0x1000>;
+};
+
+lportal20: lac-portal@14000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x14000 0x1000>;
+};
+
+lportal21: lac-portal@15000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x15000 0x1000>;
+};
+
+lportal22: lac-portal@16000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x16000 0x1000>;
+};
+
+lportal23: lac-portal@17000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x17000 0x1000>;
+};
diff --git a/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi b/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi
new file mode 100644
index 000000000000..e8208720ac0e
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi
@@ -0,0 +1,45 @@
+/*
+ * T4 Interlaken Look-aside Controller (LAC) device tree stub
+ *
+ * Copyright 2012 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+lac: lac@229000 {
+	compatible = "fsl,interlaken-lac";
+	reg = <0x229000 0x1000>;
+	interrupts = <16 2 1 18>;
+};
+
+lac-hv@228000 {
+	compatible = "fsl,interlaken-lac-hv";
+	reg = <0x228000 0x1000>;
+	fsl,non-hv-node = <&lac>;
+};
diff --git a/arch/powerpc/configs/c2k_defconfig b/arch/powerpc/configs/c2k_defconfig
index 2a84fd7f631c..671a8f960afa 100644
--- a/arch/powerpc/configs/c2k_defconfig
+++ b/arch/powerpc/configs/c2k_defconfig
@@ -423,6 +423,8 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_KEYS=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig
index 07b7f2af2dca..1ea22fc24ea8 100644
--- a/arch/powerpc/configs/g5_defconfig
+++ b/arch/powerpc/configs/g5_defconfig
@@ -284,6 +284,8 @@ CONFIG_DEBUG_MUTEXES=y
 CONFIG_LATENCYTOP=y
 CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_ECB=m
diff --git a/arch/powerpc/configs/maple_defconfig b/arch/powerpc/configs/maple_defconfig
index 02ac96b679b8..2a5afac29861 100644
--- a/arch/powerpc/configs/maple_defconfig
+++ b/arch/powerpc/configs/maple_defconfig
@@ -138,6 +138,8 @@ CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_XMON=y
 CONFIG_XMON_DEFAULT=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_PCBC=m
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/powerpc/configs/mpc512x_defconfig b/arch/powerpc/configs/mpc512x_defconfig
index 0d0d981442fd..ee853a1b1b2c 100644
--- a/arch/powerpc/configs/mpc512x_defconfig
+++ b/arch/powerpc/configs/mpc512x_defconfig
@@ -1,7 +1,6 @@
-CONFIG_EXPERIMENTAL=y
 # CONFIG_SWAP is not set
 CONFIG_SYSVIPC=y
-CONFIG_SPARSE_IRQ=y
+CONFIG_NO_HZ=y
 CONFIG_LOG_BUF_SHIFT=16
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_COMPAT_BRK is not set
@@ -9,6 +8,7 @@ CONFIG_SLAB=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
+CONFIG_PARTITION_ADVANCED=y
 # CONFIG_IOSCHED_CFQ is not set
 # CONFIG_PPC_CHRP is not set
 CONFIG_PPC_MPC512x=y
@@ -16,9 +16,7 @@ CONFIG_MPC5121_ADS=y
 CONFIG_MPC512x_GENERIC=y
 CONFIG_PDM360NG=y
 # CONFIG_PPC_PMAC is not set
-CONFIG_NO_HZ=y
 CONFIG_HZ_1000=y
-# CONFIG_MIGRATION is not set
 # CONFIG_SECCOMP is not set
 # CONFIG_PCI is not set
 CONFIG_NET=y
@@ -33,8 +31,6 @@ CONFIG_IP_PNP=y
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 CONFIG_CAN=y
-CONFIG_CAN_RAW=y
-CONFIG_CAN_BCM=y
 CONFIG_CAN_VCAN=y
 CONFIG_CAN_MSCAN=y
 CONFIG_CAN_DEBUG_DEVICES=y
@@ -46,7 +42,6 @@ CONFIG_DEVTMPFS_MOUNT=y
 # CONFIG_FIRMWARE_IN_KERNEL is not set
 CONFIG_MTD=y
 CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
 CONFIG_MTD_BLOCK=y
 CONFIG_MTD_CFI=y
 CONFIG_MTD_CFI_AMDSTD=y
@@ -60,7 +55,6 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_COUNT=1
 CONFIG_BLK_DEV_RAM_SIZE=8192
 CONFIG_BLK_DEV_XIP=y
-CONFIG_MISC_DEVICES=y
 CONFIG_EEPROM_AT24=y
 CONFIG_EEPROM_AT25=y
 CONFIG_SCSI=y
@@ -68,6 +62,7 @@ CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_SG=y
 CONFIG_NETDEVICES=y
+CONFIG_FS_ENET=y
 CONFIG_MARVELL_PHY=y
 CONFIG_DAVICOM_PHY=y
 CONFIG_QSEMI_PHY=y
@@ -83,10 +78,6 @@ CONFIG_STE10XP=y
 CONFIG_LSI_ET1011C_PHY=y
 CONFIG_FIXED_PHY=y
 CONFIG_MDIO_BITBANG=y
-CONFIG_NET_ETHERNET=y
-CONFIG_FS_ENET=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
 # CONFIG_WLAN is not set
 # CONFIG_INPUT_MOUSEDEV_PSAUX is not set
 CONFIG_INPUT_EVDEV=y
@@ -106,14 +97,18 @@ CONFIG_GPIO_SYSFS=y
 CONFIG_GPIO_MPC8XXX=y
 # CONFIG_HWMON is not set
 CONFIG_MEDIA_SUPPORT=y
-CONFIG_VIDEO_DEV=y
 CONFIG_VIDEO_ADV_DEBUG=y
-# CONFIG_VIDEO_HELPER_CHIPS_AUTO is not set
-CONFIG_VIDEO_SAA711X=y
 CONFIG_FB=y
 CONFIG_FB_FSL_DIU=y
 # CONFIG_VGA_CONSOLE is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_USB=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_EHCI_FSL=y
+# CONFIG_USB_EHCI_HCD_PPC_OF is not set
+CONFIG_USB_STORAGE=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_FSL_USB2=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_M41T80=y
 CONFIG_RTC_DRV_MPC5121=y
@@ -129,9 +124,7 @@ CONFIG_TMPFS=y
 CONFIG_JFFS2_FS=y
 CONFIG_UBIFS_FS=y
 CONFIG_NFS_FS=y
-CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
-CONFIG_PARTITION_ADVANCED=y
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
diff --git a/arch/powerpc/configs/mpc85xx_smp_defconfig b/arch/powerpc/configs/mpc85xx_smp_defconfig
index 165e6b32baef..152fa05b15e4 100644
--- a/arch/powerpc/configs/mpc85xx_smp_defconfig
+++ b/arch/powerpc/configs/mpc85xx_smp_defconfig
@@ -131,6 +131,7 @@ CONFIG_DUMMY=y
 CONFIG_FS_ENET=y
 CONFIG_UCC_GETH=y
 CONFIG_GIANFAR=y
+CONFIG_E1000E=y
 CONFIG_MARVELL_PHY=y
 CONFIG_DAVICOM_PHY=y
 CONFIG_CICADA_PHY=y
diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig
index 29767a8dfea5..a73626b09051 100644
--- a/arch/powerpc/configs/pmac32_defconfig
+++ b/arch/powerpc/configs/pmac32_defconfig
@@ -350,6 +350,8 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_XMON=y
 CONFIG_XMON_DEFAULT=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_MD4=m
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index aef3f71de5ad..c86fcb92358e 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -398,6 +398,8 @@ CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
 CONFIG_XMON=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index be1cb6ea3a36..20ebfaf7234b 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -1264,6 +1264,8 @@ CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_XMON=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_KEYS=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index c4dfbaf8b192..bea8587c3af5 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -296,6 +296,7 @@ CONFIG_SQUASHFS=m
 CONFIG_SQUASHFS_XATTR=y
 CONFIG_SQUASHFS_LZO=y
 CONFIG_SQUASHFS_XZ=y
+CONFIG_PSTORE=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3_ACL=y
 CONFIG_NFS_V4=y
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index a80e32b46c11..09a8743143f3 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -24,6 +24,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/string.h>
+#include <linux/time.h>
 
 struct pci_dev;
 struct pci_bus;
@@ -52,6 +53,7 @@ struct device_node;
 
 #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE */
 #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE */
+#define EEH_PE_PHB_DEAD		(1 << 2)	/* Dead PHB */
 
 struct eeh_pe {
 	int type;			/* PE type: PHB/Bus/Device */
@@ -59,8 +61,10 @@ struct eeh_pe {
 	int config_addr;		/* Traditional PCI address */
 	int addr;			/* PE configuration address */
 	struct pci_controller *phb;	/* Associated PHB */
+	struct pci_bus *bus;		/* Top PCI bus for bus PE */
 	int check_count;		/* Times of ignored error */
 	int freeze_count;		/* Times of froze up */
+	struct timeval tstamp;		/* Time on first-time freeze */
 	int false_positives;		/* Times of reported #ff's */
 	struct eeh_pe *parent;		/* Parent PE */
 	struct list_head child_list;	/* Link PE to the child list */
@@ -95,12 +99,12 @@ struct eeh_dev {
 
 static inline struct device_node *eeh_dev_to_of_node(struct eeh_dev *edev)
 {
-	return edev->dn;
+	return edev ? edev->dn : NULL;
 }
 
 static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 {
-	return edev->pdev;
+	return edev ? edev->pdev : NULL;
 }
 
 /*
@@ -130,8 +134,9 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 struct eeh_ops {
 	char *name;
 	int (*init)(void);
+	int (*post_init)(void);
 	void* (*of_probe)(struct device_node *dn, void *flag);
-	void* (*dev_probe)(struct pci_dev *dev, void *flag);
+	int (*dev_probe)(struct pci_dev *dev, void *flag);
 	int (*set_option)(struct eeh_pe *pe, int option);
 	int (*get_pe_addr)(struct eeh_pe *pe);
 	int (*get_state)(struct eeh_pe *pe, int *state);
@@ -141,11 +146,12 @@ struct eeh_ops {
 	int (*configure_bridge)(struct eeh_pe *pe);
 	int (*read_config)(struct device_node *dn, int where, int size, u32 *val);
 	int (*write_config)(struct device_node *dn, int where, int size, u32 val);
+	int (*next_error)(struct eeh_pe **pe);
 };
 
 extern struct eeh_ops *eeh_ops;
 extern int eeh_subsystem_enabled;
-extern struct mutex eeh_mutex;
+extern raw_spinlock_t confirm_error_lock;
 extern int eeh_probe_mode;
 
 #define EEH_PROBE_MODE_DEV	(1<<0)	/* From PCI device */
@@ -166,14 +172,14 @@ static inline int eeh_probe_mode_dev(void)
 	return (eeh_probe_mode == EEH_PROBE_MODE_DEV);
 }
 
-static inline void eeh_lock(void)
+static inline void eeh_serialize_lock(unsigned long *flags)
 {
-	mutex_lock(&eeh_mutex);
+	raw_spin_lock_irqsave(&confirm_error_lock, *flags);
 }
 
-static inline void eeh_unlock(void)
+static inline void eeh_serialize_unlock(unsigned long flags)
 {
-	mutex_unlock(&eeh_mutex);
+	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
 }
 
 /*
@@ -184,8 +190,11 @@ static inline void eeh_unlock(void)
 
 typedef void *(*eeh_traverse_func)(void *data, void *flag);
 int eeh_phb_pe_create(struct pci_controller *phb);
+struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
+struct eeh_pe *eeh_pe_get(struct eeh_dev *edev);
 int eeh_add_to_parent_pe(struct eeh_dev *edev);
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe);
+void eeh_pe_update_time_stamp(struct eeh_pe *pe);
 void *eeh_pe_dev_traverse(struct eeh_pe *root,
 		eeh_traverse_func fn, void *flag);
 void eeh_pe_restore_bars(struct eeh_pe *pe);
@@ -193,12 +202,13 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
 void *eeh_dev_init(struct device_node *dn, void *data);
 void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
+int eeh_init(void);
 int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 unsigned long eeh_check_failure(const volatile void __iomem *token,
 				unsigned long val);
 int eeh_dev_check_failure(struct eeh_dev *edev);
-void __init eeh_addr_cache_build(void);
+void eeh_addr_cache_build(void);
 void eeh_add_device_tree_early(struct device_node *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
@@ -221,6 +231,11 @@ void eeh_remove_bus_device(struct pci_dev *, int);
221 231
222#else /* !CONFIG_EEH */ 232#else /* !CONFIG_EEH */
223 233
234static inline int eeh_init(void)
235{
236 return 0;
237}
238
224static inline void *eeh_dev_init(struct device_node *dn, void *data) 239static inline void *eeh_dev_init(struct device_node *dn, void *data)
225{ 240{
226 return NULL; 241 return NULL;
@@ -245,9 +260,6 @@ static inline void eeh_add_sysfs_files(struct pci_bus *bus) { }
245 260
246static inline void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe) { } 261static inline void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe) { }
247 262
248static inline void eeh_lock(void) { }
249static inline void eeh_unlock(void) { }
250
251#define EEH_POSSIBLE_ERROR(val, type) (0) 263#define EEH_POSSIBLE_ERROR(val, type) (0)
252#define EEH_IO_ERROR_VALUE(size) (-1UL) 264#define EEH_IO_ERROR_VALUE(size) (-1UL)
253#endif /* CONFIG_EEH */ 265#endif /* CONFIG_EEH */
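
The eeh_serialize_lock()/eeh_serialize_unlock() pair above replaces the old mutex with a raw spinlock so serialization can happen in atomic context, where EEH errors are actually detected. A minimal caller sketch, assuming the declarations in this hunk (the EEH_PE_ISOLATED check is a hypothetical example):

static int example_pe_is_isolated(struct eeh_pe *pe)
{
        unsigned long flags;
        int ret;

        eeh_serialize_lock(&flags);
        ret = (pe->state & EEH_PE_ISOLATED) != 0;   /* hypothetical check */
        eeh_serialize_unlock(flags);

        return ret;
}
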
diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h
index de67d830151b..89d5670b2eeb 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -31,7 +31,9 @@ struct eeh_event {
31 struct eeh_pe *pe; /* EEH PE */ 31 struct eeh_pe *pe; /* EEH PE */
32}; 32};
33 33
34int eeh_event_init(void);
34int eeh_send_failure_event(struct eeh_pe *pe); 35int eeh_send_failure_event(struct eeh_pe *pe);
36void eeh_remove_event(struct eeh_pe *pe);
35void eeh_handle_event(struct eeh_pe *pe); 37void eeh_handle_event(struct eeh_pe *pe);
36 38
37#endif /* __KERNEL__ */ 39#endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 46793b58a761..07ca627e52c0 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -358,12 +358,12 @@ label##_relon_pSeries: \
358 /* No guest interrupts come through here */ \ 358 /* No guest interrupts come through here */ \
359 SET_SCRATCH0(r13); /* save r13 */ \ 359 SET_SCRATCH0(r13); /* save r13 */ \
360 EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \ 360 EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
361 EXC_STD, KVMTEST_PR, vec) 361 EXC_STD, NOTEST, vec)
362 362
363#define STD_RELON_EXCEPTION_PSERIES_OOL(vec, label) \ 363#define STD_RELON_EXCEPTION_PSERIES_OOL(vec, label) \
364 .globl label##_relon_pSeries; \ 364 .globl label##_relon_pSeries; \
365label##_relon_pSeries: \ 365label##_relon_pSeries: \
366 EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, vec); \ 366 EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec); \
367 EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_STD) 367 EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_STD)
368 368
369#define STD_RELON_EXCEPTION_HV(loc, vec, label) \ 369#define STD_RELON_EXCEPTION_HV(loc, vec, label) \
@@ -374,12 +374,12 @@ label##_relon_hv: \
374 /* No guest interrupts come through here */ \ 374 /* No guest interrupts come through here */ \
375 SET_SCRATCH0(r13); /* save r13 */ \ 375 SET_SCRATCH0(r13); /* save r13 */ \
376 EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \ 376 EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
377 EXC_HV, KVMTEST, vec) 377 EXC_HV, NOTEST, vec)
378 378
379#define STD_RELON_EXCEPTION_HV_OOL(vec, label) \ 379#define STD_RELON_EXCEPTION_HV_OOL(vec, label) \
380 .globl label##_relon_hv; \ 380 .globl label##_relon_hv; \
381label##_relon_hv: \ 381label##_relon_hv: \
382 EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, vec); \ 382 EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec); \
383 EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_HV) 383 EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_HV)
384 384
385/* This associates vector numbers with bits in paca->irq_happened */ 385/* This associates vector numbers with bits in paca->irq_happened */
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index f2498c8e595d..d750336b171d 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -191,8 +191,14 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma,
191 unsigned long vmaddr) 191 unsigned long vmaddr)
192{ 192{
193} 193}
194#endif /* CONFIG_HUGETLB_PAGE */
195 194
195#define hugepd_shift(x) 0
196static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
197 unsigned pdshift)
198{
199 return NULL;
200}
201#endif /* CONFIG_HUGETLB_PAGE */
196 202
197/* 203/*
198 * FSL Book3E platforms require special gpage handling - the gpages 204 * FSL Book3E platforms require special gpage handling - the gpages
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678e3dbe..c34656a8925e 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
76 struct iommu_pool large_pool; 76 struct iommu_pool large_pool;
77 struct iommu_pool pools[IOMMU_NR_POOLS]; 77 struct iommu_pool pools[IOMMU_NR_POOLS];
78 unsigned long *it_map; /* A simple allocation bitmap for now */ 78 unsigned long *it_map; /* A simple allocation bitmap for now */
79#ifdef CONFIG_IOMMU_API
80 struct iommu_group *it_group;
81#endif
79}; 82};
80 83
81struct scatterlist; 84struct scatterlist;
@@ -98,6 +101,8 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
98 */ 101 */
99extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, 102extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
100 int nid); 103 int nid);
104extern void iommu_register_group(struct iommu_table *tbl,
105 int pci_domain_number, unsigned long pe_num);
101 106
102extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl, 107extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
103 struct scatterlist *sglist, int nelems, 108 struct scatterlist *sglist, int nelems,
@@ -125,13 +130,6 @@ extern void iommu_init_early_pSeries(void);
125extern void iommu_init_early_dart(void); 130extern void iommu_init_early_dart(void);
126extern void iommu_init_early_pasemi(void); 131extern void iommu_init_early_pasemi(void);
127 132
128#ifdef CONFIG_PCI
129extern void pci_iommu_init(void);
130extern void pci_direct_iommu_init(void);
131#else
132static inline void pci_iommu_init(void) { }
133#endif
134
135extern void alloc_dart_table(void); 133extern void alloc_dart_table(void);
136#if defined(CONFIG_PPC64) && defined(CONFIG_PM) 134#if defined(CONFIG_PPC64) && defined(CONFIG_PM)
137static inline void iommu_save(void) 135static inline void iommu_save(void)
@@ -147,5 +145,26 @@ static inline void iommu_restore(void)
147} 145}
148#endif 146#endif
149 147
148/* The API to support IOMMU operations for VFIO */
149extern int iommu_tce_clear_param_check(struct iommu_table *tbl,
150 unsigned long ioba, unsigned long tce_value,
151 unsigned long npages);
152extern int iommu_tce_put_param_check(struct iommu_table *tbl,
153 unsigned long ioba, unsigned long tce);
154extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
155 unsigned long hwaddr, enum dma_data_direction direction);
156extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
157 unsigned long entry);
158extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
159 unsigned long entry, unsigned long pages);
160extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
161 unsigned long entry, unsigned long tce);
162
163extern void iommu_flush_tce(struct iommu_table *tbl);
164extern int iommu_take_ownership(struct iommu_table *tbl);
165extern void iommu_release_ownership(struct iommu_table *tbl);
166
167extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
168
150#endif /* __KERNEL__ */ 169#endif /* __KERNEL__ */
151#endif /* _ASM_IOMMU_H */ 170#endif /* _ASM_IOMMU_H */
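
The extern block above is the kernel-side contract that VFIO drives. A hedged sketch of the intended call sequence, with error handling abbreviated and entry/tce treated as caller-supplied values:

static long example_vfio_tce_cycle(struct iommu_table *tbl,
                                   unsigned long entry, unsigned long tce)
{
        long ret;

        /* group attach: take the table away from the kernel DMA path */
        ret = iommu_take_ownership(tbl);
        if (ret)
                return ret;

        /* per-mapping: validate and install a user-supplied TCE */
        ret = iommu_put_tce_user_mode(tbl, entry, tce);
        iommu_flush_tce(tbl);

        /* teardown: drop the mapping, unpin pages, hand the table back */
        iommu_clear_tces_and_put_pages(tbl, entry, 1);
        iommu_release_ownership(tbl);

        return ret;
}
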
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 9c1ff330c805..a1ecb14e4442 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -159,36 +159,46 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
159} 159}
160 160
161/* 161/*
162 * Lock and read a linux PTE. If it's present and writable, atomically 162 * If it's present and writable, atomically set dirty and referenced bits and
163 * set dirty and referenced bits and return the PTE, otherwise return 0. 163 * return the PTE, otherwise return 0. If we find a transparent hugepage
164 * that is marked splitting, we return 0.
164 */ 165 */
165static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing) 166static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
167 unsigned int hugepage)
166{ 168{
167 pte_t pte, tmp; 169 pte_t old_pte, new_pte = __pte(0);
168 170
169 /* wait until _PAGE_BUSY is clear then set it atomically */ 171 while (1) {
170 __asm__ __volatile__ ( 172 old_pte = pte_val(*ptep);
171 "1: ldarx %0,0,%3\n" 173 /*
172 " andi. %1,%0,%4\n" 174 * wait until _PAGE_BUSY is clear then set it atomically
173 " bne- 1b\n" 175 */
174 " ori %1,%0,%4\n" 176 if (unlikely(old_pte & _PAGE_BUSY)) {
175 " stdcx. %1,0,%3\n" 177 cpu_relax();
176 " bne- 1b" 178 continue;
177 : "=&r" (pte), "=&r" (tmp), "=m" (*p) 179 }
178 : "r" (p), "i" (_PAGE_BUSY) 180#ifdef CONFIG_TRANSPARENT_HUGEPAGE
179 : "cc"); 181 /* If hugepage and is trans splitting return None */
180 182 if (unlikely(hugepage &&
181 if (pte_present(pte)) { 183 pmd_trans_splitting(pte_pmd(old_pte))))
182 pte = pte_mkyoung(pte); 184 return __pte(0);
183 if (writing && pte_write(pte)) 185#endif
184 pte = pte_mkdirty(pte); 186 /* If pte is not present return None */
185 } 187 if (unlikely(!(old_pte & _PAGE_PRESENT)))
188 return __pte(0);
186 189
187 *p = pte; /* clears _PAGE_BUSY */ 190 new_pte = pte_mkyoung(old_pte);
191 if (writing && pte_write(old_pte))
192 new_pte = pte_mkdirty(new_pte);
188 193
189 return pte; 194 if (old_pte == __cmpxchg_u64((unsigned long *)ptep, old_pte,
195 new_pte))
196 break;
197 }
198 return new_pte;
190} 199}
191 200
201
192/* Return HPTE cache control bits corresponding to Linux pte bits */ 202/* Return HPTE cache control bits corresponding to Linux pte bits */
193static inline unsigned long hpte_cache_bits(unsigned long pte_val) 203static inline unsigned long hpte_cache_bits(unsigned long pte_val)
194{ 204{
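
The rewritten kvmppc_read_update_linux_pte() above trades ldarx/stdcx. inline assembly for a plain retry loop around a 64-bit compare-and-exchange. A freestanding illustration of the same shape (BUSY_BIT and set_bits are stand-ins for _PAGE_BUSY and the pte helpers, not kernel symbols):

#include <stdatomic.h>
#include <stdint.h>

#define BUSY_BIT (1ULL << 0)            /* stand-in for _PAGE_BUSY */

static uint64_t read_update(_Atomic uint64_t *p, uint64_t set_bits)
{
        uint64_t old, new;

        for (;;) {
                old = atomic_load(p);
                if (old & BUSY_BIT)     /* another updater owns the entry */
                        continue;
                new = old | set_bits;   /* e.g. accessed/dirty */
                if (atomic_compare_exchange_strong(p, &old, new))
                        return new;     /* published atomically; retry on race */
        }
}
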
diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h
index b1e7f2af1016..9b12f88d4adb 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -66,7 +66,8 @@ struct lppaca {
66 66
67 u8 reserved6[48]; 67 u8 reserved6[48];
68 u8 cede_latency_hint; 68 u8 cede_latency_hint;
69 u8 reserved7[7]; 69 u8 ebb_regs_in_use;
70 u8 reserved7[6];
70 u8 dtl_enable_mask; /* Dispatch Trace Log mask */ 71 u8 dtl_enable_mask; /* Dispatch Trace Log mask */
71 u8 donate_dedicated_cpu; /* Donate dedicated CPU cycles */ 72 u8 donate_dedicated_cpu; /* Donate dedicated CPU cycles */
72 u8 fpregs_in_use; 73 u8 fpregs_in_use;
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 92386fc4e82a..8b480901165a 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -36,13 +36,13 @@ struct machdep_calls {
36#ifdef CONFIG_PPC64 36#ifdef CONFIG_PPC64
37 void (*hpte_invalidate)(unsigned long slot, 37 void (*hpte_invalidate)(unsigned long slot,
38 unsigned long vpn, 38 unsigned long vpn,
39 int psize, int ssize, 39 int bpsize, int apsize,
40 int local); 40 int ssize, int local);
41 long (*hpte_updatepp)(unsigned long slot, 41 long (*hpte_updatepp)(unsigned long slot,
42 unsigned long newpp, 42 unsigned long newpp,
43 unsigned long vpn, 43 unsigned long vpn,
44 int psize, int ssize, 44 int bpsize, int apsize,
45 int local); 45 int ssize, int local);
46 void (*hpte_updateboltedpp)(unsigned long newpp, 46 void (*hpte_updateboltedpp)(unsigned long newpp,
47 unsigned long ea, 47 unsigned long ea,
48 int psize, int ssize); 48 int psize, int ssize);
@@ -57,6 +57,9 @@ struct machdep_calls {
57 void (*hpte_removebolted)(unsigned long ea, 57 void (*hpte_removebolted)(unsigned long ea,
58 int psize, int ssize); 58 int psize, int ssize);
59 void (*flush_hash_range)(unsigned long number, int local); 59 void (*flush_hash_range)(unsigned long number, int local);
60 void (*hugepage_invalidate)(struct mm_struct *mm,
61 unsigned char *hpte_slot_array,
62 unsigned long addr, int psize);
60 63
61 /* special for kexec, to be called in real mode, linear mapping is 64 /* special for kexec, to be called in real mode, linear mapping is
62 * destroyed as well */ 65 * destroyed as well */
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 2accc9611248..c4cf01197273 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -340,6 +340,20 @@ extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
340int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, 340int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
341 pte_t *ptep, unsigned long trap, int local, int ssize, 341 pte_t *ptep, unsigned long trap, int local, int ssize,
342 unsigned int shift, unsigned int mmu_psize); 342 unsigned int shift, unsigned int mmu_psize);
343#ifdef CONFIG_TRANSPARENT_HUGEPAGE
344extern int __hash_page_thp(unsigned long ea, unsigned long access,
345 unsigned long vsid, pmd_t *pmdp, unsigned long trap,
346 int local, int ssize, unsigned int psize);
347#else
348static inline int __hash_page_thp(unsigned long ea, unsigned long access,
349 unsigned long vsid, pmd_t *pmdp,
350 unsigned long trap, int local,
351 int ssize, unsigned int psize)
352{
353 BUG();
354 return -1;
355}
356#endif
343extern void hash_failure_debug(unsigned long ea, unsigned long access, 357extern void hash_failure_debug(unsigned long ea, unsigned long access,
344 unsigned long vsid, unsigned long trap, 358 unsigned long vsid, unsigned long trap,
345 int ssize, int psize, int lpsize, 359 int ssize, int psize, int lpsize,
diff --git a/arch/powerpc/include/asm/mpc5121.h b/arch/powerpc/include/asm/mpc5121.h
index 885c040d6194..8ae133eaf9fa 100644
--- a/arch/powerpc/include/asm/mpc5121.h
+++ b/arch/powerpc/include/asm/mpc5121.h
@@ -68,6 +68,5 @@ struct mpc512x_lpc {
68}; 68};
69 69
70int mpc512x_cs_config(unsigned int cs, u32 val); 70int mpc512x_cs_config(unsigned int cs, u32 val);
71int __init mpc5121_clk_init(void);
72 71
73#endif /* __ASM_POWERPC_MPC5121_H__ */ 72#endif /* __ASM_POWERPC_MPC5121_H__ */
diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h
index c0f9ef90f0b8..4a1ac9fbf186 100644
--- a/arch/powerpc/include/asm/mpic.h
+++ b/arch/powerpc/include/asm/mpic.h
@@ -339,6 +339,8 @@ struct mpic
339#endif 339#endif
340}; 340};
341 341
342extern struct bus_type mpic_subsys;
343
342/* 344/*
343 * MPIC flags (passed to mpic_alloc) 345 * MPIC flags (passed to mpic_alloc)
344 * 346 *
@@ -393,6 +395,9 @@ struct mpic
393#define MPIC_REGSET_STANDARD MPIC_REGSET(0) /* Original MPIC */ 395#define MPIC_REGSET_STANDARD MPIC_REGSET(0) /* Original MPIC */
394#define MPIC_REGSET_TSI108 MPIC_REGSET(1) /* Tsi108/109 PIC */ 396#define MPIC_REGSET_TSI108 MPIC_REGSET(1) /* Tsi108/109 PIC */
395 397
398/* Get the version of primary MPIC */
399extern u32 fsl_mpic_primary_get_version(void);
400
396/* Allocate the controller structure and set up the linux irq descs 401 * for the range of interrupts passed in. No HW initialization is
397 * for the range of interrupts passed in. No HW initialization is 402 * actually performed.
398 * actually performed. 403 * actually performed.
diff --git a/arch/powerpc/include/asm/mpic_timer.h b/arch/powerpc/include/asm/mpic_timer.h
new file mode 100644
index 000000000000..0e23cd4ac8aa
--- /dev/null
+++ b/arch/powerpc/include/asm/mpic_timer.h
@@ -0,0 +1,46 @@
1/*
2 * arch/powerpc/include/asm/mpic_timer.h
3 *
4 * Header file for the MPIC Global Timer
5 *
6 * Copyright 2013 Freescale Semiconductor, Inc.
7 *
8 * Author: Wang Dongsheng <Dongsheng.Wang@freescale.com>
9 * Li Yang <leoli@freescale.com>
10 *
11 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the
13 * Free Software Foundation; either version 2 of the License, or (at your
14 * option) any later version.
15 */
16
17#ifndef __MPIC_TIMER__
18#define __MPIC_TIMER__
19
20#include <linux/interrupt.h>
21#include <linux/time.h>
22
23struct mpic_timer {
24 void *dev;
25 struct cascade_priv *cascade_handle;
26 unsigned int num;
27 unsigned int irq;
28};
29
30#ifdef CONFIG_MPIC_TIMER
31struct mpic_timer *mpic_request_timer(irq_handler_t fn, void *dev,
32 const struct timeval *time);
33void mpic_start_timer(struct mpic_timer *handle);
34void mpic_stop_timer(struct mpic_timer *handle);
35void mpic_get_remain_time(struct mpic_timer *handle, struct timeval *time);
36void mpic_free_timer(struct mpic_timer *handle);
37#else
38static inline struct mpic_timer *mpic_request_timer(irq_handler_t fn, void *dev,
39 const struct timeval *time) { return NULL; }
40static inline void mpic_start_timer(struct mpic_timer *handle) { }
41static inline void mpic_stop_timer(struct mpic_timer *handle) { }
42static inline void mpic_get_remain_time(struct mpic_timer *handle, struct timeval *time) { }
43static inline void mpic_free_timer(struct mpic_timer *handle) { }
44#endif
45
46#endif
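
A usage sketch for the API above (hypothetical driver code; my_dev and the handler are illustrative):

static irqreturn_t my_timeout(int irq, void *dev)
{
        /* timer fired */
        return IRQ_HANDLED;
}

static int example_arm_timer(void *my_dev)
{
        struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
        struct mpic_timer *t;

        t = mpic_request_timer(my_timeout, my_dev, &tv);
        if (!t)
                return -ENODEV;

        mpic_start_timer(t);
        /* ... later ... */
        mpic_stop_timer(t);
        mpic_free_timer(t);

        return 0;
}
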
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index cbb9305ab15a..029fe85722aa 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -117,7 +117,13 @@ extern int opal_enter_rtas(struct rtas_args *args,
117#define OPAL_SET_SLOT_LED_STATUS 55 117#define OPAL_SET_SLOT_LED_STATUS 55
118#define OPAL_GET_EPOW_STATUS 56 118#define OPAL_GET_EPOW_STATUS 56
119#define OPAL_SET_SYSTEM_ATTENTION_LED 57 119#define OPAL_SET_SYSTEM_ATTENTION_LED 57
120#define OPAL_RESERVED1 58
121#define OPAL_RESERVED2 59
122#define OPAL_PCI_NEXT_ERROR 60
123#define OPAL_PCI_EEH_FREEZE_STATUS2 61
124#define OPAL_PCI_POLL 62
120#define OPAL_PCI_MSI_EOI 63 125#define OPAL_PCI_MSI_EOI 63
126#define OPAL_PCI_GET_PHB_DIAG_DATA2 64
121 127
122#ifndef __ASSEMBLY__ 128#ifndef __ASSEMBLY__
123 129
@@ -125,6 +131,7 @@ extern int opal_enter_rtas(struct rtas_args *args,
125enum OpalVendorApiTokens { 131enum OpalVendorApiTokens {
126 OPAL_START_VENDOR_API_RANGE = 1000, OPAL_END_VENDOR_API_RANGE = 1999 132 OPAL_START_VENDOR_API_RANGE = 1000, OPAL_END_VENDOR_API_RANGE = 1999
127}; 133};
134
128enum OpalFreezeState { 135enum OpalFreezeState {
129 OPAL_EEH_STOPPED_NOT_FROZEN = 0, 136 OPAL_EEH_STOPPED_NOT_FROZEN = 0,
130 OPAL_EEH_STOPPED_MMIO_FREEZE = 1, 137 OPAL_EEH_STOPPED_MMIO_FREEZE = 1,
@@ -134,55 +141,69 @@ enum OpalFreezeState {
134 OPAL_EEH_STOPPED_TEMP_UNAVAIL = 5, 141 OPAL_EEH_STOPPED_TEMP_UNAVAIL = 5,
135 OPAL_EEH_STOPPED_PERM_UNAVAIL = 6 142 OPAL_EEH_STOPPED_PERM_UNAVAIL = 6
136}; 143};
144
137enum OpalEehFreezeActionToken { 145enum OpalEehFreezeActionToken {
138 OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO = 1, 146 OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO = 1,
139 OPAL_EEH_ACTION_CLEAR_FREEZE_DMA = 2, 147 OPAL_EEH_ACTION_CLEAR_FREEZE_DMA = 2,
140 OPAL_EEH_ACTION_CLEAR_FREEZE_ALL = 3 148 OPAL_EEH_ACTION_CLEAR_FREEZE_ALL = 3
141}; 149};
150
142enum OpalPciStatusToken { 151enum OpalPciStatusToken {
143 OPAL_EEH_PHB_NO_ERROR = 0, 152 OPAL_EEH_NO_ERROR = 0,
144 OPAL_EEH_PHB_FATAL = 1, 153 OPAL_EEH_IOC_ERROR = 1,
145 OPAL_EEH_PHB_RECOVERABLE = 2, 154 OPAL_EEH_PHB_ERROR = 2,
146 OPAL_EEH_PHB_BUS_ERROR = 3, 155 OPAL_EEH_PE_ERROR = 3,
147 OPAL_EEH_PCI_NO_DEVSEL = 4, 156 OPAL_EEH_PE_MMIO_ERROR = 4,
148 OPAL_EEH_PCI_TA = 5, 157 OPAL_EEH_PE_DMA_ERROR = 5
149 OPAL_EEH_PCIEX_UR = 6,
150 OPAL_EEH_PCIEX_CA = 7,
151 OPAL_EEH_PCI_MMIO_ERROR = 8,
152 OPAL_EEH_PCI_DMA_ERROR = 9
153}; 158};
159
160enum OpalPciErrorSeverity {
161 OPAL_EEH_SEV_NO_ERROR = 0,
162 OPAL_EEH_SEV_IOC_DEAD = 1,
163 OPAL_EEH_SEV_PHB_DEAD = 2,
164 OPAL_EEH_SEV_PHB_FENCED = 3,
165 OPAL_EEH_SEV_PE_ER = 4,
166 OPAL_EEH_SEV_INF = 5
167};
168
154enum OpalShpcAction { 169enum OpalShpcAction {
155 OPAL_SHPC_GET_LINK_STATE = 0, 170 OPAL_SHPC_GET_LINK_STATE = 0,
156 OPAL_SHPC_GET_SLOT_STATE = 1 171 OPAL_SHPC_GET_SLOT_STATE = 1
157}; 172};
173
158enum OpalShpcLinkState { 174enum OpalShpcLinkState {
159 OPAL_SHPC_LINK_DOWN = 0, 175 OPAL_SHPC_LINK_DOWN = 0,
160 OPAL_SHPC_LINK_UP = 1 176 OPAL_SHPC_LINK_UP = 1
161}; 177};
178
162enum OpalMmioWindowType { 179enum OpalMmioWindowType {
163 OPAL_M32_WINDOW_TYPE = 1, 180 OPAL_M32_WINDOW_TYPE = 1,
164 OPAL_M64_WINDOW_TYPE = 2, 181 OPAL_M64_WINDOW_TYPE = 2,
165 OPAL_IO_WINDOW_TYPE = 3 182 OPAL_IO_WINDOW_TYPE = 3
166}; 183};
184
167enum OpalShpcSlotState { 185enum OpalShpcSlotState {
168 OPAL_SHPC_DEV_NOT_PRESENT = 0, 186 OPAL_SHPC_DEV_NOT_PRESENT = 0,
169 OPAL_SHPC_DEV_PRESENT = 1 187 OPAL_SHPC_DEV_PRESENT = 1
170}; 188};
189
171enum OpalExceptionHandler { 190enum OpalExceptionHandler {
172 OPAL_MACHINE_CHECK_HANDLER = 1, 191 OPAL_MACHINE_CHECK_HANDLER = 1,
173 OPAL_HYPERVISOR_MAINTENANCE_HANDLER = 2, 192 OPAL_HYPERVISOR_MAINTENANCE_HANDLER = 2,
174 OPAL_SOFTPATCH_HANDLER = 3 193 OPAL_SOFTPATCH_HANDLER = 3
175}; 194};
195
176enum OpalPendingState { 196enum OpalPendingState {
177 OPAL_EVENT_OPAL_INTERNAL = 0x1, 197 OPAL_EVENT_OPAL_INTERNAL = 0x1,
178 OPAL_EVENT_NVRAM = 0x2, 198 OPAL_EVENT_NVRAM = 0x2,
179 OPAL_EVENT_RTC = 0x4, 199 OPAL_EVENT_RTC = 0x4,
180 OPAL_EVENT_CONSOLE_OUTPUT = 0x8, 200 OPAL_EVENT_CONSOLE_OUTPUT = 0x8,
181 OPAL_EVENT_CONSOLE_INPUT = 0x10, 201 OPAL_EVENT_CONSOLE_INPUT = 0x10,
182 OPAL_EVENT_ERROR_LOG_AVAIL = 0x20, 202 OPAL_EVENT_ERROR_LOG_AVAIL = 0x20,
183 OPAL_EVENT_ERROR_LOG = 0x40, 203 OPAL_EVENT_ERROR_LOG = 0x40,
184 OPAL_EVENT_EPOW = 0x80, 204 OPAL_EVENT_EPOW = 0x80,
185 OPAL_EVENT_LED_STATUS = 0x100 205 OPAL_EVENT_LED_STATUS = 0x100,
206 OPAL_EVENT_PCI_ERROR = 0x200
186}; 207};
187 208
188/* Machine check related definitions */ 209/* Machine check related definitions */
@@ -364,15 +385,80 @@ struct opal_machine_check_event {
364 } u; 385 } u;
365}; 386};
366 387
388enum {
389 OPAL_P7IOC_DIAG_TYPE_NONE = 0,
390 OPAL_P7IOC_DIAG_TYPE_RGC = 1,
391 OPAL_P7IOC_DIAG_TYPE_BI = 2,
392 OPAL_P7IOC_DIAG_TYPE_CI = 3,
393 OPAL_P7IOC_DIAG_TYPE_MISC = 4,
394 OPAL_P7IOC_DIAG_TYPE_I2C = 5,
395 OPAL_P7IOC_DIAG_TYPE_LAST = 6
396};
397
398struct OpalIoP7IOCErrorData {
399 uint16_t type;
400
401 /* GEM */
402 uint64_t gemXfir;
403 uint64_t gemRfir;
404 uint64_t gemRirqfir;
405 uint64_t gemMask;
406 uint64_t gemRwof;
407
408 /* LEM */
409 uint64_t lemFir;
410 uint64_t lemErrMask;
411 uint64_t lemAction0;
412 uint64_t lemAction1;
413 uint64_t lemWof;
414
415 union {
416 struct OpalIoP7IOCRgcErrorData {
417 uint64_t rgcStatus; /* 3E1C10 */
418 uint64_t rgcLdcp; /* 3E1C18 */
419 } rgc;
420 struct OpalIoP7IOCBiErrorData {
421 uint64_t biLdcp0; /* 3C0100, 3C0118 */
422 uint64_t biLdcp1; /* 3C0108, 3C0120 */
423 uint64_t biLdcp2; /* 3C0110, 3C0128 */
424 uint64_t biFenceStatus; /* 3C0130, 3C0130 */
425
426 uint8_t biDownbound; /* BI Downbound or Upbound */
427 } bi;
428 struct OpalIoP7IOCCiErrorData {
429 uint64_t ciPortStatus; /* 3Dn008 */
430 uint64_t ciPortLdcp; /* 3Dn010 */
431
432 uint8_t ciPort; /* Index of CI port: 0/1 */
433 } ci;
434 };
435};
436
367/** 437/**
368 * This structure defines the overlay which will be used to store PHB error 438 * This structure defines the overlay which will be used to store PHB error
369 * data upon request. 439 * data upon request.
370 */ 440 */
371enum { 441enum {
442 OPAL_PHB_ERROR_DATA_VERSION_1 = 1,
443};
444
445enum {
446 OPAL_PHB_ERROR_DATA_TYPE_P7IOC = 1,
447};
448
449enum {
372 OPAL_P7IOC_NUM_PEST_REGS = 128, 450 OPAL_P7IOC_NUM_PEST_REGS = 128,
373}; 451};
374 452
453struct OpalIoPhbErrorCommon {
454 uint32_t version;
455 uint32_t ioType;
456 uint32_t len;
457};
458
375struct OpalIoP7IOCPhbErrorData { 459struct OpalIoP7IOCPhbErrorData {
460 struct OpalIoPhbErrorCommon common;
461
376 uint32_t brdgCtl; 462 uint32_t brdgCtl;
377 463
378 // P7IOC utl regs 464 // P7IOC utl regs
@@ -530,14 +616,21 @@ int64_t opal_pci_map_pe_dma_window_real(uint64_t phb_id, uint16_t pe_number,
530 uint64_t pci_mem_size); 616 uint64_t pci_mem_size);
531int64_t opal_pci_reset(uint64_t phb_id, uint8_t reset_scope, uint8_t assert_state); 617int64_t opal_pci_reset(uint64_t phb_id, uint8_t reset_scope, uint8_t assert_state);
532 618
533int64_t opal_pci_get_hub_diag_data(uint64_t hub_id, void *diag_buffer, uint64_t diag_buffer_len); 619int64_t opal_pci_get_hub_diag_data(uint64_t hub_id, void *diag_buffer,
534int64_t opal_pci_get_phb_diag_data(uint64_t phb_id, void *diag_buffer, uint64_t diag_buffer_len); 620 uint64_t diag_buffer_len);
621int64_t opal_pci_get_phb_diag_data(uint64_t phb_id, void *diag_buffer,
622 uint64_t diag_buffer_len);
623int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id, void *diag_buffer,
624 uint64_t diag_buffer_len);
535int64_t opal_pci_fence_phb(uint64_t phb_id); 625int64_t opal_pci_fence_phb(uint64_t phb_id);
536int64_t opal_pci_reinit(uint64_t phb_id, uint8_t reinit_scope); 626int64_t opal_pci_reinit(uint64_t phb_id, uint8_t reinit_scope);
537int64_t opal_pci_mask_pe_error(uint64_t phb_id, uint16_t pe_number, uint8_t error_type, uint8_t mask_action); 627int64_t opal_pci_mask_pe_error(uint64_t phb_id, uint16_t pe_number, uint8_t error_type, uint8_t mask_action);
538int64_t opal_set_slot_led_status(uint64_t phb_id, uint64_t slot_id, uint8_t led_type, uint8_t led_action); 628int64_t opal_set_slot_led_status(uint64_t phb_id, uint64_t slot_id, uint8_t led_type, uint8_t led_action);
539int64_t opal_get_epow_status(uint64_t *status); 629int64_t opal_get_epow_status(uint64_t *status);
540int64_t opal_set_system_attention_led(uint8_t led_action); 630int64_t opal_set_system_attention_led(uint8_t led_action);
631int64_t opal_pci_next_error(uint64_t phb_id, uint64_t *first_frozen_pe,
632 uint16_t *pci_error_type, uint16_t *severity);
633int64_t opal_pci_poll(uint64_t phb_id);
541 634
542/* Internal functions */ 635/* Internal functions */
543extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); 636extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data);
@@ -551,6 +644,11 @@ extern void hvc_opal_init_early(void);
551extern int early_init_dt_scan_opal(unsigned long node, const char *uname, 644extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
552 int depth, void *data); 645 int depth, void *data);
553 646
647extern int opal_notifier_register(struct notifier_block *nb);
648extern void opal_notifier_enable(void);
649extern void opal_notifier_disable(void);
650extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val);
651
554extern int opal_get_chars(uint32_t vtermno, char *buf, int count); 652extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
555extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len); 653extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
556 654
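
A sketch of how the new opal_pci_next_error() call is meant to be consumed (hypothetical consumer; assumes OPAL_SUCCESS == 0 as defined elsewhere in this header, and uses the OpalPciStatusToken/OpalPciErrorSeverity values added above):

static void example_drain_pci_errors(uint64_t phb_id)
{
        uint64_t frozen_pe;
        uint16_t err_type, severity;

        while (opal_pci_next_error(phb_id, &frozen_pe,
                                   &err_type, &severity) == OPAL_SUCCESS &&
               err_type != OPAL_EEH_NO_ERROR) {
                if (severity == OPAL_EEH_SEV_PHB_FENCED) {
                        /* escalate: the whole PHB needs a reset */
                        continue;
                }
                /* otherwise recover the single frozen PE */
        }
}
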
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index f265049dd7d6..2dd7bfc459be 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -60,6 +60,7 @@ struct power_pmu {
60#define PPMU_HAS_SSLOT 0x00000020 /* Has sampled slot in MMCRA */ 60#define PPMU_HAS_SSLOT 0x00000020 /* Has sampled slot in MMCRA */
61#define PPMU_HAS_SIER 0x00000040 /* Has SIER */ 61#define PPMU_HAS_SIER 0x00000040 /* Has SIER */
62#define PPMU_BHRB 0x00000080 /* has BHRB feature enabled */ 62#define PPMU_BHRB 0x00000080 /* has BHRB feature enabled */
63#define PPMU_EBB 0x00000100 /* supports event based branch */
63 64
64/* 65/*
65 * Values for flags to get_alternatives() 66 * Values for flags to get_alternatives()
@@ -68,6 +69,11 @@ struct power_pmu {
68#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ 69#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */
69#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ 70#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */
70 71
72/*
73 * We use the event config bit 63 as a flag to request EBB.
74 */
75#define EVENT_CONFIG_EBB_SHIFT 63
76
71extern int register_power_pmu(struct power_pmu *); 77extern int register_power_pmu(struct power_pmu *);
72 78
73struct pt_regs; 79struct pt_regs;
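
Since bit 63 of the event config is the EBB request flag, a userspace consumer would open an EBB event roughly like this (sketch; `code` is a hypothetical raw event code, and the pinned/exclusive settings follow the EBB usage rules documented by this series):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_ebb_event(unsigned long long code, pid_t task)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_RAW;
        attr.config = code | (1ULL << 63);      /* EVENT_CONFIG_EBB_SHIFT */
        attr.exclude_kernel = 1;
        attr.exclude_hv = 1;
        attr.pinned = 1;
        attr.exclusive = 1;

        return syscall(__NR_perf_event_open, &attr, task, -1, -1, 0);
}
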
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index b66ae722a8e9..f65e27b09bd3 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -221,17 +221,17 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
221 221
222static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) 222static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
223{ 223{
224 return kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE), 224 return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX),
225 GFP_KERNEL|__GFP_REPEAT); 225 GFP_KERNEL|__GFP_REPEAT);
226} 226}
227 227
228static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) 228static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
229{ 229{
230 kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd); 230 kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd);
231} 231}
232 232
233#define __pmd_free_tlb(tlb, pmd, addr) \ 233#define __pmd_free_tlb(tlb, pmd, addr) \
234 pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE) 234 pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX)
235#ifndef CONFIG_PPC_64K_PAGES 235#ifndef CONFIG_PPC_64K_PAGES
236#define __pud_free_tlb(tlb, pud, addr) \ 236#define __pud_free_tlb(tlb, pud, addr) \
237 pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) 237 pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
index 45142d640720..a56b82fb0609 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
@@ -33,7 +33,8 @@
33#define PGDIR_MASK (~(PGDIR_SIZE-1)) 33#define PGDIR_MASK (~(PGDIR_SIZE-1))
34 34
35/* Bits to mask out from a PMD to get to the PTE page */ 35/* Bits to mask out from a PMD to get to the PTE page */
36#define PMD_MASKED_BITS 0x1ff 36/* PMDs point to PTE table fragments which are 4K aligned. */
37#define PMD_MASKED_BITS 0xfff
37/* Bits to mask out from a PGD/PUD to get to the PMD page */ 38/* Bits to mask out from a PGD/PUD to get to the PMD page */
38#define PUD_MASKED_BITS 0x1ff 39#define PUD_MASKED_BITS 0x1ff
39 40
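
The wider mask follows directly from the 4K alignment noted in the new comment; a worked example (value illustrative):

static unsigned long example_pte_frag(void)
{
        unsigned long pmd_val = 0xc000000012345abcUL;

        /* PMD_MASKED_BITS = 0xfff strips the low 12 bits */
        return pmd_val & ~0xfffUL;      /* 0xc000000012345000 */
}
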
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index e3d55f6f24fe..46db09414a10 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -10,6 +10,7 @@
10#else 10#else
11#include <asm/pgtable-ppc64-4k.h> 11#include <asm/pgtable-ppc64-4k.h>
12#endif 12#endif
13#include <asm/barrier.h>
13 14
14#define FIRST_USER_ADDRESS 0 15#define FIRST_USER_ADDRESS 0
15 16
@@ -20,7 +21,11 @@
20 PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) 21 PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
21#define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE) 22#define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE)
22 23
23 24#ifdef CONFIG_TRANSPARENT_HUGEPAGE
25#define PMD_CACHE_INDEX (PMD_INDEX_SIZE + 1)
26#else
27#define PMD_CACHE_INDEX PMD_INDEX_SIZE
28#endif
24/* 29/*
25 * Define the address range of the kernel non-linear virtual area 30 * Define the address range of the kernel non-linear virtual area
26 */ 31 */
@@ -150,7 +155,7 @@
150#define pmd_present(pmd) (pmd_val(pmd) != 0) 155#define pmd_present(pmd) (pmd_val(pmd) != 0)
151#define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0) 156#define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0)
152#define pmd_page_vaddr(pmd) (pmd_val(pmd) & ~PMD_MASKED_BITS) 157#define pmd_page_vaddr(pmd) (pmd_val(pmd) & ~PMD_MASKED_BITS)
153#define pmd_page(pmd) virt_to_page(pmd_page_vaddr(pmd)) 158extern struct page *pmd_page(pmd_t pmd);
154 159
155#define pud_set(pudp, pudval) (pud_val(*(pudp)) = (pudval)) 160#define pud_set(pudp, pudval) (pud_val(*(pudp)) = (pudval))
156#define pud_none(pud) (!pud_val(pud)) 161#define pud_none(pud) (!pud_val(pud))
@@ -339,43 +344,217 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
339 344
340void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); 345void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
341void pgtable_cache_init(void); 346void pgtable_cache_init(void);
347#endif /* __ASSEMBLY__ */
348
349/*
350 * THP pages can't be special, so we reuse the _PAGE_SPECIAL bit for _PAGE_SPLITTING.
351 */
352#define _PAGE_SPLITTING _PAGE_SPECIAL
353
354/*
355 * We need to differentiate between an explicit huge page and a THP huge
356 * page, since a THP huge page also needs to track real subpage details
357 */
358#define _PAGE_THP_HUGE _PAGE_4K_PFN
342 359
343/* 360/*
344 * find_linux_pte returns the address of a linux pte for a given 361 * set of bits not changed in pmd_modify.
345 * effective address and directory. If not found, it returns zero.
346 */ 362 */
347static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea) 363#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | \
364 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
365 _PAGE_THP_HUGE)
366
367#ifndef __ASSEMBLY__
368/*
369 * The linux hugepage PMD now includes the pmd entries followed by the address
370 * of the stashed pgtable_t. The stashed pgtable_t contains the hpte bits:
371 * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte for
372 * each HPTE entry. With a 16MB hugepage and 64K HPTEs we need 256 entries and
373 * with 4K HPTEs we need 4096 entries. Both will fit in a 4K pgtable_t.
374 *
375 * The last three bits are intentionally left as zero. These memory locations
376 * are also used as normal page PTE pointers, so if we have any pointers
377 * left around while we collapse a hugepage, we need to make sure the
378 * _PAGE_PRESENT and _PAGE_FILE bits of those are zero when we look at them.
379 */
380static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
348{ 381{
349 pgd_t *pg; 382 return (hpte_slot_array[index] >> 3) & 0x1;
350 pud_t *pu;
351 pmd_t *pm;
352 pte_t *pt = NULL;
353
354 pg = pgdir + pgd_index(ea);
355 if (!pgd_none(*pg)) {
356 pu = pud_offset(pg, ea);
357 if (!pud_none(*pu)) {
358 pm = pmd_offset(pu, ea);
359 if (pmd_present(*pm))
360 pt = pte_offset_kernel(pm, ea);
361 }
362 }
363 return pt;
364} 383}
365 384
366#ifdef CONFIG_HUGETLB_PAGE 385static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
367pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, 386 int index)
368 unsigned *shift);
369#else
370static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
371 unsigned *shift)
372{ 387{
373 if (shift) 388 return hpte_slot_array[index] >> 4;
374 *shift = 0;
375 return find_linux_pte(pgdir, ea);
376} 389}
377#endif /* !CONFIG_HUGETLB_PAGE */
378 390
379#endif /* __ASSEMBLY__ */ 391static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
392 unsigned int index, unsigned int hidx)
393{
394 hpte_slot_array[index] = hidx << 4 | 0x1 << 3;
395}
380 396
397static inline char *get_hpte_slot_array(pmd_t *pmdp)
398{
399 /*
400 * The hpte hash index (hidx) is stored in the pgtable whose address is
401 * in the second half of the PMD.
402 *
403 * Order this load with the test for pmd_trans_huge in the caller.
404 */
405 smp_rmb();
406 return *(char **)(pmdp + PTRS_PER_PMD);
407
408
409}
410
411extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
412 pmd_t *pmdp);
413#ifdef CONFIG_TRANSPARENT_HUGEPAGE
414extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
415extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
416extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
417extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
418 pmd_t *pmdp, pmd_t pmd);
419extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
420 pmd_t *pmd);
421
422static inline int pmd_trans_huge(pmd_t pmd)
423{
424 /*
425 * leaf pte for huge page, bottom two bits != 00
426 */
427 return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
428}
429
430static inline int pmd_large(pmd_t pmd)
431{
432 /*
433 * leaf pte for huge page, bottom two bits != 00
434 */
435 if (pmd_trans_huge(pmd))
436 return pmd_val(pmd) & _PAGE_PRESENT;
437 return 0;
438}
439
440static inline int pmd_trans_splitting(pmd_t pmd)
441{
442 if (pmd_trans_huge(pmd))
443 return pmd_val(pmd) & _PAGE_SPLITTING;
444 return 0;
445}
446
447extern int has_transparent_hugepage(void);
448#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
449
450static inline pte_t pmd_pte(pmd_t pmd)
451{
452 return __pte(pmd_val(pmd));
453}
454
455static inline pmd_t pte_pmd(pte_t pte)
456{
457 return __pmd(pte_val(pte));
458}
459
460static inline pte_t *pmdp_ptep(pmd_t *pmd)
461{
462 return (pte_t *)pmd;
463}
464
465#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
466#define pmd_young(pmd) pte_young(pmd_pte(pmd))
467#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
468#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
469#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
470#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
471#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
472
473#define __HAVE_ARCH_PMD_WRITE
474#define pmd_write(pmd) pte_write(pmd_pte(pmd))
475
476static inline pmd_t pmd_mkhuge(pmd_t pmd)
477{
478 /* Do nothing, mk_pmd() does this part. */
479 return pmd;
480}
481
482static inline pmd_t pmd_mknotpresent(pmd_t pmd)
483{
484 pmd_val(pmd) &= ~_PAGE_PRESENT;
485 return pmd;
486}
487
488static inline pmd_t pmd_mksplitting(pmd_t pmd)
489{
490 pmd_val(pmd) |= _PAGE_SPLITTING;
491 return pmd;
492}
493
494#define __HAVE_ARCH_PMD_SAME
495static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
496{
497 return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
498}
499
500#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
501extern int pmdp_set_access_flags(struct vm_area_struct *vma,
502 unsigned long address, pmd_t *pmdp,
503 pmd_t entry, int dirty);
504
505extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
506 unsigned long addr,
507 pmd_t *pmdp, unsigned long clr);
508
509static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
510 unsigned long addr, pmd_t *pmdp)
511{
512 unsigned long old;
513
514 if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
515 return 0;
516 old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED);
517 return ((old & _PAGE_ACCESSED) != 0);
518}
519
520#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
521extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
522 unsigned long address, pmd_t *pmdp);
523#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
524extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
525 unsigned long address, pmd_t *pmdp);
526
527#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
528extern pmd_t pmdp_get_and_clear(struct mm_struct *mm,
529 unsigned long addr, pmd_t *pmdp);
530
531#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
532extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
533 pmd_t *pmdp);
534
535#define __HAVE_ARCH_PMDP_SET_WRPROTECT
536static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
537 pmd_t *pmdp)
538{
539
540 if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
541 return;
542
543 pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW);
544}
545
546#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
547extern void pmdp_splitting_flush(struct vm_area_struct *vma,
548 unsigned long address, pmd_t *pmdp);
549
550#define __HAVE_ARCH_PGTABLE_DEPOSIT
551extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
552 pgtable_t pgtable);
553#define __HAVE_ARCH_PGTABLE_WITHDRAW
554extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
555
556#define __HAVE_ARCH_PMDP_INVALIDATE
557extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
558 pmd_t *pmdp);
559#endif /* __ASSEMBLY__ */
381#endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */ 560#endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
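
A worked example of the one-byte-per-HPTE encoding described above ([ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]), using the helpers exactly as defined in this hunk:

static void example_slot_encoding(void)
{
        unsigned char slots[256] = { 0 };

        /* slots[5] = (3 << 4) | (1 << 3) = 0x38 */
        mark_hpte_slot_valid(slots, 5, 3);

        /* hpte_valid(slots, 5)      == (0x38 >> 3) & 0x1 == 1 */
        /* hpte_hash_index(slots, 5) == 0x38 >> 4        == 3 */
}
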
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index b6293d26bd39..7d6eacf249cf 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -217,6 +217,12 @@ extern int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr,
217 217
218extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, 218extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
219 unsigned long end, int write, struct page **pages, int *nr); 219 unsigned long end, int write, struct page **pages, int *nr);
220#ifndef CONFIG_TRANSPARENT_HUGEPAGE
221#define pmd_large(pmd) 0
222#define has_transparent_hugepage() 0
223#endif
224pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
225 unsigned *shift);
220#endif /* __ASSEMBLY__ */ 226#endif /* __ASSEMBLY__ */
221 227
222#endif /* __KERNEL__ */ 228#endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/probes.h b/arch/powerpc/include/asm/probes.h
index 5f1e15b68704..3421637cfd7b 100644
--- a/arch/powerpc/include/asm/probes.h
+++ b/arch/powerpc/include/asm/probes.h
@@ -38,5 +38,30 @@ typedef u32 ppc_opcode_t;
38#define is_trap(instr) (IS_TW(instr) || IS_TWI(instr)) 38#define is_trap(instr) (IS_TW(instr) || IS_TWI(instr))
39#endif /* CONFIG_PPC64 */ 39#endif /* CONFIG_PPC64 */
40 40
41#ifdef CONFIG_PPC_ADV_DEBUG_REGS
42#define MSR_SINGLESTEP (MSR_DE)
43#else
44#define MSR_SINGLESTEP (MSR_SE)
45#endif
46
47/* Enable single stepping for the current task */
48static inline void enable_single_step(struct pt_regs *regs)
49{
50 regs->msr |= MSR_SINGLESTEP;
51#ifdef CONFIG_PPC_ADV_DEBUG_REGS
52 /*
53 * We turn off Critical Input Exception(CE) to ensure that the single
54 * step will be for the instruction we have the probe on; if we don't,
55 * it is possible we'd get the single step reported for CE.
56 */
57 regs->msr &= ~MSR_CE;
58 mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
59#ifdef CONFIG_PPC_47x
60 isync();
61#endif
62#endif
63}
64
65
41#endif /* __KERNEL__ */ 66#endif /* __KERNEL__ */
42#endif /* _ASM_POWERPC_PROBES_H */ 67#endif /* _ASM_POWERPC_PROBES_H */
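
A sketch of the kind of caller enable_single_step() unifies (hypothetical kprobes-style code; the ainsn.insn field name follows the powerpc kprobes implementation):

static void example_prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
{
        enable_single_step(regs);
        /* single-step the out-of-line copy of the probed instruction */
        regs->nip = (unsigned long)p->ainsn.insn;
}
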
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 14a658363698..47a35b08b963 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -168,10 +168,10 @@ struct thread_struct {
168 * The following help to manage the use of Debug Control Registers 168 * The following help to manage the use of Debug Control Registers
169 * on the BookE platforms. 169 * on the BookE platforms.
170 */ 170 */
171 unsigned long dbcr0; 171 uint32_t dbcr0;
172 unsigned long dbcr1; 172 uint32_t dbcr1;
173#ifdef CONFIG_BOOKE 173#ifdef CONFIG_BOOKE
174 unsigned long dbcr2; 174 uint32_t dbcr2;
175#endif 175#endif
176 /* 176 /*
177 * The stored value of the DBSR register will be the value at the 177 * The stored value of the DBSR register will be the value at the
@@ -179,7 +179,7 @@ struct thread_struct {
179 * user (will never be written to) and has value while helping to 179 * user (will never be written to) and has value while helping to
180 * describe the reason for the last debug trap. Torez 180 * describe the reason for the last debug trap. Torez
181 */ 181 */
182 unsigned long dbsr; 182 uint32_t dbsr;
183 /* 183 /*
184 * The following will contain addresses used by debug applications 184 * The following will contain addresses used by debug applications
185 * to help trace and trap on particular address locations. 185 * to help trace and trap on particular address locations.
@@ -200,7 +200,7 @@ struct thread_struct {
200#endif 200#endif
201#endif 201#endif
202 /* FP and VSX 0-31 register set */ 202 /* FP and VSX 0-31 register set */
203 double fpr[32][TS_FPRWIDTH]; 203 double fpr[32][TS_FPRWIDTH] __attribute__((aligned(16)));
204 struct { 204 struct {
205 205
206 unsigned int pad; 206 unsigned int pad;
@@ -287,9 +287,9 @@ struct thread_struct {
287 unsigned long siar; 287 unsigned long siar;
288 unsigned long sdar; 288 unsigned long sdar;
289 unsigned long sier; 289 unsigned long sier;
290 unsigned long mmcr0;
291 unsigned long mmcr2; 290 unsigned long mmcr2;
292 unsigned long mmcra; 291 unsigned mmcr0;
292 unsigned used_ebb;
293#endif 293#endif
294}; 294};
295 295
@@ -404,9 +404,7 @@ static inline void prefetchw(const void *x)
404 404
405#define spin_lock_prefetch(x) prefetchw(x) 405#define spin_lock_prefetch(x) prefetchw(x)
406 406
407#ifdef CONFIG_PPC64
408#define HAVE_ARCH_PICK_MMAP_LAYOUT 407#define HAVE_ARCH_PICK_MMAP_LAYOUT
409#endif
410 408
411#ifdef CONFIG_PPC64 409#ifdef CONFIG_PPC64
412static inline unsigned long get_clean_sp(unsigned long sp, int is_32) 410static inline unsigned long get_clean_sp(unsigned long sp, int is_32)
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 4a9e408644fe..5d7d9c2a5473 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -621,11 +621,15 @@
621#define MMCR0_PMXE 0x04000000UL /* performance monitor exception enable */ 621#define MMCR0_PMXE 0x04000000UL /* performance monitor exception enable */
622#define MMCR0_FCECE 0x02000000UL /* freeze ctrs on enabled cond or event */ 622#define MMCR0_FCECE 0x02000000UL /* freeze ctrs on enabled cond or event */
623#define MMCR0_TBEE 0x00400000UL /* time base exception enable */ 623#define MMCR0_TBEE 0x00400000UL /* time base exception enable */
624#define MMCR0_EBE 0x00100000UL /* Event based branch enable */
625#define MMCR0_PMCC 0x000c0000UL /* PMC control */
626#define MMCR0_PMCC_U6 0x00080000UL /* PMC1-6 are R/W by user (PR) */
624#define MMCR0_PMC1CE 0x00008000UL /* PMC1 count enable*/ 627#define MMCR0_PMC1CE 0x00008000UL /* PMC1 count enable*/
625#define MMCR0_PMCjCE 0x00004000UL /* PMCj count enable*/ 628#define MMCR0_PMCjCE 0x00004000UL /* PMCj count enable*/
626#define MMCR0_TRIGGER 0x00002000UL /* TRIGGER enable */ 629#define MMCR0_TRIGGER 0x00002000UL /* TRIGGER enable */
627#define MMCR0_PMAO 0x00000080UL /* performance monitor alert has occurred, set to 0 after handling exception */ 630#define MMCR0_PMAO 0x00000080UL /* performance monitor alert has occurred, set to 0 after handling exception */
628#define MMCR0_SHRFC 0x00000040UL /* SHRre freeze conditions between threads */ 631#define MMCR0_SHRFC 0x00000040UL /* SHRre freeze conditions between threads */
632#define MMCR0_FC56 0x00000010UL /* freeze counters 5 and 6 */
629#define MMCR0_FCTI 0x00000008UL /* freeze counters in tags inactive mode */ 633#define MMCR0_FCTI 0x00000008UL /* freeze counters in tags inactive mode */
630#define MMCR0_FCTA 0x00000004UL /* freeze counters in tags active mode */ 634#define MMCR0_FCTA 0x00000004UL /* freeze counters in tags active mode */
631#define MMCR0_FCWAIT 0x00000002UL /* freeze counter in WAIT state */ 635#define MMCR0_FCWAIT 0x00000002UL /* freeze counter in WAIT state */
@@ -673,6 +677,11 @@
673#define SIER_SIAR_VALID 0x0400000 /* SIAR contents valid */ 677#define SIER_SIAR_VALID 0x0400000 /* SIAR contents valid */
674#define SIER_SDAR_VALID 0x0200000 /* SDAR contents valid */ 678#define SIER_SDAR_VALID 0x0200000 /* SDAR contents valid */
675 679
680/* When EBB is enabled, some of MMCR0/MMCR2/SIER are user accessible */
681#define MMCR0_USER_MASK (MMCR0_FC | MMCR0_PMXE | MMCR0_PMAO)
682#define MMCR2_USER_MASK 0x4020100804020000UL /* (FC1P|FC2P|FC3P|FC4P|FC5P|FC6P) */
683#define SIER_USER_MASK 0x7fffffUL
684
676#define SPRN_PA6T_MMCR0 795 685#define SPRN_PA6T_MMCR0 795
677#define PA6T_MMCR0_EN0 0x0000000000000001UL 686#define PA6T_MMCR0_EN0 0x0000000000000001UL
678#define PA6T_MMCR0_EN1 0x0000000000000002UL 687#define PA6T_MMCR0_EN1 0x0000000000000002UL
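
These masks bound what problem state may see of MMCR0/MMCR2/SIER when EBB is in use. A hedged sketch of how a switch-out path might apply them (shape assumed, not quoted from the series; uses the thread_struct fields added elsewhere in this patch set):

static void example_ebb_switch_out(struct thread_struct *t)
{
        /* save only the user-visible fields of the PMU registers */
        t->sier  = mfspr(SPRN_SIER)  & SIER_USER_MASK;
        t->mmcr2 = mfspr(SPRN_MMCR2) & MMCR2_USER_MASK;
        t->mmcr0 = mfspr(SPRN_MMCR0) & MMCR0_USER_MASK;
}
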
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 34fd70488d83..c7a8bfc9f6f5 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -350,8 +350,8 @@ static inline u32 rtas_config_addr(int busno, int devfn, int reg)
350 (devfn << 8) | (reg & 0xff); 350 (devfn << 8) | (reg & 0xff);
351} 351}
352 352
353extern void __cpuinit rtas_give_timebase(void); 353extern void rtas_give_timebase(void);
354extern void __cpuinit rtas_take_timebase(void); 354extern void rtas_take_timebase(void);
355 355
356#ifdef CONFIG_PPC_RTAS 356#ifdef CONFIG_PPC_RTAS
357static inline int page_is_rtas_user_buf(unsigned long pfn) 357static inline int page_is_rtas_user_buf(unsigned long pfn)
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 200d763a0a67..49a13e0ef234 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -67,4 +67,18 @@ static inline void flush_spe_to_thread(struct task_struct *t)
67} 67}
68#endif 68#endif
69 69
70static inline void clear_task_ebb(struct task_struct *t)
71{
72#ifdef CONFIG_PPC_BOOK3S_64
73 /* EBB perf events are not inherited, so clear all EBB state. */
74 t->thread.bescr = 0;
75 t->thread.mmcr2 = 0;
76 t->thread.mmcr0 = 0;
77 t->thread.siar = 0;
78 t->thread.sdar = 0;
79 t->thread.sier = 0;
80 t->thread.used_ebb = 0;
81#endif
82}
83
70#endif /* _ASM_POWERPC_SWITCH_TO_H */ 84#endif /* _ASM_POWERPC_SWITCH_TO_H */
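
Because EBB state is explicitly not inherited, the natural caller is the task-duplication path; a sketch (the exact call site may differ from this):

int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
        *dst = *src;
        clear_task_ebb(dst);    /* child starts with no EBB state */

        return 0;
}
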
diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h
index 61a59271665b..2def01ed0cb2 100644
--- a/arch/powerpc/include/asm/tlbflush.h
+++ b/arch/powerpc/include/asm/tlbflush.h
@@ -165,7 +165,8 @@ static inline void flush_tlb_kernel_range(unsigned long start,
165/* Private function for use by PCI IO mapping code */ 165/* Private function for use by PCI IO mapping code */
166extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, 166extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
167 unsigned long end); 167 unsigned long end);
168 168extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd,
169 unsigned long addr);
169#else 170#else
170#error Unsupported MMU type 171#error Unsupported MMU type
171#endif 172#endif
diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h
index 50f261bc3e95..0d9cecddf8a4 100644
--- a/arch/powerpc/include/asm/vdso.h
+++ b/arch/powerpc/include/asm/vdso.h
@@ -22,7 +22,7 @@ extern unsigned long vdso64_rt_sigtramp;
22extern unsigned long vdso32_sigtramp; 22extern unsigned long vdso32_sigtramp;
23extern unsigned long vdso32_rt_sigtramp; 23extern unsigned long vdso32_rt_sigtramp;
24 24
25int __cpuinit vdso_getcpu_init(void); 25int vdso_getcpu_init(void);
26 26
27#else /* __ASSEMBLY__ */ 27#else /* __ASSEMBLY__ */
28 28
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index f960a7944553..a8619bfe879e 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -58,6 +58,8 @@ obj-$(CONFIG_RTAS_PROC) += rtas-proc.o
58obj-$(CONFIG_LPARCFG) += lparcfg.o 58obj-$(CONFIG_LPARCFG) += lparcfg.o
59obj-$(CONFIG_IBMVIO) += vio.o 59obj-$(CONFIG_IBMVIO) += vio.o
60obj-$(CONFIG_IBMEBUS) += ibmebus.o 60obj-$(CONFIG_IBMEBUS) += ibmebus.o
61obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
62 eeh_driver.o eeh_event.o eeh_sysfs.o
61obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o 63obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o
62obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 64obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
63obj-$(CONFIG_FA_DUMP) += fadump.o 65obj-$(CONFIG_FA_DUMP) += fadump.o
@@ -100,7 +102,7 @@ obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o
100obj-$(CONFIG_STACKTRACE) += stacktrace.o 102obj-$(CONFIG_STACKTRACE) += stacktrace.o
101obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o 103obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o
102 104
103pci64-$(CONFIG_PPC64) += pci_dn.o isa-bridge.o 105pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o
104obj-$(CONFIG_PCI) += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \ 106obj-$(CONFIG_PCI) += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \
105 pci-common.o pci_of_scan.o 107 pci-common.o pci_of_scan.o
106obj-$(CONFIG_PCI_MSI) += msi.o 108obj-$(CONFIG_PCI_MSI) += msi.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 6f16ffafa6f0..c7e8afc2ead0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -105,9 +105,6 @@ int main(void)
105 DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid)); 105 DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid));
106#else /* CONFIG_PPC64 */ 106#else /* CONFIG_PPC64 */
107 DEFINE(PGDIR, offsetof(struct thread_struct, pgdir)); 107 DEFINE(PGDIR, offsetof(struct thread_struct, pgdir));
108#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
109 DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0));
110#endif
111#ifdef CONFIG_SPE 108#ifdef CONFIG_SPE
112 DEFINE(THREAD_EVR0, offsetof(struct thread_struct, evr[0])); 109 DEFINE(THREAD_EVR0, offsetof(struct thread_struct, evr[0]));
113 DEFINE(THREAD_ACC, offsetof(struct thread_struct, acc)); 110 DEFINE(THREAD_ACC, offsetof(struct thread_struct, acc));
@@ -115,6 +112,9 @@ int main(void)
115 DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe)); 112 DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe));
116#endif /* CONFIG_SPE */ 113#endif /* CONFIG_SPE */
117#endif /* CONFIG_PPC64 */ 114#endif /* CONFIG_PPC64 */
115#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
116 DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0));
117#endif
118#ifdef CONFIG_KVM_BOOK3S_32_HANDLER 118#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
119 DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu)); 119 DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
120#endif 120#endif
@@ -132,7 +132,6 @@ int main(void)
132 DEFINE(THREAD_SIER, offsetof(struct thread_struct, sier)); 132 DEFINE(THREAD_SIER, offsetof(struct thread_struct, sier));
133 DEFINE(THREAD_MMCR0, offsetof(struct thread_struct, mmcr0)); 133 DEFINE(THREAD_MMCR0, offsetof(struct thread_struct, mmcr0));
134 DEFINE(THREAD_MMCR2, offsetof(struct thread_struct, mmcr2)); 134 DEFINE(THREAD_MMCR2, offsetof(struct thread_struct, mmcr2));
135 DEFINE(THREAD_MMCRA, offsetof(struct thread_struct, mmcra));
136#endif 135#endif
137#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 136#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
138 DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch)); 137 DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch));
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 92c6b008dd2b..9262cf2bec4b 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -131,7 +131,8 @@ static const char *cache_type_string(const struct cache *cache)
131 return cache_type_info[cache->type].name; 131 return cache_type_info[cache->type].name;
132} 132}
133 133
134static void __cpuinit cache_init(struct cache *cache, int type, int level, struct device_node *ofnode) 134static void cache_init(struct cache *cache, int type, int level,
135 struct device_node *ofnode)
135{ 136{
136 cache->type = type; 137 cache->type = type;
137 cache->level = level; 138 cache->level = level;
@@ -140,7 +141,7 @@ static void __cpuinit cache_init(struct cache *cache, int type, int level, struc
140 list_add(&cache->list, &cache_list); 141 list_add(&cache->list, &cache_list);
141} 142}
142 143
143static struct cache *__cpuinit new_cache(int type, int level, struct device_node *ofnode) 144static struct cache *new_cache(int type, int level, struct device_node *ofnode)
144{ 145{
145 struct cache *cache; 146 struct cache *cache;
146 147
@@ -324,7 +325,8 @@ static bool cache_node_is_unified(const struct device_node *np)
324 return of_get_property(np, "cache-unified", NULL); 325 return of_get_property(np, "cache-unified", NULL);
325} 326}
326 327
327static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *node, int level) 328static struct cache *cache_do_one_devnode_unified(struct device_node *node,
329 int level)
328{ 330{
329 struct cache *cache; 331 struct cache *cache;
330 332
@@ -335,7 +337,8 @@ static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *
335 return cache; 337 return cache;
336} 338}
337 339
338static struct cache *__cpuinit cache_do_one_devnode_split(struct device_node *node, int level) 340static struct cache *cache_do_one_devnode_split(struct device_node *node,
341 int level)
339{ 342{
340 struct cache *dcache, *icache; 343 struct cache *dcache, *icache;
341 344
@@ -357,7 +360,7 @@ err:
357 return NULL; 360 return NULL;
358} 361}
359 362
360static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, int level) 363static struct cache *cache_do_one_devnode(struct device_node *node, int level)
361{ 364{
362 struct cache *cache; 365 struct cache *cache;
363 366
@@ -369,7 +372,8 @@ static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, in
369 return cache; 372 return cache;
370} 373}
371 374
372static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *node, int level) 375static struct cache *cache_lookup_or_instantiate(struct device_node *node,
376 int level)
373{ 377{
374 struct cache *cache; 378 struct cache *cache;
375 379
@@ -385,7 +389,7 @@ static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *n
385 return cache; 389 return cache;
386} 390}
387 391
388static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigger) 392static void link_cache_lists(struct cache *smaller, struct cache *bigger)
389{ 393{
390 while (smaller->next_local) { 394 while (smaller->next_local) {
391 if (smaller->next_local == bigger) 395 if (smaller->next_local == bigger)
@@ -396,13 +400,13 @@ static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigg
396 smaller->next_local = bigger; 400 smaller->next_local = bigger;
397} 401}
398 402
399static void __cpuinit do_subsidiary_caches_debugcheck(struct cache *cache) 403static void do_subsidiary_caches_debugcheck(struct cache *cache)
400{ 404{
401 WARN_ON_ONCE(cache->level != 1); 405 WARN_ON_ONCE(cache->level != 1);
402 WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu")); 406 WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu"));
403} 407}
404 408
405static void __cpuinit do_subsidiary_caches(struct cache *cache) 409static void do_subsidiary_caches(struct cache *cache)
406{ 410{
407 struct device_node *subcache_node; 411 struct device_node *subcache_node;
408 int level = cache->level; 412 int level = cache->level;
@@ -423,7 +427,7 @@ static void __cpuinit do_subsidiary_caches(struct cache *cache)
423 } 427 }
424} 428}
425 429
426static struct cache *__cpuinit cache_chain_instantiate(unsigned int cpu_id) 430static struct cache *cache_chain_instantiate(unsigned int cpu_id)
427{ 431{
428 struct device_node *cpu_node; 432 struct device_node *cpu_node;
429 struct cache *cpu_cache = NULL; 433 struct cache *cpu_cache = NULL;
@@ -448,7 +452,7 @@ out:
448 return cpu_cache; 452 return cpu_cache;
449} 453}
450 454
451static struct cache_dir *__cpuinit cacheinfo_create_cache_dir(unsigned int cpu_id) 455static struct cache_dir *cacheinfo_create_cache_dir(unsigned int cpu_id)
452{ 456{
453 struct cache_dir *cache_dir; 457 struct cache_dir *cache_dir;
454 struct device *dev; 458 struct device *dev;
@@ -653,7 +657,7 @@ static struct kobj_type cache_index_type = {
653 .default_attrs = cache_index_default_attrs, 657 .default_attrs = cache_index_default_attrs,
654}; 658};
655 659
656static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) 660static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
657{ 661{
658 const char *cache_name; 662 const char *cache_name;
659 const char *cache_type; 663 const char *cache_type;
@@ -696,7 +700,8 @@ static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *d
696 kfree(buf); 700 kfree(buf);
697} 701}
698 702
699static void __cpuinit cacheinfo_create_index_dir(struct cache *cache, int index, struct cache_dir *cache_dir) 703static void cacheinfo_create_index_dir(struct cache *cache, int index,
704 struct cache_dir *cache_dir)
700{ 705{
701 struct cache_index_dir *index_dir; 706 struct cache_index_dir *index_dir;
702 int rc; 707 int rc;
@@ -722,7 +727,8 @@ err:
722 kfree(index_dir); 727 kfree(index_dir);
723} 728}
724 729
725static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache *cache_list) 730static void cacheinfo_sysfs_populate(unsigned int cpu_id,
731 struct cache *cache_list)
726{ 732{
727 struct cache_dir *cache_dir; 733 struct cache_dir *cache_dir;
728 struct cache *cache; 734 struct cache *cache;
@@ -740,7 +746,7 @@ static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache
740 } 746 }
741} 747}
742 748
743void __cpuinit cacheinfo_cpu_online(unsigned int cpu_id) 749void cacheinfo_cpu_online(unsigned int cpu_id)
744{ 750{
745 struct cache *cache; 751 struct cache *cache;
746 752
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/kernel/eeh.c
index 6b73d6c44f51..39954fe941b8 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -103,11 +103,8 @@ EXPORT_SYMBOL(eeh_subsystem_enabled);
103 */ 103 */
104int eeh_probe_mode; 104int eeh_probe_mode;
105 105
106/* Global EEH mutex */
107DEFINE_MUTEX(eeh_mutex);
108
109/* Lock to avoid races due to multiple reports of an error */ 106/* Lock to avoid races due to multiple reports of an error */
110static DEFINE_RAW_SPINLOCK(confirm_error_lock); 107DEFINE_RAW_SPINLOCK(confirm_error_lock);
111 108
112/* Buffer for reporting pci register dumps. Its here in BSS, and 109/* Buffer for reporting pci register dumps. Its here in BSS, and
113 * not dynamically alloced, so that it ends up in RMO where RTAS 110 * not dynamically alloced, so that it ends up in RMO where RTAS
@@ -235,16 +232,30 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
235{ 232{
236 size_t loglen = 0; 233 size_t loglen = 0;
237 struct eeh_dev *edev; 234 struct eeh_dev *edev;
235 bool valid_cfg_log = true;
238 236
239 eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); 237 /*
240 eeh_ops->configure_bridge(pe); 238 * When the PHB is fenced or dead, it's pointless to collect
241 eeh_pe_restore_bars(pe); 239 * the data from PCI config space because it should return
242 240 * 0xFF's. For ER, we still retrieve the data from the PCI
243 pci_regs_buf[0] = 0; 241 * config space.
244 eeh_pe_for_each_dev(pe, edev) { 242 */
245 loglen += eeh_gather_pci_data(edev, pci_regs_buf, 243 if (eeh_probe_mode_dev() &&
246 EEH_PCI_REGS_LOG_LEN); 244 (pe->type & EEH_PE_PHB) &&
247 } 245 (pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)))
246 valid_cfg_log = false;
247
248 if (valid_cfg_log) {
249 eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
250 eeh_ops->configure_bridge(pe);
251 eeh_pe_restore_bars(pe);
252
253 pci_regs_buf[0] = 0;
254 eeh_pe_for_each_dev(pe, edev) {
255 loglen += eeh_gather_pci_data(edev, pci_regs_buf + loglen,
256 EEH_PCI_REGS_LOG_LEN - loglen);
257 }
258 }
248 259
249 eeh_ops->get_log(pe, severity, pci_regs_buf, loglen); 260 eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
250} 261}
@@ -260,15 +271,74 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
260{ 271{
261 pte_t *ptep; 272 pte_t *ptep;
262 unsigned long pa; 273 unsigned long pa;
274 int hugepage_shift;
263 275
264 ptep = find_linux_pte(init_mm.pgd, token); 276 /*
277 * We won't find hugepages here; this is iomem.
278 */
279 ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
265 if (!ptep) 280 if (!ptep)
266 return token; 281 return token;
282 WARN_ON(hugepage_shift);
267 pa = pte_pfn(*ptep) << PAGE_SHIFT; 283 pa = pte_pfn(*ptep) << PAGE_SHIFT;
268 284
269 return pa | (token & (PAGE_SIZE-1)); 285 return pa | (token & (PAGE_SIZE-1));
270} 286}
271 287
288/*
289 * On the PowerNV platform, the PHB might already be fenced.
290 * In that case, it's meaningless to recover a frozen PE;
291 * instead, we have to handle the fenced PHB first.
292 */
293static int eeh_phb_check_failure(struct eeh_pe *pe)
294{
295 struct eeh_pe *phb_pe;
296 unsigned long flags;
297 int ret;
298
299 if (!eeh_probe_mode_dev())
300 return -EPERM;
301
302 /* Find the PHB PE */
303 phb_pe = eeh_phb_pe_get(pe->phb);
304 if (!phb_pe) {
305 pr_warning("%s: Can't find PE for PHB#%d\n",
306 __func__, pe->phb->global_number);
307 return -EEXIST;
308 }
309
310 /* If the PHB has been in problematic state */
311 eeh_serialize_lock(&flags);
312 if (phb_pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)) {
313 ret = 0;
314 goto out;
315 }
316
317 /* Check PHB state */
318 ret = eeh_ops->get_state(phb_pe, NULL);
319 if ((ret < 0) ||
320 (ret == EEH_STATE_NOT_SUPPORT) ||
321 (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
322 (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
323 ret = 0;
324 goto out;
325 }
326
327 /* Isolate the PHB and send event */
328 eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
329 eeh_serialize_unlock(flags);
330 eeh_send_failure_event(phb_pe);
331
332 pr_err("EEH: PHB#%x failure detected\n",
333 phb_pe->phb->global_number);
334 dump_stack();
335
336 return 1;
337out:
338 eeh_serialize_unlock(flags);
339 return ret;
340}
341
272/** 342/**
273 * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze 343 * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
274 * @edev: eeh device 344 * @edev: eeh device
@@ -319,13 +389,21 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
319 return 0; 389 return 0;
320 } 390 }
321 391
392 /*
393 * On the PowerNV platform, the PHB might already be fenced,
394 * and we need to take care of that first.
395 */
396 ret = eeh_phb_check_failure(pe);
397 if (ret > 0)
398 return ret;
399
322 /* If we already have a pending isolation event for this 400 /* If we already have a pending isolation event for this
323 * slot, we know it's bad already, we don't need to check. 401 * slot, we know it's bad already, we don't need to check.
324 * Do this checking under a lock; as multiple PCI devices 402 * Do this checking under a lock; as multiple PCI devices
325 * in one slot might report errors simultaneously, and we 403 * in one slot might report errors simultaneously, and we
326 * only want one error recovery routine running. 404 * only want one error recovery routine running.
327 */ 405 */
328 raw_spin_lock_irqsave(&confirm_error_lock, flags); 406 eeh_serialize_lock(&flags);
329 rc = 1; 407 rc = 1;
330 if (pe->state & EEH_PE_ISOLATED) { 408 if (pe->state & EEH_PE_ISOLATED) {
331 pe->check_count++; 409 pe->check_count++;
@@ -368,13 +446,13 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
368 } 446 }
369 447
370 eeh_stats.slot_resets++; 448 eeh_stats.slot_resets++;
371 449
372 /* Avoid repeated reports of this failure, including problems 450 /* Avoid repeated reports of this failure, including problems
373 * with other functions on this device, and functions under 451 * with other functions on this device, and functions under
374 * bridges. 452 * bridges.
375 */ 453 */
376 eeh_pe_state_mark(pe, EEH_PE_ISOLATED); 454 eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
377 raw_spin_unlock_irqrestore(&confirm_error_lock, flags); 455 eeh_serialize_unlock(flags);
378 456
379 eeh_send_failure_event(pe); 457 eeh_send_failure_event(pe);
380 458
@@ -382,11 +460,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
382 * a stack trace will help the device-driver authors figure 460 * a stack trace will help the device-driver authors figure
383 * out what happened. So print that out. 461 * out what happened. So print that out.
384 */ 462 */
385 WARN(1, "EEH: failure detected\n"); 463 pr_err("EEH: Frozen PE#%x detected on PHB#%x\n",
464 pe->addr, pe->phb->global_number);
465 dump_stack();
466
386 return 1; 467 return 1;
387 468
388dn_unlock: 469dn_unlock:
389 raw_spin_unlock_irqrestore(&confirm_error_lock, flags); 470 eeh_serialize_unlock(flags);
390 return rc; 471 return rc;
391} 472}
392 473
@@ -525,7 +606,7 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
525 * or a fundamental reset (3). 606 * or a fundamental reset (3).
526 * A fundamental reset required by any device under 607 * A fundamental reset required by any device under
527 * Partitionable Endpoint trumps hot-reset. 608 * Partitionable Endpoint trumps hot-reset.
528 */ 609 */
529 eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset); 610 eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
530 611
531 if (freset) 612 if (freset)
@@ -538,8 +619,8 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
538 */ 619 */
539#define PCI_BUS_RST_HOLD_TIME_MSEC 250 620#define PCI_BUS_RST_HOLD_TIME_MSEC 250
540 msleep(PCI_BUS_RST_HOLD_TIME_MSEC); 621 msleep(PCI_BUS_RST_HOLD_TIME_MSEC);
541 622
542 /* We might get hit with another EEH freeze as soon as the 623 /* We might get hit with another EEH freeze as soon as the
543 * pci slot reset line is dropped. Make sure we don't miss 624 * pci slot reset line is dropped. Make sure we don't miss
544 * these, and clear the flag now. 625 * these, and clear the flag now.
545 */ 626 */
@@ -565,6 +646,7 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
565 */ 646 */
566int eeh_reset_pe(struct eeh_pe *pe) 647int eeh_reset_pe(struct eeh_pe *pe)
567{ 648{
649 int flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
568 int i, rc; 650 int i, rc;
569 651
570 /* Take three shots at resetting the bus */ 652 /* Take three shots at resetting the bus */
@@ -572,7 +654,7 @@ int eeh_reset_pe(struct eeh_pe *pe)
572 eeh_reset_pe_once(pe); 654 eeh_reset_pe_once(pe);
573 655
574 rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); 656 rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
575 if (rc == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) 657 if ((rc & flags) == flags)
576 return 0; 658 return 0;
577 659
578 if (rc < 0) { 660 if (rc < 0) {
@@ -604,7 +686,7 @@ void eeh_save_bars(struct eeh_dev *edev)
604 if (!edev) 686 if (!edev)
605 return; 687 return;
606 dn = eeh_dev_to_of_node(edev); 688 dn = eeh_dev_to_of_node(edev);
607 689
608 for (i = 0; i < 16; i++) 690 for (i = 0; i < 16; i++)
609 eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]); 691 eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]);
610} 692}
@@ -674,11 +756,21 @@ int __exit eeh_ops_unregister(const char *name)
674 * Even if force-off is set, the EEH hardware is still enabled, so that 756 * Even if force-off is set, the EEH hardware is still enabled, so that
675 * newer systems can boot. 757 * newer systems can boot.
676 */ 758 */
677static int __init eeh_init(void) 759int eeh_init(void)
678{ 760{
679 struct pci_controller *hose, *tmp; 761 struct pci_controller *hose, *tmp;
680 struct device_node *phb; 762 struct device_node *phb;
681 int ret; 763 static int cnt = 0;
764 int ret = 0;
765
766 /*
767 * On PowerNV we have to delay the initialization until after
768 * the PCI hierarchy tree has been built, because the PEs are
769 * figured out based on PCI devices instead of device tree
770 * nodes.
771 */
772 if (machine_is(powernv) && cnt++ <= 0)
773 return ret;
682 774
683 /* call platform initialization function */ 775 /* call platform initialization function */
684 if (!eeh_ops) { 776 if (!eeh_ops) {
@@ -691,7 +783,10 @@ static int __init eeh_init(void)
691 return ret; 783 return ret;
692 } 784 }
693 785
694 raw_spin_lock_init(&confirm_error_lock); 786 /* Initialize EEH event */
787 ret = eeh_event_init();
788 if (ret)
789 return ret;
695 790
696 /* Enable EEH for all adapters */ 791 /* Enable EEH for all adapters */
697 if (eeh_probe_mode_devtree()) { 792 if (eeh_probe_mode_devtree()) {
@@ -700,6 +795,25 @@ static int __init eeh_init(void)
700 phb = hose->dn; 795 phb = hose->dn;
701 traverse_pci_devices(phb, eeh_ops->of_probe, NULL); 796 traverse_pci_devices(phb, eeh_ops->of_probe, NULL);
702 } 797 }
798 } else if (eeh_probe_mode_dev()) {
799 list_for_each_entry_safe(hose, tmp,
800 &hose_list, list_node)
801 pci_walk_bus(hose->bus, eeh_ops->dev_probe, NULL);
802 } else {
803 pr_warning("%s: Invalid probe mode %d\n",
804 __func__, eeh_probe_mode);
805 return -EINVAL;
806 }
807
808 /*
809 * Call platform post-initialization. This is a good chance
810 * to inform the platform that EEH is ready to supply service
811 * once the I/O cache has been built up.
812 */
813 if (eeh_ops->post_init) {
814 ret = eeh_ops->post_init();
815 if (ret)
816 return ret;
703 } 817 }
704 818
705 if (eeh_subsystem_enabled) 819 if (eeh_subsystem_enabled)
@@ -728,6 +842,14 @@ static void eeh_add_device_early(struct device_node *dn)
728{ 842{
729 struct pci_controller *phb; 843 struct pci_controller *phb;
730 844
845 /*
846 * If we're doing the EEH probe based on PCI devices, we
847 * delay the probe until the late stage because the PCI
848 * device isn't available at this moment.
849 */
850 if (!eeh_probe_mode_devtree())
851 return;
852
731 if (!of_node_to_eeh_dev(dn)) 853 if (!of_node_to_eeh_dev(dn))
732 return; 854 return;
733 phb = of_node_to_eeh_dev(dn)->phb; 855 phb = of_node_to_eeh_dev(dn)->phb;
@@ -736,7 +858,6 @@ static void eeh_add_device_early(struct device_node *dn)
736 if (NULL == phb || 0 == phb->buid) 858 if (NULL == phb || 0 == phb->buid)
737 return; 859 return;
738 860
739 /* FIXME: hotplug support on POWERNV */
740 eeh_ops->of_probe(dn, NULL); 861 eeh_ops->of_probe(dn, NULL);
741} 862}
742 863
@@ -787,6 +908,13 @@ static void eeh_add_device_late(struct pci_dev *dev)
787 edev->pdev = dev; 908 edev->pdev = dev;
788 dev->dev.archdata.edev = edev; 909 dev->dev.archdata.edev = edev;
789 910
911 /*
912 * We have to do the EEH probe here because the PCI device
913 * didn't exist yet at the early stage.
914 */
915 if (eeh_probe_mode_dev())
916 eeh_ops->dev_probe(dev, NULL);
917
790 eeh_addr_cache_insert_dev(dev); 918 eeh_addr_cache_insert_dev(dev);
791} 919}
792 920
@@ -803,12 +931,12 @@ void eeh_add_device_tree_late(struct pci_bus *bus)
803 struct pci_dev *dev; 931 struct pci_dev *dev;
804 932
805 list_for_each_entry(dev, &bus->devices, bus_list) { 933 list_for_each_entry(dev, &bus->devices, bus_list) {
806 eeh_add_device_late(dev); 934 eeh_add_device_late(dev);
807 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { 935 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
808 struct pci_bus *subbus = dev->subordinate; 936 struct pci_bus *subbus = dev->subordinate;
809 if (subbus) 937 if (subbus)
810 eeh_add_device_tree_late(subbus); 938 eeh_add_device_tree_late(subbus);
811 } 939 }
812 } 940 }
813} 941}
814EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); 942EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
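
The eeh_slot_error_detail() hunk above does two things: it skips the config-space dump when a fenced or dead PHB would only return 0xFFs, and it fixes the register-log accumulation so each device appends at pci_regs_buf + loglen instead of overwriting offset 0. The stand-alone C sketch below models only the gating decision; the flag values and the eeh_pe_model struct are simplified stand-ins for the kernel's eeh_pe, not the real definitions.

/* cfg_log.c - build with: cc -Wall cfg_log.c */
#include <stdbool.h>
#include <stdio.h>

/* Assumed flag values, chosen for this sketch only. */
#define EEH_PE_PHB      0x0001
#define EEH_PE_ISOLATED 0x0010
#define EEH_PE_PHB_DEAD 0x0020

struct eeh_pe_model { int type; int state; };

/* Mirror of the new valid_cfg_log test: a fenced or dead PHB
 * returns all 0xFFs from config space, so dumping its registers
 * would only log garbage; for plain ER we still collect them. */
static bool want_cfg_log(const struct eeh_pe_model *pe, bool probe_mode_dev)
{
        if (probe_mode_dev &&
            (pe->type & EEH_PE_PHB) &&
            (pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)))
                return false;
        return true;
}

int main(void)
{
        struct eeh_pe_model fenced_phb = { EEH_PE_PHB, EEH_PE_ISOLATED };
        struct eeh_pe_model frozen_pe  = { 0, EEH_PE_ISOLATED };

        printf("fenced PHB -> collect log? %d\n", want_cfg_log(&fenced_phb, true));
        printf("frozen PE  -> collect log? %d\n", want_cfg_log(&frozen_pe, true));
        return 0;
}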
diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 5ce3ba7ad137..f9ac1232a746 100644
--- a/arch/powerpc/platforms/pseries/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -194,7 +194,7 @@ static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
194 } 194 }
195 195
196 /* Skip any devices for which EEH is not enabled. */ 196 /* Skip any devices for which EEH is not enabled. */
197 if (!edev->pe) { 197 if (!eeh_probe_mode_dev() && !edev->pe) {
198#ifdef DEBUG 198#ifdef DEBUG
199 pr_info("PCI: skip building address cache for=%s - %s\n", 199 pr_info("PCI: skip building address cache for=%s - %s\n",
200 pci_name(dev), dn->full_name); 200 pci_name(dev), dn->full_name);
@@ -285,7 +285,7 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
285 * Must be run late in boot process, after the pci controllers 285 * Must be run late in boot process, after the pci controllers
286 * have been scanned for devices (after all device resources are known). 286 * have been scanned for devices (after all device resources are known).
287 */ 287 */
288void __init eeh_addr_cache_build(void) 288void eeh_addr_cache_build(void)
289{ 289{
290 struct device_node *dn; 290 struct device_node *dn;
291 struct eeh_dev *edev; 291 struct eeh_dev *edev;
@@ -316,4 +316,3 @@ void __init eeh_addr_cache_build(void)
316 eeh_addr_cache_print(&pci_io_addr_cache_root); 316 eeh_addr_cache_print(&pci_io_addr_cache_root);
317#endif 317#endif
318} 318}
319
diff --git a/arch/powerpc/platforms/pseries/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index 1efa28f5fc54..1efa28f5fc54 100644
--- a/arch/powerpc/platforms/pseries/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index a3fefb61097c..2b1ce17cae50 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -154,9 +154,9 @@ static void eeh_enable_irq(struct pci_dev *dev)
154 * eeh_report_error - Report pci error to each device driver 154 * eeh_report_error - Report pci error to each device driver
155 * @data: eeh device 155 * @data: eeh device
156 * @userdata: return value 156 * @userdata: return value
157 * 157 *
158 * Report an EEH error to each device driver, collect up and 158 * Report an EEH error to each device driver, collect up and
159 * merge the device driver responses. Cumulative response 159 * merge the device driver responses. Cumulative response
160 * passed back in "userdata". 160 * passed back in "userdata".
161 */ 161 */
162static void *eeh_report_error(void *data, void *userdata) 162static void *eeh_report_error(void *data, void *userdata)
@@ -349,10 +349,12 @@ static void *eeh_report_failure(void *data, void *userdata)
349 */ 349 */
350static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) 350static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
351{ 351{
352 struct timeval tstamp;
352 int cnt, rc; 353 int cnt, rc;
353 354
354 /* pcibios will clear the counter; save the value */ 355 /* pcibios will clear the counter; save the value */
355 cnt = pe->freeze_count; 356 cnt = pe->freeze_count;
357 tstamp = pe->tstamp;
356 358
357 /* 359 /*
358 * We don't remove the corresponding PE instances because 360 * We don't remove the corresponding PE instances because
@@ -376,15 +378,17 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
376 eeh_pe_restore_bars(pe); 378 eeh_pe_restore_bars(pe);
377 379
378 /* Give the system 5 seconds to finish running the user-space 380 /* Give the system 5 seconds to finish running the user-space
379 * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, 381 * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
380 * this is a hack, but if we don't do this, and try to bring 382 * this is a hack, but if we don't do this, and try to bring
381 * the device up before the scripts have taken it down, 383 * the device up before the scripts have taken it down,
382 * potentially weird things happen. 384 * potentially weird things happen.
383 */ 385 */
384 if (bus) { 386 if (bus) {
385 ssleep(5); 387 ssleep(5);
386 pcibios_add_pci_devices(bus); 388 pcibios_add_pci_devices(bus);
387 } 389 }
390
391 pe->tstamp = tstamp;
388 pe->freeze_count = cnt; 392 pe->freeze_count = cnt;
389 393
390 return 0; 394 return 0;
@@ -395,24 +399,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
395 */ 399 */
396#define MAX_WAIT_FOR_RECOVERY 150 400#define MAX_WAIT_FOR_RECOVERY 150
397 401
398/** 402static void eeh_handle_normal_event(struct eeh_pe *pe)
399 * eeh_handle_event - Reset a PCI device after hard lockup.
400 * @pe: EEH PE
401 *
402 * While PHB detects address or data parity errors on particular PCI
403 * slot, the associated PE will be frozen. Besides, DMA's occurring
404 * to wild addresses (which usually happen due to bugs in device
405 * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
406 * #PERR or other misc PCI-related errors also can trigger EEH errors.
407 *
408 * Recovery process consists of unplugging the device driver (which
409 * generated hotplug events to userspace), then issuing a PCI #RST to
410 * the device, then reconfiguring the PCI config space for all bridges
411 * & devices under this slot, and then finally restarting the device
412 * drivers (which cause a second set of hotplug events to go out to
413 * userspace).
414 */
415void eeh_handle_event(struct eeh_pe *pe)
416{ 403{
417 struct pci_bus *frozen_bus; 404 struct pci_bus *frozen_bus;
418 int rc = 0; 405 int rc = 0;
@@ -425,6 +412,7 @@ void eeh_handle_event(struct eeh_pe *pe)
425 return; 412 return;
426 } 413 }
427 414
415 eeh_pe_update_time_stamp(pe);
428 pe->freeze_count++; 416 pe->freeze_count++;
429 if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) 417 if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
430 goto excess_failures; 418 goto excess_failures;
@@ -437,6 +425,7 @@ void eeh_handle_event(struct eeh_pe *pe)
437 * status ... if any child can't handle the reset, then the entire 425 * status ... if any child can't handle the reset, then the entire
438 * slot is dlpar removed and added. 426 * slot is dlpar removed and added.
439 */ 427 */
428 pr_info("EEH: Notify device drivers to shutdown\n");
440 eeh_pe_dev_traverse(pe, eeh_report_error, &result); 429 eeh_pe_dev_traverse(pe, eeh_report_error, &result);
441 430
442 /* Get the current PCI slot state. This can take a long time, 431 /* Get the current PCI slot state. This can take a long time,
@@ -444,7 +433,7 @@ void eeh_handle_event(struct eeh_pe *pe)
444 */ 433 */
445 rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); 434 rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
446 if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { 435 if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
447 printk(KERN_WARNING "EEH: Permanent failure\n"); 436 pr_warning("EEH: Permanent failure\n");
448 goto hard_fail; 437 goto hard_fail;
449 } 438 }
450 439
@@ -452,6 +441,7 @@ void eeh_handle_event(struct eeh_pe *pe)
452 * don't post the error log until after all dev drivers 441 * don't post the error log until after all dev drivers
453 * have been informed. 442 * have been informed.
454 */ 443 */
444 pr_info("EEH: Collect temporary log\n");
455 eeh_slot_error_detail(pe, EEH_LOG_TEMP); 445 eeh_slot_error_detail(pe, EEH_LOG_TEMP);
456 446
457 /* If all device drivers were EEH-unaware, then shut 447 /* If all device drivers were EEH-unaware, then shut
@@ -459,15 +449,18 @@ void eeh_handle_event(struct eeh_pe *pe)
459 * go down willingly, without panicing the system. 449 * go down willingly, without panicing the system.
460 */ 450 */
461 if (result == PCI_ERS_RESULT_NONE) { 451 if (result == PCI_ERS_RESULT_NONE) {
452 pr_info("EEH: Reset with hotplug activity\n");
462 rc = eeh_reset_device(pe, frozen_bus); 453 rc = eeh_reset_device(pe, frozen_bus);
463 if (rc) { 454 if (rc) {
464 printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc); 455 pr_warning("%s: Unable to reset, err=%d\n",
456 __func__, rc);
465 goto hard_fail; 457 goto hard_fail;
466 } 458 }
467 } 459 }
468 460
469 /* If all devices reported they can proceed, then re-enable MMIO */ 461 /* If all devices reported they can proceed, then re-enable MMIO */
470 if (result == PCI_ERS_RESULT_CAN_RECOVER) { 462 if (result == PCI_ERS_RESULT_CAN_RECOVER) {
463 pr_info("EEH: Enable I/O for affected devices\n");
471 rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); 464 rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
472 465
473 if (rc < 0) 466 if (rc < 0)
@@ -475,6 +468,7 @@ void eeh_handle_event(struct eeh_pe *pe)
475 if (rc) { 468 if (rc) {
476 result = PCI_ERS_RESULT_NEED_RESET; 469 result = PCI_ERS_RESULT_NEED_RESET;
477 } else { 470 } else {
471 pr_info("EEH: Notify device drivers to resume I/O\n");
478 result = PCI_ERS_RESULT_NONE; 472 result = PCI_ERS_RESULT_NONE;
479 eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result); 473 eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
480 } 474 }
@@ -482,6 +476,7 @@ void eeh_handle_event(struct eeh_pe *pe)
482 476
483 /* If all devices reported they can proceed, then re-enable DMA */ 477 /* If all devices reported they can proceed, then re-enable DMA */
484 if (result == PCI_ERS_RESULT_CAN_RECOVER) { 478 if (result == PCI_ERS_RESULT_CAN_RECOVER) {
479 pr_info("EEH: Enable DMA for affected devices\n");
485 rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); 480 rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
486 481
487 if (rc < 0) 482 if (rc < 0)
@@ -494,17 +489,22 @@ void eeh_handle_event(struct eeh_pe *pe)
494 489
495 /* If any device has a hard failure, then shut off everything. */ 490 /* If any device has a hard failure, then shut off everything. */
496 if (result == PCI_ERS_RESULT_DISCONNECT) { 491 if (result == PCI_ERS_RESULT_DISCONNECT) {
497 printk(KERN_WARNING "EEH: Device driver gave up\n"); 492 pr_warning("EEH: Device driver gave up\n");
498 goto hard_fail; 493 goto hard_fail;
499 } 494 }
500 495
501 /* If any device called out for a reset, then reset the slot */ 496 /* If any device called out for a reset, then reset the slot */
502 if (result == PCI_ERS_RESULT_NEED_RESET) { 497 if (result == PCI_ERS_RESULT_NEED_RESET) {
498 pr_info("EEH: Reset without hotplug activity\n");
503 rc = eeh_reset_device(pe, NULL); 499 rc = eeh_reset_device(pe, NULL);
504 if (rc) { 500 if (rc) {
505 printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc); 501 pr_warning("%s: Cannot reset, err=%d\n",
502 __func__, rc);
506 goto hard_fail; 503 goto hard_fail;
507 } 504 }
505
506 pr_info("EEH: Notify device drivers "
507 "of the completion of reset\n");
508 result = PCI_ERS_RESULT_NONE; 508 result = PCI_ERS_RESULT_NONE;
509 eeh_pe_dev_traverse(pe, eeh_report_reset, &result); 509 eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
510 } 510 }
@@ -512,15 +512,16 @@ void eeh_handle_event(struct eeh_pe *pe)
512 /* All devices should claim they have recovered by now. */ 512 /* All devices should claim they have recovered by now. */
513 if ((result != PCI_ERS_RESULT_RECOVERED) && 513 if ((result != PCI_ERS_RESULT_RECOVERED) &&
514 (result != PCI_ERS_RESULT_NONE)) { 514 (result != PCI_ERS_RESULT_NONE)) {
515 printk(KERN_WARNING "EEH: Not recovered\n"); 515 pr_warning("EEH: Not recovered\n");
516 goto hard_fail; 516 goto hard_fail;
517 } 517 }
518 518
519 /* Tell all device drivers that they can resume operations */ 519 /* Tell all device drivers that they can resume operations */
520 pr_info("EEH: Notify device driver to resume\n");
520 eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); 521 eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
521 522
522 return; 523 return;
523 524
524excess_failures: 525excess_failures:
525 /* 526 /*
526 * About 90% of all real-life EEH failures in the field 527 * About 90% of all real-life EEH failures in the field
@@ -550,3 +551,111 @@ perm_error:
550 pcibios_remove_pci_devices(frozen_bus); 551 pcibios_remove_pci_devices(frozen_bus);
551} 552}
552 553
554static void eeh_handle_special_event(void)
555{
556 struct eeh_pe *pe, *phb_pe;
557 struct pci_bus *bus;
558 struct pci_controller *hose, *tmp;
559 unsigned long flags;
560 int rc = 0;
561
562 /*
563 * The return values from next_error() are classified as follows.
564 * It might be good to enumerate them; however, next_error() is
565 * only supported on the PowerNV platform for now, so it's fine
566 * to use the integers directly:
567 *
568 * 4 - Dead IOC 3 - Dead PHB
569 * 2 - Fenced PHB 1 - Frozen PE
570 * 0 - No error found
571 *
572 */
573 rc = eeh_ops->next_error(&pe);
574 if (rc <= 0)
575 return;
576
577 switch (rc) {
578 case 4:
579 /* Mark all PHBs in dead state */
580 eeh_serialize_lock(&flags);
581 list_for_each_entry_safe(hose, tmp,
582 &hose_list, list_node) {
583 phb_pe = eeh_phb_pe_get(hose);
584 if (!phb_pe) continue;
585
586 eeh_pe_state_mark(phb_pe,
587 EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
588 }
589 eeh_serialize_unlock(flags);
590
591 /* Purge all events */
592 eeh_remove_event(NULL);
593 break;
594 case 3:
595 case 2:
596 case 1:
597 /* Mark the PE in fenced state */
598 eeh_serialize_lock(&flags);
599 if (rc == 3)
600 eeh_pe_state_mark(pe,
601 EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
602 else
603 eeh_pe_state_mark(pe,
604 EEH_PE_ISOLATED | EEH_PE_RECOVERING);
605 eeh_serialize_unlock(flags);
606
607 /* Purge all events of the PHB */
608 eeh_remove_event(pe);
609 break;
610 default:
611 pr_err("%s: Invalid value %d from next_error()\n",
612 __func__, rc);
613 return;
614 }
615
616 /*
617 * For fenced PHB and frozen PE, it's handled as normal
618 * event. We have to remove the affected PHBs for dead
619 * PHB and IOC
620 */
621 if (rc == 2 || rc == 1)
622 eeh_handle_normal_event(pe);
623 else {
624 list_for_each_entry_safe(hose, tmp,
625 &hose_list, list_node) {
626 phb_pe = eeh_phb_pe_get(hose);
627 if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
628 continue;
629
630 bus = eeh_pe_bus_get(phb_pe);
631 /* Notify all devices that they're about to go down. */
632 eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
633 pcibios_remove_pci_devices(bus);
634 }
635 }
636}
637
638/**
639 * eeh_handle_event - Reset a PCI device after hard lockup.
640 * @pe: EEH PE
641 *
642 * While PHB detects address or data parity errors on particular PCI
643 * slot, the associated PE will be frozen. Besides, DMA's occurring
644 * to wild addresses (which usually happen due to bugs in device
645 * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
646 * #PERR or other misc PCI-related errors also can trigger EEH errors.
647 *
648 * Recovery process consists of unplugging the device driver (which
649 * generated hotplug events to userspace), then issuing a PCI #RST to
650 * the device, then reconfiguring the PCI config space for all bridges
651 * & devices under this slot, and then finally restarting the device
652 * drivers (which cause a second set of hotplug events to go out to
653 * userspace).
654 */
655void eeh_handle_event(struct eeh_pe *pe)
656{
657 if (pe)
658 eeh_handle_normal_event(pe);
659 else
660 eeh_handle_special_event();
661}
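
The new eeh_handle_special_event() above dispatches on the raw integers returned by eeh_ops->next_error(). The sketch below restates that dispatch with named constants; the enum names are invented here for readability and are not part of the kernel API.

/* next_err.c - build with: cc -Wall -std=c99 next_err.c */
#include <stdio.h>

/* Hypothetical names for the 0..4 codes documented in the comment. */
enum next_err {
        ERR_NONE       = 0,
        ERR_FROZEN_PE  = 1,
        ERR_FENCED_PHB = 2,
        ERR_DEAD_PHB   = 3,
        ERR_DEAD_IOC   = 4,
};

static const char *dispatch(int rc)
{
        switch (rc) {
        case ERR_DEAD_IOC:
                return "mark every PHB isolated+dead, purge all events";
        case ERR_DEAD_PHB:
                return "mark PHB isolated+dead, purge its events, remove its devices";
        case ERR_FENCED_PHB:
        case ERR_FROZEN_PE:
                return "mark isolated/recovering, run normal recovery";
        default:
                return "no error found, nothing to do";
        }
}

int main(void)
{
        for (int rc = ERR_DEAD_IOC; rc >= ERR_NONE; rc--)
                printf("rc=%d: %s\n", rc, dispatch(rc));
        return 0;
}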
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 185bedd926df..d27c5afc90ae 100644
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -18,11 +18,10 @@
18 18
19#include <linux/delay.h> 19#include <linux/delay.h>
20#include <linux/list.h> 20#include <linux/list.h>
21#include <linux/mutex.h>
22#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/semaphore.h>
23#include <linux/pci.h> 23#include <linux/pci.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include <linux/kthread.h> 25#include <linux/kthread.h>
27#include <asm/eeh_event.h> 26#include <asm/eeh_event.h>
28#include <asm/ppc-pci.h> 27#include <asm/ppc-pci.h>
@@ -35,14 +34,9 @@
35 * work-queue, where a worker thread can drive recovery. 34 * work-queue, where a worker thread can drive recovery.
36 */ 35 */
37 36
38/* EEH event workqueue setup. */
39static DEFINE_SPINLOCK(eeh_eventlist_lock); 37static DEFINE_SPINLOCK(eeh_eventlist_lock);
38static struct semaphore eeh_eventlist_sem;
40LIST_HEAD(eeh_eventlist); 39LIST_HEAD(eeh_eventlist);
41static void eeh_thread_launcher(struct work_struct *);
42DECLARE_WORK(eeh_event_wq, eeh_thread_launcher);
43
44/* Serialize reset sequences for a given pci device */
45DEFINE_MUTEX(eeh_event_mutex);
46 40
47/** 41/**
48 * eeh_event_handler - Dispatch EEH events. 42 * eeh_event_handler - Dispatch EEH events.
@@ -60,55 +54,63 @@ static int eeh_event_handler(void * dummy)
60 struct eeh_event *event; 54 struct eeh_event *event;
61 struct eeh_pe *pe; 55 struct eeh_pe *pe;
62 56
63 spin_lock_irqsave(&eeh_eventlist_lock, flags); 57 while (!kthread_should_stop()) {
64 event = NULL; 58 if (down_interruptible(&eeh_eventlist_sem))
65 59 break;
66 /* Unqueue the event, get ready to process. */ 60
67 if (!list_empty(&eeh_eventlist)) { 61 /* Fetch EEH event from the queue */
68 event = list_entry(eeh_eventlist.next, struct eeh_event, list); 62 spin_lock_irqsave(&eeh_eventlist_lock, flags);
69 list_del(&event->list); 63 event = NULL;
70 } 64 if (!list_empty(&eeh_eventlist)) {
71 spin_unlock_irqrestore(&eeh_eventlist_lock, flags); 65 event = list_entry(eeh_eventlist.next,
72 66 struct eeh_event, list);
73 if (event == NULL) 67 list_del(&event->list);
74 return 0; 68 }
75 69 spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
76 /* Serialize processing of EEH events */ 70 if (!event)
77 mutex_lock(&eeh_event_mutex); 71 continue;
78 pe = event->pe; 72
79 eeh_pe_state_mark(pe, EEH_PE_RECOVERING); 73 /* We might have event without binding PE */
80 pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n", 74 pe = event->pe;
81 pe->phb->global_number, pe->addr); 75 if (pe) {
82 76 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
83 set_current_state(TASK_INTERRUPTIBLE); /* Don't add to load average */ 77 pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n",
84 eeh_handle_event(pe); 78 pe->phb->global_number, pe->addr);
85 eeh_pe_state_clear(pe, EEH_PE_RECOVERING); 79 eeh_handle_event(pe);
86 80 eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
87 kfree(event); 81 } else {
88 mutex_unlock(&eeh_event_mutex); 82 eeh_handle_event(NULL);
89 83 }
90 /* If there are no new errors after an hour, clear the counter. */ 84
91 if (pe && pe->freeze_count > 0) { 85 kfree(event);
92 msleep_interruptible(3600*1000);
93 if (pe->freeze_count > 0)
94 pe->freeze_count--;
95
96 } 86 }
97 87
98 return 0; 88 return 0;
99} 89}
100 90
101/** 91/**
102 * eeh_thread_launcher - Start kernel thread to handle EEH events 92 * eeh_event_init - Start kernel thread to handle EEH events
103 * @dummy - unused
104 * 93 *
105 * This routine is called to start the kernel thread for processing 94 * This routine is called to start the kernel thread for processing
106 * EEH event. 95 * EEH event.
107 */ 96 */
108static void eeh_thread_launcher(struct work_struct *dummy) 97int eeh_event_init(void)
109{ 98{
110 if (IS_ERR(kthread_run(eeh_event_handler, NULL, "eehd"))) 99 struct task_struct *t;
111 printk(KERN_ERR "Failed to start EEH daemon\n"); 100 int ret = 0;
101
102 /* Initialize semaphore */
103 sema_init(&eeh_eventlist_sem, 0);
104
105 t = kthread_run(eeh_event_handler, NULL, "eehd");
106 if (IS_ERR(t)) {
107 ret = PTR_ERR(t);
108 pr_err("%s: Failed to start EEH daemon (%d)\n",
109 __func__, ret);
110 return ret;
111 }
112
113 return 0;
112} 114}
113 115
114/** 116/**
@@ -136,7 +138,45 @@ int eeh_send_failure_event(struct eeh_pe *pe)
136 list_add(&event->list, &eeh_eventlist); 138 list_add(&event->list, &eeh_eventlist);
137 spin_unlock_irqrestore(&eeh_eventlist_lock, flags); 139 spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
138 140
139 schedule_work(&eeh_event_wq); 141 /* Kick the EEH daemon into action */
142 up(&eeh_eventlist_sem);
140 143
141 return 0; 144 return 0;
142} 145}
146
147/**
148 * eeh_remove_event - Remove EEH event from the queue
149 * @pe: Event binding to the PE
150 *
151 * On the PowerNV platform, subsequent events might be part
152 * of an earlier one. In that case, those subsequent events
153 * are complete duplicates and unnecessary, so they should
154 * be removed.
155 */
156void eeh_remove_event(struct eeh_pe *pe)
157{
158 unsigned long flags;
159 struct eeh_event *event, *tmp;
160
161 spin_lock_irqsave(&eeh_eventlist_lock, flags);
162 list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) {
163 /*
164 * If no valid PE is passed in, we already have an
165 * event corresponding to a dead IOC, so all events
166 * should be purged.
167 */
168 if (!pe) {
169 list_del(&event->list);
170 kfree(event);
171 } else if (pe->type & EEH_PE_PHB) {
172 if (event->pe && event->pe->phb == pe->phb) {
173 list_del(&event->list);
174 kfree(event);
175 }
176 } else if (event->pe == pe) {
177 list_del(&event->list);
178 kfree(event);
179 }
180 }
181 spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
182}
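
The eeh_event.c rework above replaces the one-shot workqueue launcher with a long-lived kthread that sleeps on a counting semaphore and drains the spinlock-protected event list. A rough user-space model of that producer/consumer shape, using POSIX primitives in place of the kernel's kthread, raw spinlock, and semaphore (a sketch of the pattern, not the kernel code):

/* event_loop.c - build with: cc -Wall -pthread event_loop.c */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <stdlib.h>

struct event {
        struct event *next;
        int id;                  /* id < 0 acts as a stop sentinel here */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static sem_t list_sem;           /* counts queued events, like eeh_eventlist_sem */
static struct event *head;

/* Producer side: queue an event, then wake the daemon. */
static void send_event(int id)
{
        struct event *ev = malloc(sizeof(*ev));
        if (!ev)
                return;
        ev->id = id;
        pthread_mutex_lock(&list_lock);
        ev->next = head;
        head = ev;
        pthread_mutex_unlock(&list_lock);
        sem_post(&list_sem);     /* the up(&eeh_eventlist_sem) analogue */
}

/* Consumer side: block on the semaphore, then pop one event. */
static void *daemon_fn(void *unused)
{
        (void)unused;
        for (;;) {
                sem_wait(&list_sem);
                pthread_mutex_lock(&list_lock);
                struct event *ev = head;
                if (ev)
                        head = ev->next;
                pthread_mutex_unlock(&list_lock);
                if (!ev)
                        continue;
                int stop = ev->id < 0;
                if (!stop)
                        printf("eehd: handling event %d\n", ev->id);
                free(ev);
                if (stop)
                        break;
        }
        return NULL;
}

int main(void)
{
        pthread_t t;
        sem_init(&list_sem, 0, 0);
        pthread_create(&t, NULL, daemon_fn, NULL);
        send_event(1);
        send_event(2);
        send_event(-1);          /* stop the daemon */
        pthread_join(t, NULL);
        return 0;
}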
diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 9d4a9e8562b2..016588a6f5ed 100644
--- a/arch/powerpc/platforms/pseries/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -22,6 +22,7 @@
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */ 23 */
24 24
25#include <linux/delay.h>
25#include <linux/export.h> 26#include <linux/export.h>
26#include <linux/gfp.h> 27#include <linux/gfp.h>
27#include <linux/init.h> 28#include <linux/init.h>
@@ -78,9 +79,7 @@ int eeh_phb_pe_create(struct pci_controller *phb)
78 } 79 }
79 80
80 /* Put it into the list */ 81 /* Put it into the list */
81 eeh_lock();
82 list_add_tail(&pe->child, &eeh_phb_pe); 82 list_add_tail(&pe->child, &eeh_phb_pe);
83 eeh_unlock();
84 83
85 pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number); 84 pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number);
86 85
@@ -95,7 +94,7 @@ int eeh_phb_pe_create(struct pci_controller *phb)
95 * hierarchy tree is composed of PHB PEs. The function is used 94 * hierarchy tree is composed of PHB PEs. The function is used
96 * to retrieve the corresponding PHB PE according to the given PHB. 95 * to retrieve the corresponding PHB PE according to the given PHB.
97 */ 96 */
98static struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb) 97struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
99{ 98{
100 struct eeh_pe *pe; 99 struct eeh_pe *pe;
101 100
@@ -185,21 +184,15 @@ void *eeh_pe_dev_traverse(struct eeh_pe *root,
185 return NULL; 184 return NULL;
186 } 185 }
187 186
188 eeh_lock();
189
190 /* Traverse root PE */ 187 /* Traverse root PE */
191 for (pe = root; pe; pe = eeh_pe_next(pe, root)) { 188 for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
192 eeh_pe_for_each_dev(pe, edev) { 189 eeh_pe_for_each_dev(pe, edev) {
193 ret = fn(edev, flag); 190 ret = fn(edev, flag);
194 if (ret) { 191 if (ret)
195 eeh_unlock();
196 return ret; 192 return ret;
197 }
198 } 193 }
199 } 194 }
200 195
201 eeh_unlock();
202
203 return NULL; 196 return NULL;
204} 197}
205 198
@@ -228,7 +221,7 @@ static void *__eeh_pe_get(void *data, void *flag)
228 return pe; 221 return pe;
229 222
230 /* Try BDF address */ 223 /* Try BDF address */
231 if (edev->pe_config_addr && 224 if (edev->config_addr &&
232 (edev->config_addr == pe->config_addr)) 225 (edev->config_addr == pe->config_addr))
233 return pe; 226 return pe;
234 227
@@ -246,7 +239,7 @@ static void *__eeh_pe_get(void *data, void *flag)
246 * which is composed of PCI bus/device/function number, or unified 239 * which is composed of PCI bus/device/function number, or unified
247 * PE address. 240 * PE address.
248 */ 241 */
249static struct eeh_pe *eeh_pe_get(struct eeh_dev *edev) 242struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
250{ 243{
251 struct eeh_pe *root = eeh_phb_pe_get(edev->phb); 244 struct eeh_pe *root = eeh_phb_pe_get(edev->phb);
252 struct eeh_pe *pe; 245 struct eeh_pe *pe;
@@ -305,8 +298,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
305{ 298{
306 struct eeh_pe *pe, *parent; 299 struct eeh_pe *pe, *parent;
307 300
308 eeh_lock();
309
310 /* 301 /*
311 * Search the PE has been existing or not according 302 * Search the PE has been existing or not according
312 * to the PE address. If that has been existing, the 303 * to the PE address. If that has been existing, the
@@ -316,7 +307,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
316 pe = eeh_pe_get(edev); 307 pe = eeh_pe_get(edev);
317 if (pe && !(pe->type & EEH_PE_INVALID)) { 308 if (pe && !(pe->type & EEH_PE_INVALID)) {
318 if (!edev->pe_config_addr) { 309 if (!edev->pe_config_addr) {
319 eeh_unlock();
320 pr_err("%s: PE with addr 0x%x already exists\n", 310 pr_err("%s: PE with addr 0x%x already exists\n",
321 __func__, edev->config_addr); 311 __func__, edev->config_addr);
322 return -EEXIST; 312 return -EEXIST;
@@ -328,7 +318,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
328 318
329 /* Put the edev to PE */ 319 /* Put the edev to PE */
330 list_add_tail(&edev->list, &pe->edevs); 320 list_add_tail(&edev->list, &pe->edevs);
331 eeh_unlock();
332 pr_debug("EEH: Add %s to Bus PE#%x\n", 321 pr_debug("EEH: Add %s to Bus PE#%x\n",
333 edev->dn->full_name, pe->addr); 322 edev->dn->full_name, pe->addr);
334 323
@@ -347,7 +336,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
347 parent->type &= ~EEH_PE_INVALID; 336 parent->type &= ~EEH_PE_INVALID;
348 parent = parent->parent; 337 parent = parent->parent;
349 } 338 }
350 eeh_unlock();
351 pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", 339 pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
352 edev->dn->full_name, pe->addr, pe->parent->addr); 340 edev->dn->full_name, pe->addr, pe->parent->addr);
353 341
@@ -357,7 +345,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
357 /* Create a new EEH PE */ 345 /* Create a new EEH PE */
358 pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE); 346 pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
359 if (!pe) { 347 if (!pe) {
360 eeh_unlock();
361 pr_err("%s: out of memory!\n", __func__); 348 pr_err("%s: out of memory!\n", __func__);
362 return -ENOMEM; 349 return -ENOMEM;
363 } 350 }
@@ -365,6 +352,17 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
365 pe->config_addr = edev->config_addr; 352 pe->config_addr = edev->config_addr;
366 353
367 /* 354 /*
355 * While doing a PE reset, we probably hot-reset the
356 * upstream bridge. However, the PCI devices, including
357 * the associated EEH devices, might be removed while the
358 * EEH core is doing recovery, so it isn't safe to retrieve
359 * the bridge through a downstream EEH device. We have to
360 * trace the parent PCI bus, then the upstream bridge.
361 */
362 if (eeh_probe_mode_dev())
363 pe->bus = eeh_dev_to_pci_dev(edev)->bus;
364
365 /*
368 * Put the new EEH PE into hierarchy tree. If the parent 366 * Put the new EEH PE into hierarchy tree. If the parent
369 * can't be found, the newly created PE will be attached 367 * can't be found, the newly created PE will be attached
370 * to PHB directly. Otherwise, we have to associate the 368 * to PHB directly. Otherwise, we have to associate the
@@ -374,7 +372,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
374 if (!parent) { 372 if (!parent) {
375 parent = eeh_phb_pe_get(edev->phb); 373 parent = eeh_phb_pe_get(edev->phb);
376 if (!parent) { 374 if (!parent) {
377 eeh_unlock();
378 pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", 375 pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
379 __func__, edev->phb->global_number); 376 __func__, edev->phb->global_number);
380 edev->pe = NULL; 377 edev->pe = NULL;
@@ -391,7 +388,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
391 list_add_tail(&pe->child, &parent->child_list); 388 list_add_tail(&pe->child, &parent->child_list);
392 list_add_tail(&edev->list, &pe->edevs); 389 list_add_tail(&edev->list, &pe->edevs);
393 edev->pe = pe; 390 edev->pe = pe;
394 eeh_unlock();
395 pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", 391 pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
396 edev->dn->full_name, pe->addr, pe->parent->addr); 392 edev->dn->full_name, pe->addr, pe->parent->addr);
397 393
@@ -419,8 +415,6 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
419 return -EEXIST; 415 return -EEXIST;
420 } 416 }
421 417
422 eeh_lock();
423
424 /* Remove the EEH device */ 418 /* Remove the EEH device */
425 pe = edev->pe; 419 pe = edev->pe;
426 edev->pe = NULL; 420 edev->pe = NULL;
@@ -465,12 +459,37 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
465 pe = parent; 459 pe = parent;
466 } 460 }
467 461
468 eeh_unlock();
469
470 return 0; 462 return 0;
471} 463}
472 464
473/** 465/**
466 * eeh_pe_update_time_stamp - Update PE's frozen time stamp
467 * @pe: EEH PE
468 *
469 * We keep a time stamp for each PE to trace when it last got
470 * frozen. The function should be called to update the time stamp
471 * on the first error of a specific PE. On the other hand, errors
472 * that happened more than an hour ago needn't be accounted for.
473 */
474void eeh_pe_update_time_stamp(struct eeh_pe *pe)
475{
476 struct timeval tstamp;
477
478 if (!pe) return;
479
480 if (pe->freeze_count <= 0) {
481 pe->freeze_count = 0;
482 do_gettimeofday(&pe->tstamp);
483 } else {
484 do_gettimeofday(&tstamp);
485 if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
486 pe->tstamp = tstamp;
487 pe->freeze_count = 0;
488 }
489 }
490}
491
492/**
474 * __eeh_pe_state_mark - Mark the state for the PE 493 * __eeh_pe_state_mark - Mark the state for the PE
475 * @data: EEH PE 494 * @data: EEH PE
476 * @flag: state 495 * @flag: state
@@ -512,9 +531,7 @@ static void *__eeh_pe_state_mark(void *data, void *flag)
512 */ 531 */
513void eeh_pe_state_mark(struct eeh_pe *pe, int state) 532void eeh_pe_state_mark(struct eeh_pe *pe, int state)
514{ 533{
515 eeh_lock();
516 eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); 534 eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
517 eeh_unlock();
518} 535}
519 536
520/** 537/**
@@ -548,35 +565,135 @@ static void *__eeh_pe_state_clear(void *data, void *flag)
548 */ 565 */
549void eeh_pe_state_clear(struct eeh_pe *pe, int state) 566void eeh_pe_state_clear(struct eeh_pe *pe, int state)
550{ 567{
551 eeh_lock();
552 eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); 568 eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
553 eeh_unlock();
554} 569}
555 570
556/** 571/*
557 * eeh_restore_one_device_bars - Restore the Base Address Registers for one device 572 * Some PCI bridges (e.g. PLX bridges) have primary/secondary
558 * @data: EEH device 573 * buses assigned explicitly by firmware, and we probably have
559 * @flag: Unused 574 * lost that after reset. So we have to delay the check until
575 * the PCI-CFG registers have been restored for the parent
576 * bridge.
560 * 577 *
561 * Loads the PCI configuration space base address registers, 578 * Don't use the normal PCI-CFG accessors, which have probably
562 * the expansion ROM base address, the latency timer, and etc. 579 * been blocked on the normal path at this stage. Instead, use
563 * from the saved values in the device node. 580 * the EEH operations, which are always permitted.
564 */ 581 */
565static void *eeh_restore_one_device_bars(void *data, void *flag) 582static void eeh_bridge_check_link(struct pci_dev *pdev,
583 struct device_node *dn)
584{
585 int cap;
586 uint32_t val;
587 int timeout = 0;
588
589 /*
590 * We only check root ports and downstream ports of
591 * PCIe switches.
592 */
593 if (!pci_is_pcie(pdev) ||
594 (pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT &&
595 pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM))
596 return;
597
598 pr_debug("%s: Check PCIe link for %s ...\n",
599 __func__, pci_name(pdev));
600
601 /* Check slot status */
602 cap = pdev->pcie_cap;
603 eeh_ops->read_config(dn, cap + PCI_EXP_SLTSTA, 2, &val);
604 if (!(val & PCI_EXP_SLTSTA_PDS)) {
605 pr_debug(" No card in the slot (0x%04x) !\n", val);
606 return;
607 }
608
609 /* Check power status if we have the capability */
610 eeh_ops->read_config(dn, cap + PCI_EXP_SLTCAP, 2, &val);
611 if (val & PCI_EXP_SLTCAP_PCP) {
612 eeh_ops->read_config(dn, cap + PCI_EXP_SLTCTL, 2, &val);
613 if (val & PCI_EXP_SLTCTL_PCC) {
614 pr_debug(" In power-off state, power it on ...\n");
615 val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
616 val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
617 eeh_ops->write_config(dn, cap + PCI_EXP_SLTCTL, 2, val);
618 msleep(2 * 1000);
619 }
620 }
621
622 /* Enable link */
623 eeh_ops->read_config(dn, cap + PCI_EXP_LNKCTL, 2, &val);
624 val &= ~PCI_EXP_LNKCTL_LD;
625 eeh_ops->write_config(dn, cap + PCI_EXP_LNKCTL, 2, val);
626
627 /* Check link */
628 eeh_ops->read_config(dn, cap + PCI_EXP_LNKCAP, 4, &val);
629 if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
630 pr_debug(" No link reporting capability (0x%08x) \n", val);
631 msleep(1000);
632 return;
633 }
634
635 /* Wait for the link to come up, with a 5s timeout */
636 timeout = 0;
637 while (timeout < 5000) {
638 msleep(20);
639 timeout += 20;
640
641 eeh_ops->read_config(dn, cap + PCI_EXP_LNKSTA, 2, &val);
642 if (val & PCI_EXP_LNKSTA_DLLLA)
643 break;
644 }
645
646 if (val & PCI_EXP_LNKSTA_DLLLA)
647 pr_debug(" Link up (%s)\n",
648 (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
649 else
650 pr_debug(" Link not ready (0x%04x)\n", val);
651}
652
653#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
654#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
655
656static void eeh_restore_bridge_bars(struct pci_dev *pdev,
657 struct eeh_dev *edev,
658 struct device_node *dn)
659{
660 int i;
661
662 /*
663 * Device BARs: 0x10 - 0x18
664 * Bus numbers and windows: 0x18 - 0x30
665 */
666 for (i = 4; i < 13; i++)
667 eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
668 /* Rom: 0x38 */
669 eeh_ops->write_config(dn, 14*4, 4, edev->config_space[14]);
670
671 /* Cache line & Latency timer: 0xC 0xD */
672 eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
673 SAVED_BYTE(PCI_CACHE_LINE_SIZE));
674 eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
675 SAVED_BYTE(PCI_LATENCY_TIMER));
676 /* Max latency, min grant, interrupt pin and line: 0x3C */
677 eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
678
679 /* PCI Command: 0x4 */
680 eeh_ops->write_config(dn, PCI_COMMAND, 4, edev->config_space[1]);
681
682 /* Check the PCIe link is ready */
683 eeh_bridge_check_link(pdev, dn);
684}
685
686static void eeh_restore_device_bars(struct eeh_dev *edev,
687 struct device_node *dn)
566{ 688{
567 int i; 689 int i;
568 u32 cmd; 690 u32 cmd;
569 struct eeh_dev *edev = (struct eeh_dev *)data;
570 struct device_node *dn = eeh_dev_to_of_node(edev);
571 691
572 for (i = 4; i < 10; i++) 692 for (i = 4; i < 10; i++)
573 eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]); 693 eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
574 /* 12 == Expansion ROM Address */ 694 /* 12 == Expansion ROM Address */
575 eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]); 695 eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]);
576 696
577#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
578#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
579
580 eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1, 697 eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
581 SAVED_BYTE(PCI_CACHE_LINE_SIZE)); 698 SAVED_BYTE(PCI_CACHE_LINE_SIZE));
582 eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1, 699 eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
@@ -599,6 +716,34 @@ static void *eeh_restore_one_device_bars(void *data, void *flag)
599 else 716 else
600 cmd &= ~PCI_COMMAND_SERR; 717 cmd &= ~PCI_COMMAND_SERR;
601 eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd); 718 eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd);
719}
720
721/**
722 * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
723 * @data: EEH device
724 * @flag: Unused
725 *
726 * Loads the PCI configuration space base address registers,
727 * the expansion ROM base address, the latency timer, etc.
728 * from the saved values in the device node.
729 */
730static void *eeh_restore_one_device_bars(void *data, void *flag)
731{
732 struct pci_dev *pdev = NULL;
733 struct eeh_dev *edev = (struct eeh_dev *)data;
734 struct device_node *dn = eeh_dev_to_of_node(edev);
735
736 /* Trace the PCI bridge */
737 if (eeh_probe_mode_dev()) {
738 pdev = eeh_dev_to_pci_dev(edev);
739 if (pdev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
740 pdev = NULL;
741 }
742
743 if (pdev)
744 eeh_restore_bridge_bars(pdev, edev, dn);
745 else
746 eeh_restore_device_bars(edev, dn);
602 747
603 return NULL; 748 return NULL;
604} 749}
@@ -635,19 +780,21 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
635 struct eeh_dev *edev; 780 struct eeh_dev *edev;
636 struct pci_dev *pdev; 781 struct pci_dev *pdev;
637 782
638 eeh_lock();
639
640 if (pe->type & EEH_PE_PHB) { 783 if (pe->type & EEH_PE_PHB) {
641 bus = pe->phb->bus; 784 bus = pe->phb->bus;
642 } else if (pe->type & EEH_PE_BUS || 785 } else if (pe->type & EEH_PE_BUS ||
643 pe->type & EEH_PE_DEVICE) { 786 pe->type & EEH_PE_DEVICE) {
787 if (pe->bus) {
788 bus = pe->bus;
789 goto out;
790 }
791
644 edev = list_first_entry(&pe->edevs, struct eeh_dev, list); 792 edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
645 pdev = eeh_dev_to_pci_dev(edev); 793 pdev = eeh_dev_to_pci_dev(edev);
646 if (pdev) 794 if (pdev)
647 bus = pdev->bus; 795 bus = pdev->bus;
648 } 796 }
649 797
650 eeh_unlock(); 798out:
651
652 return bus; 799 return bus;
653} 800}
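
The new eeh_pe_update_time_stamp() above gives each PE a sliding one-hour window for its freeze count: the stamp is taken on the first freeze, and the count restarts once the stamp is more than 3600 seconds old. A minimal model of that logic, with time_t standing in for the kernel's struct timeval and do_gettimeofday():

/* tstamp.c - build with: cc -Wall tstamp.c */
#include <stdio.h>
#include <time.h>

struct pe_model {
        int freeze_count;
        time_t tstamp;
};

/* Mirror of eeh_pe_update_time_stamp(): stamp the first freeze,
 * and restart the count if the last stamp is over an hour old. */
static void update_time_stamp(struct pe_model *pe, time_t now)
{
        if (pe->freeze_count <= 0) {
                pe->freeze_count = 0;
                pe->tstamp = now;
        } else if (now - pe->tstamp > 3600) {
                pe->tstamp = now;
                pe->freeze_count = 0;
        }
}

int main(void)
{
        struct pe_model pe = { 0, 0 };
        time_t t0 = time(NULL);

        update_time_stamp(&pe, t0);        /* first error: stamp taken */
        pe.freeze_count++;
        update_time_stamp(&pe, t0 + 60);   /* 1 min later: count kept */
        printf("after 1 min : count=%d\n", pe.freeze_count);
        update_time_stamp(&pe, t0 + 4000); /* >1 hour later: count reset */
        printf("after >1 hr : count=%d\n", pe.freeze_count);
        return 0;
}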
diff --git a/arch/powerpc/platforms/pseries/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c
index d37708360f2e..e7ae3484918c 100644
--- a/arch/powerpc/platforms/pseries/eeh_sysfs.c
+++ b/arch/powerpc/kernel/eeh_sysfs.c
@@ -72,4 +72,3 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev)
72 device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr); 72 device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
73 device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); 73 device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
74} 74}
75
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 8741c854e03d..ab15b8d057ad 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -629,21 +629,43 @@ _GLOBAL(ret_from_except_lite)
629 629
630 CURRENT_THREAD_INFO(r9, r1) 630 CURRENT_THREAD_INFO(r9, r1)
631 ld r3,_MSR(r1) 631 ld r3,_MSR(r1)
632#ifdef CONFIG_PPC_BOOK3E
633 ld r10,PACACURRENT(r13)
634#endif /* CONFIG_PPC_BOOK3E */
632 ld r4,TI_FLAGS(r9) 635 ld r4,TI_FLAGS(r9)
633 andi. r3,r3,MSR_PR 636 andi. r3,r3,MSR_PR
634 beq resume_kernel 637 beq resume_kernel
638#ifdef CONFIG_PPC_BOOK3E
639 lwz r3,(THREAD+THREAD_DBCR0)(r10)
640#endif /* CONFIG_PPC_BOOK3E */
635 641
636 /* Check current_thread_info()->flags */ 642 /* Check current_thread_info()->flags */
637 andi. r0,r4,_TIF_USER_WORK_MASK 643 andi. r0,r4,_TIF_USER_WORK_MASK
644#ifdef CONFIG_PPC_BOOK3E
645 bne 1f
646 /*
647 * Check to see if the dbcr0 register is set up to debug.
648 * Use the internal debug mode bit to do this.
649 */
650 andis. r0,r3,DBCR0_IDM@h
638 beq restore 651 beq restore
639 652 mfmsr r0
640 andi. r0,r4,_TIF_NEED_RESCHED 653 rlwinm r0,r0,0,~MSR_DE /* Clear MSR.DE */
641 beq 1f 654 mtmsr r0
655 mtspr SPRN_DBCR0,r3
656 li r10, -1
657 mtspr SPRN_DBSR,r10
658 b restore
659#else
660 beq restore
661#endif
6621: andi. r0,r4,_TIF_NEED_RESCHED
663 beq 2f
642 bl .restore_interrupts 664 bl .restore_interrupts
643 SCHEDULE_USER 665 SCHEDULE_USER
644 b .ret_from_except_lite 666 b .ret_from_except_lite
645 667
6461: bl .save_nvgprs 6682: bl .save_nvgprs
647 bl .restore_interrupts 669 bl .restore_interrupts
648 addi r3,r1,STACK_FRAME_OVERHEAD 670 addi r3,r1,STACK_FRAME_OVERHEAD
649 bl .do_notify_resume 671 bl .do_notify_resume
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 40e4a17c8ba0..4e00d223b2e3 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -341,10 +341,17 @@ vsx_unavailable_pSeries_1:
341 EXCEPTION_PROLOG_0(PACA_EXGEN) 341 EXCEPTION_PROLOG_0(PACA_EXGEN)
342 b vsx_unavailable_pSeries 342 b vsx_unavailable_pSeries
343 343
344facility_unavailable_trampoline:
344 . = 0xf60 345 . = 0xf60
345 SET_SCRATCH0(r13) 346 SET_SCRATCH0(r13)
346 EXCEPTION_PROLOG_0(PACA_EXGEN) 347 EXCEPTION_PROLOG_0(PACA_EXGEN)
347 b tm_unavailable_pSeries 348 b facility_unavailable_pSeries
349
350hv_facility_unavailable_trampoline:
351 . = 0xf80
352 SET_SCRATCH0(r13)
353 EXCEPTION_PROLOG_0(PACA_EXGEN)
354 b facility_unavailable_hv
348 355
349#ifdef CONFIG_CBE_RAS 356#ifdef CONFIG_CBE_RAS
350 STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error) 357 STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
@@ -522,8 +529,10 @@ denorm_done:
522 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20) 529 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
523 STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) 530 STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
524 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40) 531 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
525 STD_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable) 532 STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
526 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60) 533 KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60)
534 STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable)
535 KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82)
527 536
528/* 537/*
529 * An interrupt came in while soft-disabled. We set paca->irq_happened, then: 538 * An interrupt came in while soft-disabled. We set paca->irq_happened, then:
@@ -793,14 +802,10 @@ system_call_relon_pSeries:
793 STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step) 802 STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step)
794 803
795 . = 0x4e00 804 . = 0x4e00
796 SET_SCRATCH0(r13) 805 b . /* Can't happen, see v2.07 Book III-S section 6.5 */
797 EXCEPTION_PROLOG_0(PACA_EXGEN)
798 b h_data_storage_relon_hv
799 806
800 . = 0x4e20 807 . = 0x4e20
801 SET_SCRATCH0(r13) 808 b . /* Can't happen, see v2.07 Book III-S section 6.5 */
802 EXCEPTION_PROLOG_0(PACA_EXGEN)
803 b h_instr_storage_relon_hv
804 809
805 . = 0x4e40 810 . = 0x4e40
806 SET_SCRATCH0(r13) 811 SET_SCRATCH0(r13)
@@ -808,9 +813,7 @@ system_call_relon_pSeries:
808 b emulation_assist_relon_hv 813 b emulation_assist_relon_hv
809 814
810 . = 0x4e60 815 . = 0x4e60
811 SET_SCRATCH0(r13) 816 b . /* Can't happen, see v2.07 Book III-S section 6.5 */
812 EXCEPTION_PROLOG_0(PACA_EXGEN)
813 b hmi_exception_relon_hv
814 817
815 . = 0x4e80 818 . = 0x4e80
816 SET_SCRATCH0(r13) 819 SET_SCRATCH0(r13)
@@ -835,11 +838,17 @@ vsx_unavailable_relon_pSeries_1:
835 EXCEPTION_PROLOG_0(PACA_EXGEN) 838 EXCEPTION_PROLOG_0(PACA_EXGEN)
836 b vsx_unavailable_relon_pSeries 839 b vsx_unavailable_relon_pSeries
837 840
838tm_unavailable_relon_pSeries_1: 841facility_unavailable_relon_trampoline:
839 . = 0x4f60 842 . = 0x4f60
840 SET_SCRATCH0(r13) 843 SET_SCRATCH0(r13)
841 EXCEPTION_PROLOG_0(PACA_EXGEN) 844 EXCEPTION_PROLOG_0(PACA_EXGEN)
842 b tm_unavailable_relon_pSeries 845 b facility_unavailable_relon_pSeries
846
847hv_facility_unavailable_relon_trampoline:
848 . = 0x4f80
849 SET_SCRATCH0(r13)
850 EXCEPTION_PROLOG_0(PACA_EXGEN)
851 b facility_unavailable_relon_hv
843 852
844 STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint) 853 STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint)
845#ifdef CONFIG_PPC_DENORMALISATION 854#ifdef CONFIG_PPC_DENORMALISATION
@@ -1165,36 +1174,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
1165 bl .vsx_unavailable_exception 1174 bl .vsx_unavailable_exception
1166 b .ret_from_except 1175 b .ret_from_except
1167 1176
1168 .align 7 1177 STD_EXCEPTION_COMMON(0xf60, facility_unavailable, .facility_unavailable_exception)
1169 .globl tm_unavailable_common
1170tm_unavailable_common:
1171 EXCEPTION_PROLOG_COMMON(0xf60, PACA_EXGEN)
1172 bl .save_nvgprs
1173 DISABLE_INTS
1174 addi r3,r1,STACK_FRAME_OVERHEAD
1175 bl .tm_unavailable_exception
1176 b .ret_from_except
1177 1178
1178 .align 7 1179 .align 7
1179 .globl __end_handlers 1180 .globl __end_handlers
1180__end_handlers: 1181__end_handlers:
1181 1182
1182 /* Equivalents to the above handlers for relocation-on interrupt vectors */ 1183 /* Equivalents to the above handlers for relocation-on interrupt vectors */
1183 STD_RELON_EXCEPTION_HV_OOL(0xe00, h_data_storage)
1184 KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe00)
1185 STD_RELON_EXCEPTION_HV_OOL(0xe20, h_instr_storage)
1186 KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe20)
1187 STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) 1184 STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
1188 KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe40)
1189 STD_RELON_EXCEPTION_HV_OOL(0xe60, hmi_exception)
1190 KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe60)
1191 MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) 1185 MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
1192 KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe80)
1193 1186
1194 STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) 1187 STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
1195 STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) 1188 STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
1196 STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) 1189 STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
1197 STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable) 1190 STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
1191 STD_RELON_EXCEPTION_HV_OOL(0xf80, facility_unavailable)
1198 1192
1199#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) 1193#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
1200/* 1194/*
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index a949bdfc9623..f0b47d1a6b0e 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -176,7 +176,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
176 length_max = 512 ; /* 64 doublewords */ 176 length_max = 512 ; /* 64 doublewords */
177 /* DAWR region can't cross 512 boundary */ 177 /* DAWR region can't cross 512 boundary */
178 if ((bp->attr.bp_addr >> 10) != 178 if ((bp->attr.bp_addr >> 10) !=
179 ((bp->attr.bp_addr + bp->attr.bp_len) >> 10)) 179 ((bp->attr.bp_addr + bp->attr.bp_len - 1) >> 10))
180 return -EINVAL; 180 return -EINVAL;
181 } 181 }
182 if (info->len > 182 if (info->len >
@@ -250,6 +250,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
250 * we still need to single-step the instruction, but we don't 250 * we still need to single-step the instruction, but we don't
251 * generate an event. 251 * generate an event.
252 */ 252 */
253 info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ;
253 if (!((bp->attr.bp_addr <= dar) && 254 if (!((bp->attr.bp_addr <= dar) &&
254 (dar - bp->attr.bp_addr < bp->attr.bp_len))) 255 (dar - bp->attr.bp_addr < bp->attr.bp_len)))
255 info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; 256 info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
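
The DAWR hunk above fixes a classic inclusive-end off-by-one: the last byte a watchpoint covers is bp_addr + bp_len - 1, so a range ending exactly on a block boundary was wrongly rejected. A worked example with illustrative values:

	/* addr = 0x7f8, len = 8: watched bytes are 0x7f8..0x7ff */
	(0x7f8 >> 10) != ((0x7f8 + 8) >> 10);		/* old: 1 != 2, -EINVAL */
	(0x7f8 >> 10) != ((0x7f8 + 8 - 1) >> 10);	/* new: 1 != 1, accepted */

The second hunk is related hygiene: HW_BRK_TYPE_EXTRANEOUS_IRQ is cleared before the range test, so a flag left over from an earlier out-of-range hit cannot leak into the current event.
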
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 939ea7ef0dc8..d7216c9abda1 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -85,7 +85,7 @@ int powersave_nap;
85/* 85/*
86 * Register the sysctl to set/clear powersave_nap. 86 * Register the sysctl to set/clear powersave_nap.
87 */ 87 */
88static ctl_table powersave_nap_ctl_table[]={ 88static struct ctl_table powersave_nap_ctl_table[] = {
89 { 89 {
90 .procname = "powersave-nap", 90 .procname = "powersave-nap",
91 .data = &powersave_nap, 91 .data = &powersave_nap,
@@ -95,7 +95,7 @@ static ctl_table powersave_nap_ctl_table[]={
95 }, 95 },
96 {} 96 {}
97}; 97};
98static ctl_table powersave_nap_sysctl_root[] = { 98static struct ctl_table powersave_nap_sysctl_root[] = {
99 { 99 {
100 .procname = "kernel", 100 .procname = "kernel",
101 .mode = 0555, 101 .mode = 0555,
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 50e90b7e7139..fa0b54b2a362 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -55,6 +55,7 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr)
55 55
56struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) 56struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
57{ 57{
58 unsigned hugepage_shift;
58 struct iowa_bus *bus; 59 struct iowa_bus *bus;
59 int token; 60 int token;
60 61
@@ -70,11 +71,17 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
70 if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END) 71 if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
71 return NULL; 72 return NULL;
72 73
73 ptep = find_linux_pte(init_mm.pgd, vaddr); 74 ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
75 &hugepage_shift);
74 if (ptep == NULL) 76 if (ptep == NULL)
75 paddr = 0; 77 paddr = 0;
76 else 78 else {
79 /*
80 * we don't have hugepages backing iomem
81 */
82 WARN_ON(hugepage_shift);
77 paddr = pte_pfn(*ptep) << PAGE_SHIFT; 83 paddr = pte_pfn(*ptep) << PAGE_SHIFT;
84 }
78 bus = iowa_pci_find(vaddr, paddr); 85 bus = iowa_pci_find(vaddr, paddr);
79 86
80 if (bus == NULL) 87 if (bus == NULL)
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index c0d0dbddfba1..b20ff173a671 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -36,6 +36,8 @@
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/fault-inject.h> 37#include <linux/fault-inject.h>
38#include <linux/pci.h> 38#include <linux/pci.h>
39#include <linux/iommu.h>
40#include <linux/sched.h>
39#include <asm/io.h> 41#include <asm/io.h>
40#include <asm/prom.h> 42#include <asm/prom.h>
41#include <asm/iommu.h> 43#include <asm/iommu.h>
@@ -44,6 +46,7 @@
44#include <asm/kdump.h> 46#include <asm/kdump.h>
45#include <asm/fadump.h> 47#include <asm/fadump.h>
46#include <asm/vio.h> 48#include <asm/vio.h>
49#include <asm/tce.h>
47 50
48#define DBG(...) 51#define DBG(...)
49 52
@@ -724,6 +727,13 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
724 if (tbl->it_offset == 0) 727 if (tbl->it_offset == 0)
725 clear_bit(0, tbl->it_map); 728 clear_bit(0, tbl->it_map);
726 729
730#ifdef CONFIG_IOMMU_API
731 if (tbl->it_group) {
732 iommu_group_put(tbl->it_group);
733 BUG_ON(tbl->it_group);
734 }
735#endif
736
727 /* verify that table contains no entries */ 737 /* verify that table contains no entries */
728 if (!bitmap_empty(tbl->it_map, tbl->it_size)) 738 if (!bitmap_empty(tbl->it_map, tbl->it_size))
729 pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name); 739 pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
@@ -860,3 +870,316 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
860 free_pages((unsigned long)vaddr, get_order(size)); 870 free_pages((unsigned long)vaddr, get_order(size));
861 } 871 }
862} 872}
873
874#ifdef CONFIG_IOMMU_API
875/*
876 * SPAPR TCE API
877 */
878static void group_release(void *iommu_data)
879{
880 struct iommu_table *tbl = iommu_data;
881 tbl->it_group = NULL;
882}
883
884void iommu_register_group(struct iommu_table *tbl,
885 int pci_domain_number, unsigned long pe_num)
886{
887 struct iommu_group *grp;
888 char *name;
889
890 grp = iommu_group_alloc();
891 if (IS_ERR(grp)) {
892 pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
893 PTR_ERR(grp));
894 return;
895 }
896 tbl->it_group = grp;
897 iommu_group_set_iommudata(grp, tbl, group_release);
898 name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
899 pci_domain_number, pe_num);
900 if (!name)
901 return;
902 iommu_group_set_name(grp, name);
903 kfree(name);
904}
905
906enum dma_data_direction iommu_tce_direction(unsigned long tce)
907{
908 if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
909 return DMA_BIDIRECTIONAL;
910 else if (tce & TCE_PCI_READ)
911 return DMA_TO_DEVICE;
912 else if (tce & TCE_PCI_WRITE)
913 return DMA_FROM_DEVICE;
914 else
915 return DMA_NONE;
916}
917EXPORT_SYMBOL_GPL(iommu_tce_direction);
918
919void iommu_flush_tce(struct iommu_table *tbl)
920{
921 /* Flush/invalidate TLB caches if necessary */
922 if (ppc_md.tce_flush)
923 ppc_md.tce_flush(tbl);
924
925 /* Make sure updates are seen by hardware */
926 mb();
927}
928EXPORT_SYMBOL_GPL(iommu_flush_tce);
929
930int iommu_tce_clear_param_check(struct iommu_table *tbl,
931 unsigned long ioba, unsigned long tce_value,
932 unsigned long npages)
933{
934 /* ppc_md.tce_free() does not support any value but 0 */
935 if (tce_value)
936 return -EINVAL;
937
938 if (ioba & ~IOMMU_PAGE_MASK)
939 return -EINVAL;
940
941 ioba >>= IOMMU_PAGE_SHIFT;
942 if (ioba < tbl->it_offset)
943 return -EINVAL;
944
945 if ((ioba + npages) > (tbl->it_offset + tbl->it_size))
946 return -EINVAL;
947
948 return 0;
949}
950EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
951
952int iommu_tce_put_param_check(struct iommu_table *tbl,
953 unsigned long ioba, unsigned long tce)
954{
955 if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
956 return -EINVAL;
957
958 if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ))
959 return -EINVAL;
960
961 if (ioba & ~IOMMU_PAGE_MASK)
962 return -EINVAL;
963
964 ioba >>= IOMMU_PAGE_SHIFT;
965 if (ioba < tbl->it_offset)
966 return -EINVAL;
967
968 if ((ioba + 1) > (tbl->it_offset + tbl->it_size))
969 return -EINVAL;
970
971 return 0;
972}
973EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
974
975unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
976{
977 unsigned long oldtce;
978 struct iommu_pool *pool = get_pool(tbl, entry);
979
980 spin_lock(&(pool->lock));
981
982 oldtce = ppc_md.tce_get(tbl, entry);
983 if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
984 ppc_md.tce_free(tbl, entry, 1);
985 else
986 oldtce = 0;
987
988 spin_unlock(&(pool->lock));
989
990 return oldtce;
991}
992EXPORT_SYMBOL_GPL(iommu_clear_tce);
993
994int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
995 unsigned long entry, unsigned long pages)
996{
997 unsigned long oldtce;
998 struct page *page;
999
1000 for ( ; pages; --pages, ++entry) {
1001 oldtce = iommu_clear_tce(tbl, entry);
1002 if (!oldtce)
1003 continue;
1004
1005 page = pfn_to_page(oldtce >> PAGE_SHIFT);
1006 WARN_ON(!page);
1007 if (page) {
1008 if (oldtce & TCE_PCI_WRITE)
1009 SetPageDirty(page);
1010 put_page(page);
1011 }
1012 }
1013
1014 return 0;
1015}
1016EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
1017
1018/*
1019 * hwaddr is a kernel virtual address here (0xc... bazillion),
1020 * tce_build converts it to a physical address.
1021 */
1022int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
1023 unsigned long hwaddr, enum dma_data_direction direction)
1024{
1025 int ret = -EBUSY;
1026 unsigned long oldtce;
1027 struct iommu_pool *pool = get_pool(tbl, entry);
1028
1029 spin_lock(&(pool->lock));
1030
1031 oldtce = ppc_md.tce_get(tbl, entry);
1032 /* Add new entry if it is not busy */
1033 if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
1034 ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL);
1035
1036 spin_unlock(&(pool->lock));
1037
1038 /* if (unlikely(ret))
1039 pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
1040 __func__, hwaddr, entry << IOMMU_PAGE_SHIFT,
1041 hwaddr, ret); */
1042
1043 return ret;
1044}
1045EXPORT_SYMBOL_GPL(iommu_tce_build);
1046
1047int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
1048 unsigned long tce)
1049{
1050 int ret;
1051 struct page *page = NULL;
1052 unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
1053 enum dma_data_direction direction = iommu_tce_direction(tce);
1054
1055 ret = get_user_pages_fast(tce & PAGE_MASK, 1,
1056 direction != DMA_TO_DEVICE, &page);
1057 if (unlikely(ret != 1)) {
1058 /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
1059 tce, entry << IOMMU_PAGE_SHIFT, ret); */
1060 return -EFAULT;
1061 }
1062 hwaddr = (unsigned long) page_address(page) + offset;
1063
1064 ret = iommu_tce_build(tbl, entry, hwaddr, direction);
1065 if (ret)
1066 put_page(page);
1067
1068 if (ret < 0)
1069 pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
1070 __func__, entry << IOMMU_PAGE_SHIFT, tce, ret);
1071
1072 return ret;
1073}
1074EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
1075
1076int iommu_take_ownership(struct iommu_table *tbl)
1077{
1078 unsigned long sz = (tbl->it_size + 7) >> 3;
1079
1080 if (tbl->it_offset == 0)
1081 clear_bit(0, tbl->it_map);
1082
1083 if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
1084 pr_err("iommu_tce: it_map is not empty");
1085 return -EBUSY;
1086 }
1087
1088 memset(tbl->it_map, 0xff, sz);
1089 iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
1090
1091 return 0;
1092}
1093EXPORT_SYMBOL_GPL(iommu_take_ownership);
1094
1095void iommu_release_ownership(struct iommu_table *tbl)
1096{
1097 unsigned long sz = (tbl->it_size + 7) >> 3;
1098
1099 iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
1100 memset(tbl->it_map, 0, sz);
1101
1102 /* Restore bit#0 set by iommu_init_table() */
1103 if (tbl->it_offset == 0)
1104 set_bit(0, tbl->it_map);
1105}
1106EXPORT_SYMBOL_GPL(iommu_release_ownership);
1107
1108static int iommu_add_device(struct device *dev)
1109{
1110 struct iommu_table *tbl;
1111 int ret = 0;
1112
1113 if (WARN_ON(dev->iommu_group)) {
1114 pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n",
1115 dev_name(dev),
1116 iommu_group_id(dev->iommu_group));
1117 return -EBUSY;
1118 }
1119
1120 tbl = get_iommu_table_base(dev);
1121 if (!tbl || !tbl->it_group) {
1122 pr_debug("iommu_tce: skipping device %s with no tbl\n",
1123 dev_name(dev));
1124 return 0;
1125 }
1126
1127 pr_debug("iommu_tce: adding %s to iommu group %d\n",
1128 dev_name(dev), iommu_group_id(tbl->it_group));
1129
1130 ret = iommu_group_add_device(tbl->it_group, dev);
1131 if (ret < 0)
1132 pr_err("iommu_tce: %s has not been added, ret=%d\n",
1133 dev_name(dev), ret);
1134
1135 return ret;
1136}
1137
1138static void iommu_del_device(struct device *dev)
1139{
1140 iommu_group_remove_device(dev);
1141}
1142
1143static int iommu_bus_notifier(struct notifier_block *nb,
1144 unsigned long action, void *data)
1145{
1146 struct device *dev = data;
1147
1148 switch (action) {
1149 case BUS_NOTIFY_ADD_DEVICE:
1150 return iommu_add_device(dev);
1151 case BUS_NOTIFY_DEL_DEVICE:
1152 iommu_del_device(dev);
1153 return 0;
1154 default:
1155 return 0;
1156 }
1157}
1158
1159static struct notifier_block tce_iommu_bus_nb = {
1160 .notifier_call = iommu_bus_notifier,
1161};
1162
1163static int __init tce_iommu_init(void)
1164{
1165 struct pci_dev *pdev = NULL;
1166
1167 BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
1168
1169 for_each_pci_dev(pdev)
1170 iommu_add_device(&pdev->dev);
1171
1172 bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
1173 return 0;
1174}
1175
1176subsys_initcall_sync(tce_iommu_init);
1177
1178#else
1179
1180void iommu_register_group(struct iommu_table *tbl,
1181 int pci_domain_number, unsigned long pe_num)
1182{
1183}
1184
1185#endif /* CONFIG_IOMMU_API */
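
On the new TCE API above: a TCE entry carries two permission bits, and iommu_tce_direction() maps them onto the generic DMA direction enum, while the *_param_check() helpers validate the I/O bus address against the table geometry before anything touches hardware. A compact summary in C comments:

	/* TCE permission bits -> dma_data_direction:
	 *
	 *   TCE_PCI_READ  TCE_PCI_WRITE   result
	 *        0              0         DMA_NONE
	 *        1              0         DMA_TO_DEVICE    (device reads RAM)
	 *        0              1         DMA_FROM_DEVICE  (device writes RAM)
	 *        1              1         DMA_BIDIRECTIONAL
	 *
	 * Bounds check, in IOMMU-page units:
	 *   tbl->it_offset <= (ioba >> IOMMU_PAGE_SHIFT)
	 *                   < tbl->it_offset + tbl->it_size
	 */
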
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ea185e0b3cae..2e51cde616d2 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -116,8 +116,6 @@ static inline notrace int decrementer_check_overflow(void)
116 u64 now = get_tb_or_rtc(); 116 u64 now = get_tb_or_rtc();
117 u64 *next_tb = &__get_cpu_var(decrementers_next_tb); 117 u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
118 118
119 if (now >= *next_tb)
120 set_dec(1);
121 return now >= *next_tb; 119 return now >= *next_tb;
122} 120}
123 121
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 11f5b03a0b06..2156ea90eb54 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -36,12 +36,6 @@
36#include <asm/sstep.h> 36#include <asm/sstep.h>
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39#ifdef CONFIG_PPC_ADV_DEBUG_REGS
40#define MSR_SINGLESTEP (MSR_DE)
41#else
42#define MSR_SINGLESTEP (MSR_SE)
43#endif
44
45DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 39DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
46DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); 40DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
47 41
@@ -104,19 +98,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
104 98
105static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) 99static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
106{ 100{
107 /* We turn off async exceptions to ensure that the single step will 101 enable_single_step(regs);
108 * be for the instruction we have the kprobe on, if we dont its
109 * possible we'd get the single step reported for an exception handler
110 * like Decrementer or External Interrupt */
111 regs->msr &= ~MSR_EE;
112 regs->msr |= MSR_SINGLESTEP;
113#ifdef CONFIG_PPC_ADV_DEBUG_REGS
114 regs->msr &= ~MSR_CE;
115 mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
116#ifdef CONFIG_PPC_47x
117 isync();
118#endif
119#endif
120 102
121 /* 103 /*
122 * On powerpc we should single step on the original 104 * On powerpc we should single step on the original
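
The open-coded MSR fiddling deleted above is consolidated into enable_single_step() (the helper lands in asm/probes.h, per the diffstat). Reconstructed purely from the deleted lines, it presumably looks something like this sketch:

	/* Hedged reconstruction of enable_single_step(); the real helper is
	 * in asm/probes.h. Async exceptions are masked so the single step
	 * is reported for the probed instruction, not an interrupt handler.
	 */
	static inline void enable_single_step(struct pt_regs *regs)
	{
		regs->msr &= ~MSR_EE;
		regs->msr |= MSR_SINGLESTEP;	/* MSR_DE or MSR_SE per CPU */
	#ifdef CONFIG_PPC_ADV_DEBUG_REGS
		regs->msr &= ~MSR_CE;
		mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
	#endif
	}
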
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 48fbc2b97e95..8213ee1eb05a 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -84,22 +84,30 @@ static ssize_t dev_nvram_read(struct file *file, char __user *buf,
84 char *tmp = NULL; 84 char *tmp = NULL;
85 ssize_t size; 85 ssize_t size;
86 86
87 ret = -ENODEV; 87 if (!ppc_md.nvram_size) {
88 if (!ppc_md.nvram_size) 88 ret = -ENODEV;
89 goto out; 89 goto out;
90 }
90 91
91 ret = 0;
92 size = ppc_md.nvram_size(); 92 size = ppc_md.nvram_size();
93 if (*ppos >= size || size < 0) 93 if (size < 0) {
94 ret = size;
95 goto out;
96 }
97
98 if (*ppos >= size) {
99 ret = 0;
94 goto out; 100 goto out;
101 }
95 102
96 count = min_t(size_t, count, size - *ppos); 103 count = min_t(size_t, count, size - *ppos);
97 count = min(count, PAGE_SIZE); 104 count = min(count, PAGE_SIZE);
98 105
99 ret = -ENOMEM;
100 tmp = kmalloc(count, GFP_KERNEL); 106 tmp = kmalloc(count, GFP_KERNEL);
101 if (!tmp) 107 if (!tmp) {
108 ret = -ENOMEM;
102 goto out; 109 goto out;
110 }
103 111
104 ret = ppc_md.nvram_read(tmp, count, ppos); 112 ret = ppc_md.nvram_read(tmp, count, ppos);
105 if (ret <= 0) 113 if (ret <= 0)
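
The dev_nvram_read() rework above untangles error paths the old code conflated; in particular, a negative nvram_size() used to fall into the same `ret = 0` exit as a read past the end. The resulting exit map:

	/* dev_nvram_read() outcomes after this hunk:
	 *   ppc_md.nvram_size hook missing  -> -ENODEV
	 *   nvram_size() returns < 0        -> that error (was silently 0)
	 *   *ppos >= size                   -> 0, genuine EOF
	 *   kmalloc() failure               -> -ENOMEM
	 */
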
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
new file mode 100644
index 000000000000..3f608800c06b
--- /dev/null
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -0,0 +1,111 @@
1/*
2 * Derived from "arch/powerpc/platforms/pseries/pci_dlpar.c"
3 *
4 * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com>
5 * Copyright (C) 2005 International Business Machines
6 *
7 * Updates, 2005, John Rose <johnrose@austin.ibm.com>
8 * Updates, 2005, Linas Vepstas <linas@austin.ibm.com>
9 * Updates, 2013, Gavin Shan <shangw@linux.vnet.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 */
16
17#include <linux/pci.h>
18#include <linux/export.h>
19#include <asm/pci-bridge.h>
20#include <asm/ppc-pci.h>
21#include <asm/firmware.h>
22#include <asm/eeh.h>
23
24/**
25 * __pcibios_remove_pci_devices - remove all devices under this bus
26 * @bus: the indicated PCI bus
27 * @purge_pe: destroy the PE on removal of PCI devices
28 *
29 * Remove all of the PCI devices under this bus both from the
30 * linux pci device tree, and from the powerpc EEH address cache.
 31 * By default, the corresponding PE will be destroyed during the
 32 * normal PCI hotplug path. For PCI hotplug during EEH recovery,
 33 * the corresponding PE won't be destroyed and deallocated.
34 */
35void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe)
36{
37 struct pci_dev *dev, *tmp;
38 struct pci_bus *child_bus;
39
40 /* First go down child busses */
41 list_for_each_entry(child_bus, &bus->children, node)
42 __pcibios_remove_pci_devices(child_bus, purge_pe);
43
44 pr_debug("PCI: Removing devices on bus %04x:%02x\n",
45 pci_domain_nr(bus), bus->number);
46 list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
47 pr_debug(" * Removing %s...\n", pci_name(dev));
48 eeh_remove_bus_device(dev, purge_pe);
49 pci_stop_and_remove_bus_device(dev);
50 }
51}
52
53/**
54 * pcibios_remove_pci_devices - remove all devices under this bus
55 * @bus: the indicated PCI bus
56 *
57 * Remove all of the PCI devices under this bus both from the
58 * linux pci device tree, and from the powerpc EEH address cache.
59 */
60void pcibios_remove_pci_devices(struct pci_bus *bus)
61{
62 __pcibios_remove_pci_devices(bus, 1);
63}
64EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
65
66/**
67 * pcibios_add_pci_devices - adds new pci devices to bus
68 * @bus: the indicated PCI bus
69 *
 70 * This routine will find and fix up new pci devices under
71 * the indicated bus. This routine presumes that there
72 * might already be some devices under this bridge, so
73 * it carefully tries to add only new devices. (And that
74 * is how this routine differs from other, similar pcibios
75 * routines.)
76 */
77void pcibios_add_pci_devices(struct pci_bus * bus)
78{
79 int slotno, num, mode, pass, max;
80 struct pci_dev *dev;
81 struct device_node *dn = pci_bus_to_OF_node(bus);
82
83 eeh_add_device_tree_early(dn);
84
85 mode = PCI_PROBE_NORMAL;
86 if (ppc_md.pci_probe_mode)
87 mode = ppc_md.pci_probe_mode(bus);
88
89 if (mode == PCI_PROBE_DEVTREE) {
90 /* use ofdt-based probe */
91 of_rescan_bus(dn, bus);
92 } else if (mode == PCI_PROBE_NORMAL) {
93 /* use legacy probe */
94 slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
95 num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
96 if (!num)
97 return;
98 pcibios_setup_bus_devices(bus);
99 max = bus->busn_res.start;
100 for (pass = 0; pass < 2; pass++) {
101 list_for_each_entry(dev, &bus->devices, bus_list) {
102 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
103 dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
104 max = pci_scan_bridge(bus, dev,
105 max, pass);
106 }
107 }
108 }
109 pcibios_finish_adding_to_bus(bus);
110}
111EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
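
Worth noting about __pcibios_remove_pci_devices() in the new file: it recurses into child buses before touching the current bus's device list, so teardown is strictly bottom-up. For a hypothetical topology:

	/* root bus 00 -> bridge B -> child bus 01 -> devA, devB
	 *
	 * __pcibios_remove_pci_devices(bus00) descends into bus01 first,
	 * removing devA and devB before B itself -- a child device is
	 * never left dangling behind an already-removed bridge.
	 */
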
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 076d1242507a..c517dbe705fd 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -916,7 +916,11 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
916 flush_altivec_to_thread(src); 916 flush_altivec_to_thread(src);
917 flush_vsx_to_thread(src); 917 flush_vsx_to_thread(src);
918 flush_spe_to_thread(src); 918 flush_spe_to_thread(src);
919
919 *dst = *src; 920 *dst = *src;
921
922 clear_task_ebb(dst);
923
920 return 0; 924 return 0;
921} 925}
922 926
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 8b6f7a99cce2..eb23ac92abb9 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -559,6 +559,35 @@ void __init early_init_dt_setup_initrd_arch(unsigned long start,
559} 559}
560#endif 560#endif
561 561
562static void __init early_reserve_mem_dt(void)
563{
564 unsigned long i, len, dt_root;
565 const __be32 *prop;
566
567 dt_root = of_get_flat_dt_root();
568
569 prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len);
570
571 if (!prop)
572 return;
573
574 DBG("Found new-style reserved-ranges\n");
575
576 /* Each reserved range is an (address,size) pair, 2 cells each,
577 * totalling 4 cells per range. */
578 for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
579 u64 base, size;
580
581 base = of_read_number(prop + (i * 4) + 0, 2);
582 size = of_read_number(prop + (i * 4) + 2, 2);
583
584 if (size) {
585 DBG("reserving: %llx -> %llx\n", base, size);
586 memblock_reserve(base, size);
587 }
588 }
589}
590
562static void __init early_reserve_mem(void) 591static void __init early_reserve_mem(void)
563{ 592{
564 u64 base, size; 593 u64 base, size;
@@ -574,12 +603,16 @@ static void __init early_reserve_mem(void)
574 self_size = initial_boot_params->totalsize; 603 self_size = initial_boot_params->totalsize;
575 memblock_reserve(self_base, self_size); 604 memblock_reserve(self_base, self_size);
576 605
 606 /* Look for the new "reserved-ranges" property in the DT */
607 early_reserve_mem_dt();
608
577#ifdef CONFIG_BLK_DEV_INITRD 609#ifdef CONFIG_BLK_DEV_INITRD
578 /* then reserve the initrd, if any */ 610 /* Then reserve the initrd, if any */
579 if (initrd_start && (initrd_end > initrd_start)) 611 if (initrd_start && (initrd_end > initrd_start)) {
580 memblock_reserve(_ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE), 612 memblock_reserve(_ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE),
581 _ALIGN_UP(initrd_end, PAGE_SIZE) - 613 _ALIGN_UP(initrd_end, PAGE_SIZE) -
582 _ALIGN_DOWN(initrd_start, PAGE_SIZE)); 614 _ALIGN_DOWN(initrd_start, PAGE_SIZE));
615 }
583#endif /* CONFIG_BLK_DEV_INITRD */ 616#endif /* CONFIG_BLK_DEV_INITRD */
584 617
585#ifdef CONFIG_PPC32 618#ifdef CONFIG_PPC32
@@ -591,6 +624,8 @@ static void __init early_reserve_mem(void)
591 u32 base_32, size_32; 624 u32 base_32, size_32;
592 u32 *reserve_map_32 = (u32 *)reserve_map; 625 u32 *reserve_map_32 = (u32 *)reserve_map;
593 626
627 DBG("Found old 32-bit reserve map\n");
628
594 while (1) { 629 while (1) {
595 base_32 = *(reserve_map_32++); 630 base_32 = *(reserve_map_32++);
596 size_32 = *(reserve_map_32++); 631 size_32 = *(reserve_map_32++);
@@ -605,6 +640,9 @@ static void __init early_reserve_mem(void)
605 return; 640 return;
606 } 641 }
607#endif 642#endif
643 DBG("Processing reserve map\n");
644
645 /* Handle the reserve map in the fdt blob if it exists */
608 while (1) { 646 while (1) {
609 base = *(reserve_map++); 647 base = *(reserve_map++);
610 size = *(reserve_map++); 648 size = *(reserve_map++);
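
The reserved-ranges parser above reads the property as a flat array of big-endian 32-bit cells: each range is an <address size> pair, two cells (64 bits) apiece, four cells per range. A hypothetical property and its decoding:

	/* reserved-ranges = <0x0 0x01000000  0x0 0x00400000>;   (16 bytes)
	 *
	 * len / (sizeof(*prop) * 4) = 16 / 16 = 1 range:
	 *   base = of_read_number(prop + 0, 2) = 0x01000000
	 *   size = of_read_number(prop + 2, 2) = 0x00400000  -> 4 MB reserved
	 */
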
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 98c2fc198712..64f7bd5b1b0f 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1449,7 +1449,9 @@ static long ppc_set_hwdebug(struct task_struct *child,
1449 */ 1449 */
1450 if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) { 1450 if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) {
1451 len = bp_info->addr2 - bp_info->addr; 1451 len = bp_info->addr2 - bp_info->addr;
1452 } else if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) { 1452 } else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT)
1453 len = 1;
1454 else {
1453 ptrace_put_breakpoints(child); 1455 ptrace_put_breakpoints(child);
1454 return -EINVAL; 1456 return -EINVAL;
1455 } 1457 }
diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S
index ef46ba6e094f..f366fedb0872 100644
--- a/arch/powerpc/kernel/reloc_32.S
+++ b/arch/powerpc/kernel/reloc_32.S
@@ -166,7 +166,7 @@ ha16:
166 /* R_PPC_ADDR16_LO */ 166 /* R_PPC_ADDR16_LO */
167lo16: 167lo16:
168 cmpwi r4, R_PPC_ADDR16_LO 168 cmpwi r4, R_PPC_ADDR16_LO
169 bne nxtrela 169 bne unknown_type
170 lwz r4, 0(r9) /* r_offset */ 170 lwz r4, 0(r9) /* r_offset */
171 lwz r0, 8(r9) /* r_addend */ 171 lwz r0, 8(r9) /* r_addend */
172 add r0, r0, r3 172 add r0, r0, r3
@@ -191,6 +191,7 @@ nxtrela:
191 dcbst r4,r7 191 dcbst r4,r7
192 sync /* Ensure the data is flushed before icbi */ 192 sync /* Ensure the data is flushed before icbi */
193 icbi r4,r7 193 icbi r4,r7
194unknown_type:
194 cmpwi r8, 0 /* relasz = 0 ? */ 195 cmpwi r8, 0 /* relasz = 0 ? */
195 ble done 196 ble done
196 add r9, r9, r6 /* move to next entry in the .rela table */ 197 add r9, r9, r6 /* move to next entry in the .rela table */
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 52add6f3e201..80b5ef403f68 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1172,7 +1172,7 @@ int __init early_init_dt_scan_rtas(unsigned long node,
1172static arch_spinlock_t timebase_lock; 1172static arch_spinlock_t timebase_lock;
1173static u64 timebase = 0; 1173static u64 timebase = 0;
1174 1174
1175void __cpuinit rtas_give_timebase(void) 1175void rtas_give_timebase(void)
1176{ 1176{
1177 unsigned long flags; 1177 unsigned long flags;
1178 1178
@@ -1189,7 +1189,7 @@ void __cpuinit rtas_give_timebase(void)
1189 local_irq_restore(flags); 1189 local_irq_restore(flags);
1190} 1190}
1191 1191
1192void __cpuinit rtas_take_timebase(void) 1192void rtas_take_timebase(void)
1193{ 1193{
1194 while (!timebase) 1194 while (!timebase)
1195 barrier(); 1195 barrier();
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index e379d3fd1694..389fb8077cc9 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -76,7 +76,7 @@
76#endif 76#endif
77 77
78int boot_cpuid = 0; 78int boot_cpuid = 0;
79int __initdata spinning_secondaries; 79int spinning_secondaries;
80u64 ppc64_pft_size; 80u64 ppc64_pft_size;
81 81
82/* Pick defaults since we might want to patch instructions 82/* Pick defaults since we might want to patch instructions
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 201385c3a1ae..0f83122e6676 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -407,7 +407,8 @@ inline unsigned long copy_transact_fpr_from_user(struct task_struct *task,
407 * altivec/spe instructions at some point. 407 * altivec/spe instructions at some point.
408 */ 408 */
409static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, 409static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
410 int sigret, int ctx_has_vsx_region) 410 struct mcontext __user *tm_frame, int sigret,
411 int ctx_has_vsx_region)
411{ 412{
412 unsigned long msr = regs->msr; 413 unsigned long msr = regs->msr;
413 414
@@ -475,6 +476,12 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
475 476
476 if (__put_user(msr, &frame->mc_gregs[PT_MSR])) 477 if (__put_user(msr, &frame->mc_gregs[PT_MSR]))
477 return 1; 478 return 1;
 479 /* We need to write 0 to the MSR top 32 bits in the tm frame so that we
480 * can check it on the restore to see if TM is active
481 */
482 if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR]))
483 return 1;
484
478 if (sigret) { 485 if (sigret) {
479 /* Set up the sigreturn trampoline: li r0,sigret; sc */ 486 /* Set up the sigreturn trampoline: li r0,sigret; sc */
480 if (__put_user(0x38000000UL + sigret, &frame->tramp[0]) 487 if (__put_user(0x38000000UL + sigret, &frame->tramp[0])
@@ -747,7 +754,7 @@ static long restore_tm_user_regs(struct pt_regs *regs,
747 struct mcontext __user *tm_sr) 754 struct mcontext __user *tm_sr)
748{ 755{
749 long err; 756 long err;
750 unsigned long msr; 757 unsigned long msr, msr_hi;
751#ifdef CONFIG_VSX 758#ifdef CONFIG_VSX
752 int i; 759 int i;
753#endif 760#endif
@@ -852,8 +859,11 @@ static long restore_tm_user_regs(struct pt_regs *regs,
852 tm_enable(); 859 tm_enable();
853 /* This loads the checkpointed FP/VEC state, if used */ 860 /* This loads the checkpointed FP/VEC state, if used */
854 tm_recheckpoint(&current->thread, msr); 861 tm_recheckpoint(&current->thread, msr);
855 /* The task has moved into TM state S, so ensure MSR reflects this */ 862 /* Get the top half of the MSR */
856 regs->msr = (regs->msr & ~MSR_TS_MASK) | MSR_TS_S; 863 if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR]))
864 return 1;
865 /* Pull in MSR TM from user context */
866 regs->msr = (regs->msr & ~MSR_TS_MASK) | ((msr_hi<<32) & MSR_TS_MASK);
857 867
858 /* This loads the speculative FP/VEC state, if used */ 868 /* This loads the speculative FP/VEC state, if used */
859 if (msr & MSR_FP) { 869 if (msr & MSR_FP) {
@@ -952,6 +962,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
952{ 962{
953 struct rt_sigframe __user *rt_sf; 963 struct rt_sigframe __user *rt_sf;
954 struct mcontext __user *frame; 964 struct mcontext __user *frame;
965 struct mcontext __user *tm_frame = NULL;
955 void __user *addr; 966 void __user *addr;
956 unsigned long newsp = 0; 967 unsigned long newsp = 0;
957 int sigret; 968 int sigret;
@@ -985,23 +996,24 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
985 } 996 }
986 997
987#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 998#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
999 tm_frame = &rt_sf->uc_transact.uc_mcontext;
988 if (MSR_TM_ACTIVE(regs->msr)) { 1000 if (MSR_TM_ACTIVE(regs->msr)) {
989 if (save_tm_user_regs(regs, &rt_sf->uc.uc_mcontext, 1001 if (save_tm_user_regs(regs, frame, tm_frame, sigret))
990 &rt_sf->uc_transact.uc_mcontext, sigret))
991 goto badframe; 1002 goto badframe;
992 } 1003 }
993 else 1004 else
994#endif 1005#endif
995 if (save_user_regs(regs, frame, sigret, 1)) 1006 {
1007 if (save_user_regs(regs, frame, tm_frame, sigret, 1))
996 goto badframe; 1008 goto badframe;
1009 }
997 regs->link = tramp; 1010 regs->link = tramp;
998 1011
999#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1012#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1000 if (MSR_TM_ACTIVE(regs->msr)) { 1013 if (MSR_TM_ACTIVE(regs->msr)) {
1001 if (__put_user((unsigned long)&rt_sf->uc_transact, 1014 if (__put_user((unsigned long)&rt_sf->uc_transact,
1002 &rt_sf->uc.uc_link) 1015 &rt_sf->uc.uc_link)
1003 || __put_user(to_user_ptr(&rt_sf->uc_transact.uc_mcontext), 1016 || __put_user((unsigned long)tm_frame, &rt_sf->uc_transact.uc_regs))
1004 &rt_sf->uc_transact.uc_regs))
1005 goto badframe; 1017 goto badframe;
1006 } 1018 }
1007 else 1019 else
@@ -1170,7 +1182,7 @@ long sys_swapcontext(struct ucontext __user *old_ctx,
1170 mctx = (struct mcontext __user *) 1182 mctx = (struct mcontext __user *)
1171 ((unsigned long) &old_ctx->uc_mcontext & ~0xfUL); 1183 ((unsigned long) &old_ctx->uc_mcontext & ~0xfUL);
1172 if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size) 1184 if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size)
1173 || save_user_regs(regs, mctx, 0, ctx_has_vsx_region) 1185 || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region)
1174 || put_sigset_t(&old_ctx->uc_sigmask, &current->blocked) 1186 || put_sigset_t(&old_ctx->uc_sigmask, &current->blocked)
1175 || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs)) 1187 || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs))
1176 return -EFAULT; 1188 return -EFAULT;
@@ -1233,7 +1245,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
1233 if (__get_user(msr_hi, &mcp->mc_gregs[PT_MSR])) 1245 if (__get_user(msr_hi, &mcp->mc_gregs[PT_MSR]))
1234 goto bad; 1246 goto bad;
1235 1247
1236 if (MSR_TM_SUSPENDED(msr_hi<<32)) { 1248 if (MSR_TM_ACTIVE(msr_hi<<32)) {
1237 /* We only recheckpoint on return if we're 1249 /* We only recheckpoint on return if we're
 1238 * in a transaction. 1250 * in a transaction.
1239 */ 1251 */
@@ -1392,6 +1404,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
1392{ 1404{
1393 struct sigcontext __user *sc; 1405 struct sigcontext __user *sc;
1394 struct sigframe __user *frame; 1406 struct sigframe __user *frame;
1407 struct mcontext __user *tm_mctx = NULL;
1395 unsigned long newsp = 0; 1408 unsigned long newsp = 0;
1396 int sigret; 1409 int sigret;
1397 unsigned long tramp; 1410 unsigned long tramp;
@@ -1425,6 +1438,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
1425 } 1438 }
1426 1439
1427#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1440#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1441 tm_mctx = &frame->mctx_transact;
1428 if (MSR_TM_ACTIVE(regs->msr)) { 1442 if (MSR_TM_ACTIVE(regs->msr)) {
1429 if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact, 1443 if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact,
1430 sigret)) 1444 sigret))
@@ -1432,8 +1446,10 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
1432 } 1446 }
1433 else 1447 else
1434#endif 1448#endif
1435 if (save_user_regs(regs, &frame->mctx, sigret, 1)) 1449 {
1450 if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1))
1436 goto badframe; 1451 goto badframe;
1452 }
1437 1453
1438 regs->link = tramp; 1454 regs->link = tramp;
1439 1455
@@ -1481,16 +1497,22 @@ badframe:
1481long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, 1497long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
1482 struct pt_regs *regs) 1498 struct pt_regs *regs)
1483{ 1499{
1500 struct sigframe __user *sf;
1484 struct sigcontext __user *sc; 1501 struct sigcontext __user *sc;
1485 struct sigcontext sigctx; 1502 struct sigcontext sigctx;
1486 struct mcontext __user *sr; 1503 struct mcontext __user *sr;
1487 void __user *addr; 1504 void __user *addr;
1488 sigset_t set; 1505 sigset_t set;
1506#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1507 struct mcontext __user *mcp, *tm_mcp;
1508 unsigned long msr_hi;
1509#endif
1489 1510
1490 /* Always make any pending restarted system calls return -EINTR */ 1511 /* Always make any pending restarted system calls return -EINTR */
1491 current_thread_info()->restart_block.fn = do_no_restart_syscall; 1512 current_thread_info()->restart_block.fn = do_no_restart_syscall;
1492 1513
1493 sc = (struct sigcontext __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); 1514 sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
1515 sc = &sf->sctx;
1494 addr = sc; 1516 addr = sc;
1495 if (copy_from_user(&sigctx, sc, sizeof(sigctx))) 1517 if (copy_from_user(&sigctx, sc, sizeof(sigctx)))
1496 goto badframe; 1518 goto badframe;
@@ -1507,11 +1529,25 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
1507#endif 1529#endif
1508 set_current_blocked(&set); 1530 set_current_blocked(&set);
1509 1531
1510 sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); 1532#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1511 addr = sr; 1533 mcp = (struct mcontext __user *)&sf->mctx;
1512 if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) 1534 tm_mcp = (struct mcontext __user *)&sf->mctx_transact;
1513 || restore_user_regs(regs, sr, 1)) 1535 if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR]))
1514 goto badframe; 1536 goto badframe;
1537 if (MSR_TM_ACTIVE(msr_hi<<32)) {
1538 if (!cpu_has_feature(CPU_FTR_TM))
1539 goto badframe;
1540 if (restore_tm_user_regs(regs, mcp, tm_mcp))
1541 goto badframe;
1542 } else
1543#endif
1544 {
1545 sr = (struct mcontext __user *)from_user_ptr(sigctx.regs);
1546 addr = sr;
1547 if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
1548 || restore_user_regs(regs, sr, 1))
1549 goto badframe;
1550 }
1515 1551
1516 set_thread_flag(TIF_RESTOREALL); 1552 set_thread_flag(TIF_RESTOREALL);
1517 return 0; 1553 return 0;
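
A thread running through the TM changes in signal_32.c above: a 32-bit sigcontext only holds the low MSR word, so the transactional-state bits (MSR[TS], which live in the top half) travel separately as msr_hi and are re-widened on return:

	/* Pull MSR[TS] back in from the user context's high word: */
	regs->msr = (regs->msr & ~MSR_TS_MASK) | ((msr_hi << 32) & MSR_TS_MASK);

save_user_regs() writing 0 into the tm frame's PT_MSR slot is the other half of the scheme: a zero high word makes MSR_TM_ACTIVE(msr_hi << 32) false, which is how sys_sigreturn() tells a plain frame from a transactional one.
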
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 345947367ec0..887e99d85bc2 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -410,6 +410,10 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
410 410
411 /* get MSR separately, transfer the LE bit if doing signal return */ 411 /* get MSR separately, transfer the LE bit if doing signal return */
412 err |= __get_user(msr, &sc->gp_regs[PT_MSR]); 412 err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
413 /* pull in MSR TM from user context */
414 regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
415
416 /* pull in MSR LE from user context */
413 regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); 417 regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
414 418
415 /* The following non-GPR non-FPR non-VR state is also checkpointed: */ 419 /* The following non-GPR non-FPR non-VR state is also checkpointed: */
@@ -505,8 +509,6 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
505 tm_enable(); 509 tm_enable();
506 /* This loads the checkpointed FP/VEC state, if used */ 510 /* This loads the checkpointed FP/VEC state, if used */
507 tm_recheckpoint(&current->thread, msr); 511 tm_recheckpoint(&current->thread, msr);
508 /* The task has moved into TM state S, so ensure MSR reflects this: */
509 regs->msr = (regs->msr & ~MSR_TS_MASK) | __MASK(33);
510 512
511 /* This loads the speculative FP/VEC state, if used */ 513 /* This loads the speculative FP/VEC state, if used */
512 if (msr & MSR_FP) { 514 if (msr & MSR_FP) {
@@ -654,7 +656,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
654#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 656#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
655 if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR])) 657 if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR]))
656 goto badframe; 658 goto badframe;
657 if (MSR_TM_SUSPENDED(msr)) { 659 if (MSR_TM_ACTIVE(msr)) {
658 /* We recheckpoint on return. */ 660 /* We recheckpoint on return. */
659 struct ucontext __user *uc_transact; 661 struct ucontext __user *uc_transact;
660 if (__get_user(uc_transact, &uc->uc_link)) 662 if (__get_user(uc_transact, &uc->uc_link))
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ee7ac5e6e28a..38b0ba65a735 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -480,7 +480,7 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
480 secondary_ti = current_set[cpu] = ti; 480 secondary_ti = current_set[cpu] = ti;
481} 481}
482 482
483int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) 483int __cpu_up(unsigned int cpu, struct task_struct *tidle)
484{ 484{
485 int rc, c; 485 int rc, c;
486 486
@@ -610,7 +610,7 @@ static struct device_node *cpu_to_l2cache(int cpu)
610} 610}
611 611
612/* Activate a secondary processor. */ 612/* Activate a secondary processor. */
613__cpuinit void start_secondary(void *unused) 613void start_secondary(void *unused)
614{ 614{
615 unsigned int cpu = smp_processor_id(); 615 unsigned int cpu = smp_processor_id();
616 struct device_node *l2_cache; 616 struct device_node *l2_cache;
@@ -637,12 +637,10 @@ __cpuinit void start_secondary(void *unused)
637 637
638 vdso_getcpu_init(); 638 vdso_getcpu_init();
639#endif 639#endif
640 notify_cpu_starting(cpu);
641 set_cpu_online(cpu, true);
642 /* Update sibling maps */ 640 /* Update sibling maps */
643 base = cpu_first_thread_sibling(cpu); 641 base = cpu_first_thread_sibling(cpu);
644 for (i = 0; i < threads_per_core; i++) { 642 for (i = 0; i < threads_per_core; i++) {
645 if (cpu_is_offline(base + i)) 643 if (cpu_is_offline(base + i) && (cpu != base + i))
646 continue; 644 continue;
647 cpumask_set_cpu(cpu, cpu_sibling_mask(base + i)); 645 cpumask_set_cpu(cpu, cpu_sibling_mask(base + i));
648 cpumask_set_cpu(base + i, cpu_sibling_mask(cpu)); 646 cpumask_set_cpu(base + i, cpu_sibling_mask(cpu));
@@ -667,6 +665,10 @@ __cpuinit void start_secondary(void *unused)
667 } 665 }
668 of_node_put(l2_cache); 666 of_node_put(l2_cache);
669 667
668 smp_wmb();
669 notify_cpu_starting(cpu);
670 set_cpu_online(cpu, true);
671
670 local_irq_enable(); 672 local_irq_enable();
671 673
672 cpu_startup_entry(CPUHP_ONLINE); 674 cpu_startup_entry(CPUHP_ONLINE);
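
The start_secondary() reorder above is a publish pattern: the CPU's sibling/core masks are filled in first, then smp_wmb() orders those stores before the CPU is made visible via set_cpu_online(). In outline:

	/* ... populate cpu_sibling_mask()/cpu_core_mask() for this cpu ... */
	smp_wmb();			/* order map updates before the flag */
	notify_cpu_starting(cpu);
	set_cpu_online(cpu, true);	/* readers now see consistent masks */

A reader that observes cpu_online(cpu) (with a matching read barrier) is therefore guaranteed to see the completed topology masks.
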
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index e68a84568b8b..27a90b99ef67 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -341,7 +341,7 @@ static struct device_attribute pa6t_attrs[] = {
341#endif /* HAS_PPC_PMC_PA6T */ 341#endif /* HAS_PPC_PMC_PA6T */
342#endif /* HAS_PPC_PMC_CLASSIC */ 342#endif /* HAS_PPC_PMC_CLASSIC */
343 343
344static void __cpuinit register_cpu_online(unsigned int cpu) 344static void register_cpu_online(unsigned int cpu)
345{ 345{
346 struct cpu *c = &per_cpu(cpu_devices, cpu); 346 struct cpu *c = &per_cpu(cpu_devices, cpu);
347 struct device *s = &c->dev; 347 struct device *s = &c->dev;
@@ -502,7 +502,7 @@ ssize_t arch_cpu_release(const char *buf, size_t count)
502 502
503#endif /* CONFIG_HOTPLUG_CPU */ 503#endif /* CONFIG_HOTPLUG_CPU */
504 504
505static int __cpuinit sysfs_cpu_notify(struct notifier_block *self, 505static int sysfs_cpu_notify(struct notifier_block *self,
506 unsigned long action, void *hcpu) 506 unsigned long action, void *hcpu)
507{ 507{
508 unsigned int cpu = (unsigned int)(long)hcpu; 508 unsigned int cpu = (unsigned int)(long)hcpu;
@@ -522,7 +522,7 @@ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
522 return NOTIFY_OK; 522 return NOTIFY_OK;
523} 523}
524 524
525static struct notifier_block __cpuinitdata sysfs_cpu_nb = { 525static struct notifier_block sysfs_cpu_nb = {
526 .notifier_call = sysfs_cpu_notify, 526 .notifier_call = sysfs_cpu_notify,
527}; 527};
528 528
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 5fc29ad7e26f..65ab9e909377 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -631,7 +631,6 @@ static int __init get_freq(char *name, int cells, unsigned long *val)
631 return found; 631 return found;
632} 632}
633 633
634/* should become __cpuinit when secondary_cpu_time_init also is */
635void start_cpu_decrementer(void) 634void start_cpu_decrementer(void)
636{ 635{
637#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) 636#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 2da67e7a16d5..51be8fb24803 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -112,9 +112,18 @@ _GLOBAL(tm_reclaim)
112 std r3, STACK_PARAM(0)(r1) 112 std r3, STACK_PARAM(0)(r1)
113 SAVE_NVGPRS(r1) 113 SAVE_NVGPRS(r1)
114 114
 115 /* We need to set up MSR for VSX register save instructions. Here we 122
116 * also clear the MSR RI since when we do the treclaim, we won't have a
117 * valid kernel pointer for a while. We clear RI here as it avoids
118 * adding another mtmsr closer to the treclaim. This makes the region
 119 * marked as non-recoverable wider than it needs to be but it saves on
120 * inserting another mtmsrd later.
121 */
115 mfmsr r14 122 mfmsr r14
116 mr r15, r14 123 mr r15, r14
117 ori r15, r15, MSR_FP 124 ori r15, r15, MSR_FP
125 li r16, MSR_RI
126 andc r15, r15, r16
118 oris r15, r15, MSR_VEC@h 127 oris r15, r15, MSR_VEC@h
119#ifdef CONFIG_VSX 128#ifdef CONFIG_VSX
120 BEGIN_FTR_SECTION 129 BEGIN_FTR_SECTION
@@ -349,9 +358,10 @@ restore_gprs:
349 mtcr r5 358 mtcr r5
350 mtxer r6 359 mtxer r6
351 360
352 /* MSR and flags: We don't change CRs, and we don't need to alter 361 /* Clear the MSR RI since we are about to change R1. EE is already off
353 * MSR.
354 */ 362 */
363 li r4, 0
364 mtmsrd r4, 1
355 365
356 REST_4GPRS(0, r7) /* GPR0-3 */ 366 REST_4GPRS(0, r7) /* GPR0-3 */
357 REST_GPR(4, r7) /* GPR4-6 */ 367 REST_GPR(4, r7) /* GPR4-6 */
@@ -377,6 +387,10 @@ restore_gprs:
377 GET_PACA(r13) 387 GET_PACA(r13)
378 GET_SCRATCH0(r1) 388 GET_SCRATCH0(r1)
379 389
390 /* R1 is restored, so we are recoverable again. EE is still off */
391 li r4, MSR_RI
392 mtmsrd r4, 1
393
380 REST_NVGPRS(r1) 394 REST_NVGPRS(r1)
381 395
382 addi r1, r1, TM_FRAME_SIZE 396 addi r1, r1, TM_FRAME_SIZE
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index c0e5caf8ccc7..bf33c22e38a4 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -866,6 +866,10 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword)
866 u8 val; 866 u8 val;
867 u32 shift = 8 * (3 - (pos & 0x3)); 867 u32 shift = 8 * (3 - (pos & 0x3));
868 868
869 /* if process is 32-bit, clear upper 32 bits of EA */
870 if ((regs->msr & MSR_64BIT) == 0)
871 EA &= 0xFFFFFFFF;
872
869 switch ((instword & PPC_INST_STRING_MASK)) { 873 switch ((instword & PPC_INST_STRING_MASK)) {
870 case PPC_INST_LSWX: 874 case PPC_INST_LSWX:
871 case PPC_INST_LSWI: 875 case PPC_INST_LSWI:
@@ -1125,7 +1129,17 @@ void __kprobes program_check_exception(struct pt_regs *regs)
1125 * ESR_DST (!?) or 0. In the process of chasing this with the 1129 * ESR_DST (!?) or 0. In the process of chasing this with the
1126 * hardware people - not sure if it can happen on any illegal 1130 * hardware people - not sure if it can happen on any illegal
1127 * instruction or only on FP instructions, whether there is a 1131 * instruction or only on FP instructions, whether there is a
1128 * pattern to occurrences etc. -dgibson 31/Mar/2003 */ 1132 * pattern to occurrences etc. -dgibson 31/Mar/2003
1133 */
1134
1135 /*
1136 * If we support a HW FPU, we need to ensure the FP state
 1137 * is flushed into the thread_struct before attempting
1138 * emulation
1139 */
1140#ifdef CONFIG_PPC_FPU
1141 flush_fp_to_thread(current);
1142#endif
1129 switch (do_mathemu(regs)) { 1143 switch (do_mathemu(regs)) {
1130 case 0: 1144 case 0:
1131 emulate_single_step(regs); 1145 emulate_single_step(regs);
@@ -1282,25 +1296,50 @@ void vsx_unavailable_exception(struct pt_regs *regs)
1282 die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT); 1296 die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT);
1283} 1297}
1284 1298
1285void tm_unavailable_exception(struct pt_regs *regs) 1299void facility_unavailable_exception(struct pt_regs *regs)
1286{ 1300{
1301 static char *facility_strings[] = {
1302 "FPU",
1303 "VMX/VSX",
1304 "DSCR",
1305 "PMU SPRs",
1306 "BHRB",
1307 "TM",
1308 "AT",
1309 "EBB",
1310 "TAR",
1311 };
1312 char *facility, *prefix;
1313 u64 value;
1314
1315 if (regs->trap == 0xf60) {
1316 value = mfspr(SPRN_FSCR);
1317 prefix = "";
1318 } else {
1319 value = mfspr(SPRN_HFSCR);
1320 prefix = "Hypervisor ";
1321 }
1322
1323 value = value >> 56;
1324
1287 /* We restore the interrupt state now */ 1325 /* We restore the interrupt state now */
1288 if (!arch_irq_disabled_regs(regs)) 1326 if (!arch_irq_disabled_regs(regs))
1289 local_irq_enable(); 1327 local_irq_enable();
1290 1328
1291 /* Currently we never expect a TMU exception. Catch 1329 if (value < ARRAY_SIZE(facility_strings))
1292 * this and kill the process! 1330 facility = facility_strings[value];
1293 */ 1331 else
1294 printk(KERN_EMERG "Unexpected TM unavailable exception at %lx " 1332 facility = "unknown";
1295 "(msr %lx)\n", 1333
1296 regs->nip, regs->msr); 1334 pr_err("%sFacility '%s' unavailable, exception at 0x%lx, MSR=%lx\n",
1335 prefix, facility, regs->nip, regs->msr);
1297 1336
1298 if (user_mode(regs)) { 1337 if (user_mode(regs)) {
1299 _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); 1338 _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
1300 return; 1339 return;
1301 } 1340 }
1302 1341
1303 die("Unexpected TM unavailable exception", regs, SIGABRT); 1342 die("Unexpected facility unavailable exception", regs, SIGABRT);
1304} 1343}
1305 1344
1306#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1345#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -1396,8 +1435,7 @@ void performance_monitor_exception(struct pt_regs *regs)
1396void SoftwareEmulation(struct pt_regs *regs) 1435void SoftwareEmulation(struct pt_regs *regs)
1397{ 1436{
1398 extern int do_mathemu(struct pt_regs *); 1437 extern int do_mathemu(struct pt_regs *);
1399 extern int Soft_emulate_8xx(struct pt_regs *); 1438#if defined(CONFIG_MATH_EMULATION)
1400#if defined(CONFIG_MATH_EMULATION) || defined(CONFIG_8XX_MINIMAL_FPEMU)
1401 int errcode; 1439 int errcode;
1402#endif 1440#endif
1403 1441
@@ -1430,23 +1468,6 @@ void SoftwareEmulation(struct pt_regs *regs)
1430 _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); 1468 _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
1431 return; 1469 return;
1432 } 1470 }
1433
1434#elif defined(CONFIG_8XX_MINIMAL_FPEMU)
1435 errcode = Soft_emulate_8xx(regs);
1436 if (errcode >= 0)
1437 PPC_WARN_EMULATED(8xx, regs);
1438
1439 switch (errcode) {
1440 case 0:
1441 emulate_single_step(regs);
1442 return;
1443 case 1:
1444 _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
1445 return;
1446 case -EFAULT:
1447 _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
1448 return;
1449 }
1450#else 1471#else
1451 _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); 1472 _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
1452#endif 1473#endif
@@ -1796,8 +1817,6 @@ struct ppc_emulated ppc_emulated = {
1796 WARN_EMULATED_SETUP(unaligned), 1817 WARN_EMULATED_SETUP(unaligned),
1797#ifdef CONFIG_MATH_EMULATION 1818#ifdef CONFIG_MATH_EMULATION
1798 WARN_EMULATED_SETUP(math), 1819 WARN_EMULATED_SETUP(math),
1799#elif defined(CONFIG_8XX_MINIMAL_FPEMU)
1800 WARN_EMULATED_SETUP(8xx),
1801#endif 1820#endif
1802#ifdef CONFIG_VSX 1821#ifdef CONFIG_VSX
1803 WARN_EMULATED_SETUP(vsx), 1822 WARN_EMULATED_SETUP(vsx),
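
On the facility_unavailable_exception() rework above: both FSCR and HFSCR report the offending facility in their top byte, which `value >> 56` extracts to index facility_strings[]. A worked example using the table from the hunk:

	/* A problem-state DSCR access trapping at vector 0xf60:
	 *   value = mfspr(SPRN_FSCR) >> 56;   // == 2 for DSCR
	 *   facility_strings[2] == "DSCR", prefix == ""
	 * log line: Facility 'DSCR' unavailable, exception at 0x..., MSR=...
	 */
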
diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c
index 9d3fdcd66290..a15837519dca 100644
--- a/arch/powerpc/kernel/udbg.c
+++ b/arch/powerpc/kernel/udbg.c
@@ -50,7 +50,7 @@ void __init udbg_early_init(void)
50 udbg_init_debug_beat(); 50 udbg_init_debug_beat();
51#elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE) 51#elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE)
52 udbg_init_pas_realmode(); 52 udbg_init_pas_realmode();
53#elif defined(CONFIG_BOOTX_TEXT) 53#elif defined(CONFIG_PPC_EARLY_DEBUG_BOOTX)
54 udbg_init_btext(); 54 udbg_init_btext();
55#elif defined(CONFIG_PPC_EARLY_DEBUG_44x) 55#elif defined(CONFIG_PPC_EARLY_DEBUG_44x)
56 /* PPC44x debug */ 56 /* PPC44x debug */
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index d4f463ac65b1..1d9c92621b36 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -711,7 +711,7 @@ static void __init vdso_setup_syscall_map(void)
711} 711}
712 712
713#ifdef CONFIG_PPC64 713#ifdef CONFIG_PPC64
714int __cpuinit vdso_getcpu_init(void) 714int vdso_getcpu_init(void)
715{ 715{
716 unsigned long cpu, node, val; 716 unsigned long cpu, node, val;
717 717
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index b350d9494b26..e5240524bf6c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -34,7 +34,7 @@
34void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) 34void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
35{ 35{
36 ppc_md.hpte_invalidate(pte->slot, pte->host_vpn, 36 ppc_md.hpte_invalidate(pte->slot, pte->host_vpn,
37 MMU_PAGE_4K, MMU_SEGSIZE_256M, 37 MMU_PAGE_4K, MMU_PAGE_4K, MMU_SEGSIZE_256M,
38 false); 38 false);
39} 39}
40 40
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 5880dfb31074..710d31317d81 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -675,6 +675,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
675 } 675 }
676 /* if the guest wants write access, see if that is OK */ 676 /* if the guest wants write access, see if that is OK */
677 if (!writing && hpte_is_writable(r)) { 677 if (!writing && hpte_is_writable(r)) {
678 unsigned int hugepage_shift;
678 pte_t *ptep, pte; 679 pte_t *ptep, pte;
679 680
680 /* 681 /*
@@ -683,9 +684,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
683 */ 684 */
684 rcu_read_lock_sched(); 685 rcu_read_lock_sched();
685 ptep = find_linux_pte_or_hugepte(current->mm->pgd, 686 ptep = find_linux_pte_or_hugepte(current->mm->pgd,
686 hva, NULL); 687 hva, &hugepage_shift);
687 if (ptep && pte_present(*ptep)) { 688 if (ptep) {
688 pte = kvmppc_read_update_linux_pte(ptep, 1); 689 pte = kvmppc_read_update_linux_pte(ptep, 1,
690 hugepage_shift);
689 if (pte_write(pte)) 691 if (pte_write(pte))
690 write_ok = 1; 692 write_ok = 1;
691 } 693 }
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 6dcbb49105a4..fc25689a9f35 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -27,7 +27,7 @@ static void *real_vmalloc_addr(void *x)
27 unsigned long addr = (unsigned long) x; 27 unsigned long addr = (unsigned long) x;
28 pte_t *p; 28 pte_t *p;
29 29
30 p = find_linux_pte(swapper_pg_dir, addr); 30 p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
31 if (!p || !pte_present(*p)) 31 if (!p || !pte_present(*p))
32 return NULL; 32 return NULL;
33 /* assume we don't have huge pages in vmalloc space... */ 33 /* assume we don't have huge pages in vmalloc space... */
@@ -139,20 +139,18 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
139{ 139{
140 pte_t *ptep; 140 pte_t *ptep;
141 unsigned long ps = *pte_sizep; 141 unsigned long ps = *pte_sizep;
142 unsigned int shift; 142 unsigned int hugepage_shift;
143 143
144 ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); 144 ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
145 if (!ptep) 145 if (!ptep)
146 return __pte(0); 146 return __pte(0);
147 if (shift) 147 if (hugepage_shift)
148 *pte_sizep = 1ul << shift; 148 *pte_sizep = 1ul << hugepage_shift;
149 else 149 else
150 *pte_sizep = PAGE_SIZE; 150 *pte_sizep = PAGE_SIZE;
151 if (ps > *pte_sizep) 151 if (ps > *pte_sizep)
152 return __pte(0); 152 return __pte(0);
153 if (!pte_present(*ptep)) 153 return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
154 return __pte(0);
155 return kvmppc_read_update_linux_pte(ptep, writing);
156} 154}
157 155
158static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v) 156static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
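The slimmed-down lookup_linux_pte() leans entirely on the shift reported by find_linux_pte_or_hugepte(): a non-zero hugepage_shift means the pte maps 1 << shift bytes, zero means an ordinary PAGE_SIZE mapping, and a caller expecting a larger size than that is refused. A stand-alone sketch of just the size computation (the 64K PAGE_SIZE and the helper name are assumptions of this sketch):

    #include <stdio.h>

    #define PAGE_SIZE (64 * 1024UL) /* assumed 64K base pages */

    /* Mirrors the *pte_sizep logic above: huge mappings report their
     * size via the shift, normal ptes fall back to PAGE_SIZE. */
    static unsigned long pte_mapping_size(unsigned int hugepage_shift)
    {
        return hugepage_shift ? 1UL << hugepage_shift : PAGE_SIZE;
    }

    int main(void)
    {
        printf("%lu\n", pte_mapping_size(0));  /* 65536 */
        printf("%lu\n", pte_mapping_size(24)); /* 16777216, a 16M page */
        return 0;
    }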
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index e15c521846ca..99c7fc16dc0d 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -580,7 +580,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
580 if (instr & 1) 580 if (instr & 1)
581 regs->link = regs->nip; 581 regs->link = regs->nip;
582 if (branch_taken(instr, regs)) 582 if (branch_taken(instr, regs))
583 regs->nip = imm; 583 regs->nip = truncate_if_32bit(regs->msr, imm);
584 return 1; 584 return 1;
585#ifdef CONFIG_PPC64 585#ifdef CONFIG_PPC64
586 case 17: /* sc */ 586 case 17: /* sc */
diff --git a/arch/powerpc/math-emu/Makefile b/arch/powerpc/math-emu/Makefile
index 7d1dba0d57f9..8d035d2d42a6 100644
--- a/arch/powerpc/math-emu/Makefile
+++ b/arch/powerpc/math-emu/Makefile
@@ -4,7 +4,8 @@ obj-$(CONFIG_MATH_EMULATION) += fabs.o fadd.o fadds.o fcmpo.o fcmpu.o \
4 fmadd.o fmadds.o fmsub.o fmsubs.o \ 4 fmadd.o fmadds.o fmsub.o fmsubs.o \
5 fmul.o fmuls.o fnabs.o fneg.o \ 5 fmul.o fmuls.o fnabs.o fneg.o \
6 fnmadd.o fnmadds.o fnmsub.o fnmsubs.o \ 6 fnmadd.o fnmadds.o fnmsub.o fnmsubs.o \
7 fres.o frsp.o frsqrte.o fsel.o lfs.o \ 7 fres.o fre.o frsp.o fsel.o lfs.o \
8 frsqrte.o frsqrtes.o \
8 fsqrt.o fsqrts.o fsub.o fsubs.o \ 9 fsqrt.o fsqrts.o fsub.o fsubs.o \
9 mcrfs.o mffs.o mtfsb0.o mtfsb1.o \ 10 mcrfs.o mffs.o mtfsb0.o mtfsb1.o \
10 mtfsf.o mtfsfi.o stfiwx.o stfs.o \ 11 mtfsf.o mtfsfi.o stfiwx.o stfs.o \
diff --git a/arch/powerpc/math-emu/fre.c b/arch/powerpc/math-emu/fre.c
new file mode 100644
index 000000000000..49ccf2cc6a5a
--- /dev/null
+++ b/arch/powerpc/math-emu/fre.c
@@ -0,0 +1,11 @@
1#include <linux/types.h>
2#include <linux/errno.h>
3#include <asm/uaccess.h>
4
5int fre(void *frD, void *frB)
6{
7#ifdef DEBUG
8 printk("%s: %p %p\n", __func__, frD, frB);
9#endif
10 return -ENOSYS;
11}
diff --git a/arch/powerpc/math-emu/frsqrtes.c b/arch/powerpc/math-emu/frsqrtes.c
new file mode 100644
index 000000000000..7e838e380314
--- /dev/null
+++ b/arch/powerpc/math-emu/frsqrtes.c
@@ -0,0 +1,11 @@
1#include <linux/types.h>
2#include <linux/errno.h>
3#include <asm/uaccess.h>
4
5int frsqrtes(void *frD, void *frB)
6{
7#ifdef DEBUG
8 printk("%s: %p %p\n", __func__, frD, frB);
9#endif
10 return 0;
11}
diff --git a/arch/powerpc/math-emu/math.c b/arch/powerpc/math-emu/math.c
index 164d55935bd8..0328e66e0799 100644
--- a/arch/powerpc/math-emu/math.c
+++ b/arch/powerpc/math-emu/math.c
@@ -58,8 +58,10 @@ FLOATFUNC(fnabs);
58FLOATFUNC(fneg); 58FLOATFUNC(fneg);
59 59
60/* Optional */ 60/* Optional */
61FLOATFUNC(fre);
61FLOATFUNC(fres); 62FLOATFUNC(fres);
62FLOATFUNC(frsqrte); 63FLOATFUNC(frsqrte);
64FLOATFUNC(frsqrtes);
63FLOATFUNC(fsel); 65FLOATFUNC(fsel);
64FLOATFUNC(fsqrt); 66FLOATFUNC(fsqrt);
65FLOATFUNC(fsqrts); 67FLOATFUNC(fsqrts);
@@ -97,6 +99,7 @@ FLOATFUNC(fsqrts);
97#define FSQRTS 0x016 /* 22 */ 99#define FSQRTS 0x016 /* 22 */
98#define FRES 0x018 /* 24 */ 100#define FRES 0x018 /* 24 */
99#define FMULS 0x019 /* 25 */ 101#define FMULS 0x019 /* 25 */
102#define FRSQRTES 0x01a /* 26 */
100#define FMSUBS 0x01c /* 28 */ 103#define FMSUBS 0x01c /* 28 */
101#define FMADDS 0x01d /* 29 */ 104#define FMADDS 0x01d /* 29 */
102#define FNMSUBS 0x01e /* 30 */ 105#define FNMSUBS 0x01e /* 30 */
@@ -109,6 +112,7 @@ FLOATFUNC(fsqrts);
109#define FADD 0x015 /* 21 */ 112#define FADD 0x015 /* 21 */
110#define FSQRT 0x016 /* 22 */ 113#define FSQRT 0x016 /* 22 */
111#define FSEL 0x017 /* 23 */ 114#define FSEL 0x017 /* 23 */
115#define FRE 0x018 /* 24 */
112#define FMUL 0x019 /* 25 */ 116#define FMUL 0x019 /* 25 */
113#define FRSQRTE 0x01a /* 26 */ 117#define FRSQRTE 0x01a /* 26 */
114#define FMSUB 0x01c /* 28 */ 118#define FMSUB 0x01c /* 28 */
@@ -299,9 +303,10 @@ do_mathemu(struct pt_regs *regs)
299 case FDIVS: func = fdivs; type = AB; break; 303 case FDIVS: func = fdivs; type = AB; break;
300 case FSUBS: func = fsubs; type = AB; break; 304 case FSUBS: func = fsubs; type = AB; break;
301 case FADDS: func = fadds; type = AB; break; 305 case FADDS: func = fadds; type = AB; break;
302 case FSQRTS: func = fsqrts; type = AB; break; 306 case FSQRTS: func = fsqrts; type = XB; break;
303 case FRES: func = fres; type = AB; break; 307 case FRES: func = fres; type = XB; break;
304 case FMULS: func = fmuls; type = AC; break; 308 case FMULS: func = fmuls; type = AC; break;
309 case FRSQRTES: func = frsqrtes;type = XB; break;
305 case FMSUBS: func = fmsubs; type = ABC; break; 310 case FMSUBS: func = fmsubs; type = ABC; break;
306 case FMADDS: func = fmadds; type = ABC; break; 311 case FMADDS: func = fmadds; type = ABC; break;
307 case FNMSUBS: func = fnmsubs; type = ABC; break; 312 case FNMSUBS: func = fnmsubs; type = ABC; break;
@@ -317,10 +322,11 @@ do_mathemu(struct pt_regs *regs)
317 case FDIV: func = fdiv; type = AB; break; 322 case FDIV: func = fdiv; type = AB; break;
318 case FSUB: func = fsub; type = AB; break; 323 case FSUB: func = fsub; type = AB; break;
319 case FADD: func = fadd; type = AB; break; 324 case FADD: func = fadd; type = AB; break;
320 case FSQRT: func = fsqrt; type = AB; break; 325 case FSQRT: func = fsqrt; type = XB; break;
326 case FRE: func = fre; type = XB; break;
321 case FSEL: func = fsel; type = ABC; break; 327 case FSEL: func = fsel; type = ABC; break;
322 case FMUL: func = fmul; type = AC; break; 328 case FMUL: func = fmul; type = AC; break;
323 case FRSQRTE: func = frsqrte; type = AB; break; 329 case FRSQRTE: func = frsqrte; type = XB; break;
324 case FMSUB: func = fmsub; type = ABC; break; 330 case FMSUB: func = fmsub; type = ABC; break;
325 case FMADD: func = fmadd; type = ABC; break; 331 case FMADD: func = fmadd; type = ABC; break;
326 case FNMSUB: func = fnmsub; type = ABC; break; 332 case FNMSUB: func = fnmsub; type = ABC; break;
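The AB-to-XB type switches above track the instruction forms: fsqrt(s), fres, fre and frsqrte(s) consume only an FRB operand, so decoding them as AB would hand a meaningless FRA to the emulation routine. The register fields sit at fixed offsets in the 32-bit opcode, as the small sketch below shows (hand-rolled macros; the sample word 0xfc20002c should decode as fsqrt f1,f0 given the XO of 22 listed above):

    #include <stdio.h>

    /* Floating-point register fields of a 32-bit PowerPC opcode. An
     * XB-style op (e.g. fsqrt FRT,FRB) uses only FRT and FRB, while
     * AB-form arithmetic (e.g. fadd) also consumes FRA. */
    #define FRT(instr) (((instr) >> 21) & 0x1f)
    #define FRA(instr) (((instr) >> 16) & 0x1f)
    #define FRB(instr) (((instr) >> 11) & 0x1f)

    int main(void)
    {
        unsigned int instr = 0xfc20002c;

        printf("FRT=%u FRA=%u FRB=%u\n",
               FRT(instr), FRA(instr), FRB(instr)); /* FRT=1 FRA=0 FRB=0 */
        return 0;
    }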
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 2c9441ee6bb8..82b1ff759e26 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -41,7 +41,7 @@ int icache_44x_need_flush;
41 41
42unsigned long tlb_47x_boltmap[1024/8]; 42unsigned long tlb_47x_boltmap[1024/8];
43 43
44static void __cpuinit ppc44x_update_tlb_hwater(void) 44static void ppc44x_update_tlb_hwater(void)
45{ 45{
46 extern unsigned int tlb_44x_patch_hwater_D[]; 46 extern unsigned int tlb_44x_patch_hwater_D[];
47 extern unsigned int tlb_44x_patch_hwater_I[]; 47 extern unsigned int tlb_44x_patch_hwater_I[];
@@ -134,7 +134,7 @@ static void __init ppc47x_update_boltmap(void)
134/* 134/*
135 * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU 135 * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
136 */ 136 */
137static void __cpuinit ppc47x_pin_tlb(unsigned int virt, unsigned int phys) 137static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
138{ 138{
139 unsigned int rA; 139 unsigned int rA;
140 int bolted; 140 int bolted;
@@ -229,7 +229,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
229} 229}
230 230
231#ifdef CONFIG_SMP 231#ifdef CONFIG_SMP
232void __cpuinit mmu_init_secondary(int cpu) 232void mmu_init_secondary(int cpu)
233{ 233{
234 unsigned long addr; 234 unsigned long addr;
235 unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); 235 unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index cf16b5733eaa..51230ee6a407 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -6,17 +6,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
6 6
7ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) 7ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
8 8
9obj-y := fault.o mem.o pgtable.o gup.o \ 9obj-y := fault.o mem.o pgtable.o gup.o mmap.o \
10 init_$(CONFIG_WORD_SIZE).o \ 10 init_$(CONFIG_WORD_SIZE).o \
11 pgtable_$(CONFIG_WORD_SIZE).o 11 pgtable_$(CONFIG_WORD_SIZE).o
12obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ 12obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
13 tlb_nohash_low.o 13 tlb_nohash_low.o
14obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o 14obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o
15obj-$(CONFIG_PPC64) += mmap_64.o
16hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o 15hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
17obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \ 16obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \
18 slb_low.o slb.o stab.o \ 17 slb_low.o slb.o stab.o \
19 mmap_64.o $(hash64-y) 18 $(hash64-y)
20obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o 19obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o
21obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \ 20obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \
22 tlb_hash$(CONFIG_WORD_SIZE).o \ 21 tlb_hash$(CONFIG_WORD_SIZE).o \
@@ -28,11 +27,12 @@ obj-$(CONFIG_44x) += 44x_mmu.o
28obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o 27obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o
29obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o 28obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
30obj-$(CONFIG_PPC_MM_SLICES) += slice.o 29obj-$(CONFIG_PPC_MM_SLICES) += slice.o
31ifeq ($(CONFIG_HUGETLB_PAGE),y)
32obj-y += hugetlbpage.o 30obj-y += hugetlbpage.o
31ifeq ($(CONFIG_HUGETLB_PAGE),y)
33obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o 32obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o
34obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o 33obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
35endif 34endif
35obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
36obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o 36obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
37obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o 37obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
38obj-$(CONFIG_HIGHMEM) += highmem.o 38obj-$(CONFIG_HIGHMEM) += highmem.o
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index 4b921affa495..49822d90ea96 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -34,7 +34,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
34 34
35 ptep = pte_offset_kernel(&pmd, addr); 35 ptep = pte_offset_kernel(&pmd, addr);
36 do { 36 do {
37 pte_t pte = *ptep; 37 pte_t pte = ACCESS_ONCE(*ptep);
38 struct page *page; 38 struct page *page;
39 39
40 if ((pte_val(pte) & mask) != result) 40 if ((pte_val(pte) & mask) != result)
@@ -63,12 +63,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
63 63
64 pmdp = pmd_offset(&pud, addr); 64 pmdp = pmd_offset(&pud, addr);
65 do { 65 do {
66 pmd_t pmd = *pmdp; 66 pmd_t pmd = ACCESS_ONCE(*pmdp);
67 67
68 next = pmd_addr_end(addr, end); 68 next = pmd_addr_end(addr, end);
69 if (pmd_none(pmd)) 69 /*
70 * If we find a splitting transparent hugepage we
71 * return zero. That will result in taking the slow
 72 * path, which will call wait_split_huge_page()
 73 * if the pmd is still in the splitting state.
74 */
75 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
70 return 0; 76 return 0;
71 if (pmd_huge(pmd)) { 77 if (pmd_huge(pmd) || pmd_large(pmd)) {
72 if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next, 78 if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
73 write, pages, nr)) 79 write, pages, nr))
74 return 0; 80 return 0;
@@ -91,7 +97,7 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
91 97
92 pudp = pud_offset(&pgd, addr); 98 pudp = pud_offset(&pgd, addr);
93 do { 99 do {
94 pud_t pud = *pudp; 100 pud_t pud = ACCESS_ONCE(*pudp);
95 101
96 next = pud_addr_end(addr, end); 102 next = pud_addr_end(addr, end);
97 if (pud_none(pud)) 103 if (pud_none(pud))
@@ -154,7 +160,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
154 160
155 pgdp = pgd_offset(mm, addr); 161 pgdp = pgd_offset(mm, addr);
156 do { 162 do {
157 pgd_t pgd = *pgdp; 163 pgd_t pgd = ACCESS_ONCE(*pgdp);
158 164
159 pr_devel(" %016lx: normal pgd %p\n", addr, 165 pr_devel(" %016lx: normal pgd %p\n", addr,
160 (void *)pgd_val(pgd)); 166 (void *)pgd_val(pgd));
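The ACCESS_ONCE() conversions make the lockless walk test the same snapshot it later acts on; re-reading *pmdp between checks could observe a THP split or collapse landing halfway through. A toy illustration of that snapshot rule with a stand-in pmd type (none of the kernel types or predicates are used here):

    #include <stdio.h>

    typedef struct { unsigned long val; } pmd_t; /* stand-in type */

    static int walk_one_pmd(pmd_t *pmdp)
    {
        /* take exactly one snapshot of the entry... */
        pmd_t pmd = *(volatile pmd_t *)pmdp;

        /* ...and test only the local copy from here on, so a
         * concurrent update of *pmdp cannot change the answer
         * between the checks */
        if (pmd.val == 0) /* pmd_none() stand-in */
            return 0;
        return 1;
    }

    int main(void)
    {
        pmd_t entry = { .val = 0x1000 };

        printf("%d\n", walk_one_pmd(&entry)); /* prints 1 */
        return 0;
    }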
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 0e980acae67c..d3cbda62857b 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -289,9 +289,10 @@ htab_modify_pte:
289 289
290 /* Call ppc_md.hpte_updatepp */ 290 /* Call ppc_md.hpte_updatepp */
291 mr r5,r29 /* vpn */ 291 mr r5,r29 /* vpn */
292 li r6,MMU_PAGE_4K /* page size */ 292 li r6,MMU_PAGE_4K /* base page size */
293 ld r7,STK_PARAM(R9)(r1) /* segment size */ 293 li r7,MMU_PAGE_4K /* actual page size */
294 ld r8,STK_PARAM(R8)(r1) /* get "local" param */ 294 ld r8,STK_PARAM(R9)(r1) /* segment size */
295 ld r9,STK_PARAM(R8)(r1) /* get "local" param */
295_GLOBAL(htab_call_hpte_updatepp) 296_GLOBAL(htab_call_hpte_updatepp)
296 bl . /* Patched by htab_finish_init() */ 297 bl . /* Patched by htab_finish_init() */
297 298
@@ -649,9 +650,10 @@ htab_modify_pte:
649 650
650 /* Call ppc_md.hpte_updatepp */ 651 /* Call ppc_md.hpte_updatepp */
651 mr r5,r29 /* vpn */ 652 mr r5,r29 /* vpn */
652 li r6,MMU_PAGE_4K /* page size */ 653 li r6,MMU_PAGE_4K /* base page size */
653 ld r7,STK_PARAM(R9)(r1) /* segment size */ 654 li r7,MMU_PAGE_4K /* actual page size */
654 ld r8,STK_PARAM(R8)(r1) /* get "local" param */ 655 ld r8,STK_PARAM(R9)(r1) /* segment size */
656 ld r9,STK_PARAM(R8)(r1) /* get "local" param */
655_GLOBAL(htab_call_hpte_updatepp) 657_GLOBAL(htab_call_hpte_updatepp)
656 bl . /* patched by htab_finish_init() */ 658 bl . /* patched by htab_finish_init() */
657 659
@@ -937,9 +939,10 @@ ht64_modify_pte:
937 939
938 /* Call ppc_md.hpte_updatepp */ 940 /* Call ppc_md.hpte_updatepp */
939 mr r5,r29 /* vpn */ 941 mr r5,r29 /* vpn */
940 li r6,MMU_PAGE_64K 942 li r6,MMU_PAGE_64K /* base page size */
941 ld r7,STK_PARAM(R9)(r1) /* segment size */ 943 li r7,MMU_PAGE_64K /* actual page size */
942 ld r8,STK_PARAM(R8)(r1) /* get "local" param */ 944 ld r8,STK_PARAM(R9)(r1) /* segment size */
945 ld r9,STK_PARAM(R8)(r1) /* get "local" param */
943_GLOBAL(ht64_call_hpte_updatepp) 946_GLOBAL(ht64_call_hpte_updatepp)
944 bl . /* patched by htab_finish_init() */ 947 bl . /* patched by htab_finish_init() */
945 948
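All three assembly call sites now load two page-size arguments, matching the widened hpte_updatepp() hook: a base page size that feeds the hash computation and an actual page size that feeds the tlbie. For regular mappings, as here, the two are equal; only the THP path passes a larger actual size. The call shape after this change looks roughly like this C fragment (a sketch, not a literal call site):

    ret = ppc_md.hpte_updatepp(slot, newpp, vpn,
                               MMU_PAGE_4K, /* base page size */
                               MMU_PAGE_4K, /* actual page size */
                               ssize, local);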
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 4c122c3f1623..3f0c30ae4791 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -273,61 +273,15 @@ static long native_hpte_remove(unsigned long hpte_group)
273 return i; 273 return i;
274} 274}
275 275
276static inline int __hpte_actual_psize(unsigned int lp, int psize)
277{
278 int i, shift;
279 unsigned int mask;
280
281 /* start from 1 ignoring MMU_PAGE_4K */
282 for (i = 1; i < MMU_PAGE_COUNT; i++) {
283
284 /* invalid penc */
285 if (mmu_psize_defs[psize].penc[i] == -1)
286 continue;
287 /*
288 * encoding bits per actual page size
289 * PTE LP actual page size
290 * rrrr rrrz >=8KB
291 * rrrr rrzz >=16KB
292 * rrrr rzzz >=32KB
293 * rrrr zzzz >=64KB
294 * .......
295 */
296 shift = mmu_psize_defs[i].shift - LP_SHIFT;
297 if (shift > LP_BITS)
298 shift = LP_BITS;
299 mask = (1 << shift) - 1;
300 if ((lp & mask) == mmu_psize_defs[psize].penc[i])
301 return i;
302 }
303 return -1;
304}
305
306static inline int hpte_actual_psize(struct hash_pte *hptep, int psize)
307{
308 /* Look at the 8 bit LP value */
309 unsigned int lp = (hptep->r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
310
311 if (!(hptep->v & HPTE_V_VALID))
312 return -1;
313
314 /* First check if it is large page */
315 if (!(hptep->v & HPTE_V_LARGE))
316 return MMU_PAGE_4K;
317
318 return __hpte_actual_psize(lp, psize);
319}
320
321static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, 276static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
322 unsigned long vpn, int psize, int ssize, 277 unsigned long vpn, int bpsize,
323 int local) 278 int apsize, int ssize, int local)
324{ 279{
325 struct hash_pte *hptep = htab_address + slot; 280 struct hash_pte *hptep = htab_address + slot;
326 unsigned long hpte_v, want_v; 281 unsigned long hpte_v, want_v;
327 int ret = 0; 282 int ret = 0;
328 int actual_psize;
329 283
330 want_v = hpte_encode_avpn(vpn, psize, ssize); 284 want_v = hpte_encode_avpn(vpn, bpsize, ssize);
331 285
332 DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", 286 DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
333 vpn, want_v & HPTE_V_AVPN, slot, newpp); 287 vpn, want_v & HPTE_V_AVPN, slot, newpp);
@@ -335,7 +289,6 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
335 native_lock_hpte(hptep); 289 native_lock_hpte(hptep);
336 290
337 hpte_v = hptep->v; 291 hpte_v = hptep->v;
338 actual_psize = hpte_actual_psize(hptep, psize);
339 /* 292 /*
340 * We need to invalidate the TLB always because hpte_remove doesn't do 293 * We need to invalidate the TLB always because hpte_remove doesn't do
341 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less 294 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -343,12 +296,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
343 * (hpte_remove) because we assume the old translation is still 296 * (hpte_remove) because we assume the old translation is still
344 * technically "valid". 297 * technically "valid".
345 */ 298 */
346 if (actual_psize < 0) { 299 if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
347 actual_psize = psize;
348 ret = -1;
349 goto err_out;
350 }
351 if (!HPTE_V_COMPARE(hpte_v, want_v)) {
352 DBG_LOW(" -> miss\n"); 300 DBG_LOW(" -> miss\n");
353 ret = -1; 301 ret = -1;
354 } else { 302 } else {
@@ -357,11 +305,10 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
357 hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | 305 hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
358 (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)); 306 (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
359 } 307 }
360err_out:
361 native_unlock_hpte(hptep); 308 native_unlock_hpte(hptep);
362 309
363 /* Ensure it is out of the tlb too. */ 310 /* Ensure it is out of the tlb too. */
364 tlbie(vpn, psize, actual_psize, ssize, local); 311 tlbie(vpn, bpsize, apsize, ssize, local);
365 312
366 return ret; 313 return ret;
367} 314}
@@ -402,7 +349,6 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
402static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, 349static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
403 int psize, int ssize) 350 int psize, int ssize)
404{ 351{
405 int actual_psize;
406 unsigned long vpn; 352 unsigned long vpn;
407 unsigned long vsid; 353 unsigned long vsid;
408 long slot; 354 long slot;
@@ -415,36 +361,33 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
415 if (slot == -1) 361 if (slot == -1)
416 panic("could not find page to bolt\n"); 362 panic("could not find page to bolt\n");
417 hptep = htab_address + slot; 363 hptep = htab_address + slot;
418 actual_psize = hpte_actual_psize(hptep, psize);
419 if (actual_psize < 0)
420 actual_psize = psize;
421 364
422 /* Update the HPTE */ 365 /* Update the HPTE */
423 hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | 366 hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
424 (newpp & (HPTE_R_PP | HPTE_R_N)); 367 (newpp & (HPTE_R_PP | HPTE_R_N));
425 368 /*
426 /* Ensure it is out of the tlb too. */ 369 * Ensure it is out of the tlb too. Bolted entries' base and
427 tlbie(vpn, psize, actual_psize, ssize, 0); 370 * actual page sizes are always the same.
371 */
372 tlbie(vpn, psize, psize, ssize, 0);
428} 373}
429 374
430static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, 375static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
431 int psize, int ssize, int local) 376 int bpsize, int apsize, int ssize, int local)
432{ 377{
433 struct hash_pte *hptep = htab_address + slot; 378 struct hash_pte *hptep = htab_address + slot;
434 unsigned long hpte_v; 379 unsigned long hpte_v;
435 unsigned long want_v; 380 unsigned long want_v;
436 unsigned long flags; 381 unsigned long flags;
437 int actual_psize;
438 382
439 local_irq_save(flags); 383 local_irq_save(flags);
440 384
441 DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); 385 DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
442 386
443 want_v = hpte_encode_avpn(vpn, psize, ssize); 387 want_v = hpte_encode_avpn(vpn, bpsize, ssize);
444 native_lock_hpte(hptep); 388 native_lock_hpte(hptep);
445 hpte_v = hptep->v; 389 hpte_v = hptep->v;
446 390
447 actual_psize = hpte_actual_psize(hptep, psize);
448 /* 391 /*
449 * We need to invalidate the TLB always because hpte_remove doesn't do 392 * We need to invalidate the TLB always because hpte_remove doesn't do
450 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less 393 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -452,23 +395,120 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
452 * (hpte_remove) because we assume the old translation is still 395 * (hpte_remove) because we assume the old translation is still
453 * technically "valid". 396 * technically "valid".
454 */ 397 */
455 if (actual_psize < 0) { 398 if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
456 actual_psize = psize;
457 native_unlock_hpte(hptep);
458 goto err_out;
459 }
460 if (!HPTE_V_COMPARE(hpte_v, want_v))
461 native_unlock_hpte(hptep); 399 native_unlock_hpte(hptep);
462 else 400 else
463 /* Invalidate the hpte. NOTE: this also unlocks it */ 401 /* Invalidate the hpte. NOTE: this also unlocks it */
464 hptep->v = 0; 402 hptep->v = 0;
465 403
466err_out:
467 /* Invalidate the TLB */ 404 /* Invalidate the TLB */
468 tlbie(vpn, psize, actual_psize, ssize, local); 405 tlbie(vpn, bpsize, apsize, ssize, local);
406
407 local_irq_restore(flags);
408}
409
410static void native_hugepage_invalidate(struct mm_struct *mm,
411 unsigned char *hpte_slot_array,
412 unsigned long addr, int psize)
413{
414 int ssize = 0, i;
415 int lock_tlbie;
416 struct hash_pte *hptep;
417 int actual_psize = MMU_PAGE_16M;
418 unsigned int max_hpte_count, valid;
419 unsigned long flags, s_addr = addr;
420 unsigned long hpte_v, want_v, shift;
421 unsigned long hidx, vpn = 0, vsid, hash, slot;
422
423 shift = mmu_psize_defs[psize].shift;
424 max_hpte_count = 1U << (PMD_SHIFT - shift);
425
426 local_irq_save(flags);
427 for (i = 0; i < max_hpte_count; i++) {
428 valid = hpte_valid(hpte_slot_array, i);
429 if (!valid)
430 continue;
431 hidx = hpte_hash_index(hpte_slot_array, i);
432
433 /* get the vpn */
434 addr = s_addr + (i * (1ul << shift));
435 if (!is_kernel_addr(addr)) {
436 ssize = user_segment_size(addr);
437 vsid = get_vsid(mm->context.id, addr, ssize);
438 WARN_ON(vsid == 0);
439 } else {
440 vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
441 ssize = mmu_kernel_ssize;
442 }
443
444 vpn = hpt_vpn(addr, vsid, ssize);
445 hash = hpt_hash(vpn, shift, ssize);
446 if (hidx & _PTEIDX_SECONDARY)
447 hash = ~hash;
448
449 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
450 slot += hidx & _PTEIDX_GROUP_IX;
451
452 hptep = htab_address + slot;
453 want_v = hpte_encode_avpn(vpn, psize, ssize);
454 native_lock_hpte(hptep);
455 hpte_v = hptep->v;
456
457 /* Even if we miss, we need to invalidate the TLB */
458 if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
459 native_unlock_hpte(hptep);
460 else
461 /* Invalidate the hpte. NOTE: this also unlocks it */
462 hptep->v = 0;
463 }
464 /*
465 * Since this is a hugepage, we just need a single tlbie;
466 * use the last vpn.
467 */
468 lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
469 if (lock_tlbie)
470 raw_spin_lock(&native_tlbie_lock);
471
472 asm volatile("ptesync":::"memory");
473 __tlbie(vpn, psize, actual_psize, ssize);
474 asm volatile("eieio; tlbsync; ptesync":::"memory");
475
476 if (lock_tlbie)
477 raw_spin_unlock(&native_tlbie_lock);
478
469 local_irq_restore(flags); 479 local_irq_restore(flags);
470} 480}
471 481
482static inline int __hpte_actual_psize(unsigned int lp, int psize)
483{
484 int i, shift;
485 unsigned int mask;
486
487 /* start from 1 ignoring MMU_PAGE_4K */
488 for (i = 1; i < MMU_PAGE_COUNT; i++) {
489
490 /* invalid penc */
491 if (mmu_psize_defs[psize].penc[i] == -1)
492 continue;
493 /*
494 * encoding bits per actual page size
495 * PTE LP actual page size
496 * rrrr rrrz >=8KB
497 * rrrr rrzz >=16KB
498 * rrrr rzzz >=32KB
499 * rrrr zzzz >=64KB
500 * .......
501 */
502 shift = mmu_psize_defs[i].shift - LP_SHIFT;
503 if (shift > LP_BITS)
504 shift = LP_BITS;
505 mask = (1 << shift) - 1;
506 if ((lp & mask) == mmu_psize_defs[psize].penc[i])
507 return i;
508 }
509 return -1;
510}
511
472static void hpte_decode(struct hash_pte *hpte, unsigned long slot, 512static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
473 int *psize, int *apsize, int *ssize, unsigned long *vpn) 513 int *psize, int *apsize, int *ssize, unsigned long *vpn)
474{ 514{
@@ -672,4 +712,5 @@ void __init hpte_init_native(void)
672 ppc_md.hpte_remove = native_hpte_remove; 712 ppc_md.hpte_remove = native_hpte_remove;
673 ppc_md.hpte_clear_all = native_hpte_clear; 713 ppc_md.hpte_clear_all = native_hpte_clear;
674 ppc_md.flush_hash_range = native_flush_hash_range; 714 ppc_md.flush_hash_range = native_flush_hash_range;
715 ppc_md.hugepage_invalidate = native_hugepage_invalidate;
675} 716}
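native_hugepage_invalidate() visits every sub-page HPTE that may back a single 16M transparent hugepage, using the per-slot validity and hash-index bits recorded in hpte_slot_array. Plugging assumed shift values into the max_hpte_count expression above shows the scale: a 64K base page size under a 16M pmd gives 256 candidate slots. A stand-alone check of that arithmetic:

    #include <stdio.h>

    int main(void)
    {
        /* assumed values for a 64K-base-page, 16M-THP configuration */
        unsigned int pmd_shift = 24;  /* one pmd covers 16M */
        unsigned int page_shift = 16; /* 64K base pages */
        unsigned int max_hpte_count = 1U << (pmd_shift - page_shift);

        printf("%u HPTE slots per hugepage\n", max_hpte_count); /* 256 */
        return 0;
    }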
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index e303a6d74e3a..6ecc38bd5b24 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -807,7 +807,7 @@ void __init early_init_mmu(void)
807} 807}
808 808
809#ifdef CONFIG_SMP 809#ifdef CONFIG_SMP
810void __cpuinit early_init_mmu_secondary(void) 810void early_init_mmu_secondary(void)
811{ 811{
812 /* Initialize hash table for that CPU */ 812 /* Initialize hash table for that CPU */
813 if (!firmware_has_feature(FW_FEATURE_LPAR)) 813 if (!firmware_has_feature(FW_FEATURE_LPAR))
@@ -1050,13 +1050,26 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
1050 goto bail; 1050 goto bail;
1051 } 1051 }
1052 1052
1053#ifdef CONFIG_HUGETLB_PAGE
1054 if (hugeshift) { 1053 if (hugeshift) {
1055 rc = __hash_page_huge(ea, access, vsid, ptep, trap, local, 1054 if (pmd_trans_huge(*(pmd_t *)ptep))
1056 ssize, hugeshift, psize); 1055 rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
1056 trap, local, ssize, psize);
1057#ifdef CONFIG_HUGETLB_PAGE
1058 else
1059 rc = __hash_page_huge(ea, access, vsid, ptep, trap,
1060 local, ssize, hugeshift, psize);
1061#else
1062 else {
1063 /*
1064 * if we have hugeshift, and is not transhuge with
1065 * hugetlb disabled, something is really wrong.
1066 */
1067 rc = 1;
1068 WARN_ON(1);
1069 }
1070#endif
1057 goto bail; 1071 goto bail;
1058 } 1072 }
1059#endif /* CONFIG_HUGETLB_PAGE */
1060 1073
1061#ifndef CONFIG_PPC_64K_PAGES 1074#ifndef CONFIG_PPC_64K_PAGES
1062 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); 1075 DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
@@ -1145,6 +1158,7 @@ EXPORT_SYMBOL_GPL(hash_page);
1145void hash_preload(struct mm_struct *mm, unsigned long ea, 1158void hash_preload(struct mm_struct *mm, unsigned long ea,
1146 unsigned long access, unsigned long trap) 1159 unsigned long access, unsigned long trap)
1147{ 1160{
1161 int hugepage_shift;
1148 unsigned long vsid; 1162 unsigned long vsid;
1149 pgd_t *pgdir; 1163 pgd_t *pgdir;
1150 pte_t *ptep; 1164 pte_t *ptep;
@@ -1166,10 +1180,27 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
1166 pgdir = mm->pgd; 1180 pgdir = mm->pgd;
1167 if (pgdir == NULL) 1181 if (pgdir == NULL)
1168 return; 1182 return;
1169 ptep = find_linux_pte(pgdir, ea); 1183
1170 if (!ptep) 1184 /* Get VSID */
1185 ssize = user_segment_size(ea);
1186 vsid = get_vsid(mm->context.id, ea, ssize);
1187 if (!vsid)
1171 return; 1188 return;
1189 /*
1190 * Hash doesn't like irqs. Walking the linux page table with irqs
1191 * disabled saves us from holding multiple locks.
1192 */
1193 local_irq_save(flags);
1194
1195 /*
1196 * THP pages use update_mmu_cache_pmd. We don't do
1197 * hash preload there. Hence can ignore THP here
1198 */
1199 ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
1200 if (!ptep)
1201 goto out_exit;
1172 1202
1203 WARN_ON(hugepage_shift);
1173#ifdef CONFIG_PPC_64K_PAGES 1204#ifdef CONFIG_PPC_64K_PAGES
1174 /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on 1205 /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
1175 * a 64K kernel), then we don't preload, hash_page() will take 1206 * a 64K kernel), then we don't preload, hash_page() will take
@@ -1178,18 +1209,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
1178 * page size demotion here 1209 * page size demotion here
1179 */ 1210 */
1180 if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) 1211 if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
1181 return; 1212 goto out_exit;
1182#endif /* CONFIG_PPC_64K_PAGES */ 1213#endif /* CONFIG_PPC_64K_PAGES */
1183 1214
1184 /* Get VSID */
1185 ssize = user_segment_size(ea);
1186 vsid = get_vsid(mm->context.id, ea, ssize);
1187 if (!vsid)
1188 return;
1189
1190 /* Hash doesn't like irqs */
1191 local_irq_save(flags);
1192
1193 /* Is that local to this CPU ? */ 1215 /* Is that local to this CPU ? */
1194 if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) 1216 if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
1195 local = 1; 1217 local = 1;
@@ -1211,7 +1233,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
1211 mm->context.user_psize, 1233 mm->context.user_psize,
1212 mm->context.user_psize, 1234 mm->context.user_psize,
1213 pte_val(*ptep)); 1235 pte_val(*ptep));
1214 1236out_exit:
1215 local_irq_restore(flags); 1237 local_irq_restore(flags);
1216} 1238}
1217 1239
@@ -1232,7 +1254,11 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
1232 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 1254 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1233 slot += hidx & _PTEIDX_GROUP_IX; 1255 slot += hidx & _PTEIDX_GROUP_IX;
1234 DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx); 1256 DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
1235 ppc_md.hpte_invalidate(slot, vpn, psize, ssize, local); 1257 /*
1258 * We use the same base page size and actual psize because we don't
1259 * use these functions for hugepages.
1260 */
1261 ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local);
1236 } pte_iterate_hashed_end(); 1262 } pte_iterate_hashed_end();
1237 1263
1238#ifdef CONFIG_PPC_TRANSACTIONAL_MEM 1264#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -1365,7 +1391,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
1365 hash = ~hash; 1391 hash = ~hash;
1366 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 1392 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1367 slot += hidx & _PTEIDX_GROUP_IX; 1393 slot += hidx & _PTEIDX_GROUP_IX;
1368 ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_kernel_ssize, 0); 1394 ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
1395 mmu_kernel_ssize, 0);
1369} 1396}
1370 1397
1371void kernel_map_pages(struct page *page, int numpages, int enable) 1398void kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644
index 000000000000..34de9e0cdc34
--- /dev/null
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -0,0 +1,175 @@
1/*
2 * Copyright IBM Corporation, 2013
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15/*
16 * PPC64 THP Support for hash based MMUs
17 */
18#include <linux/mm.h>
19#include <asm/machdep.h>
20
21int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
22 pmd_t *pmdp, unsigned long trap, int local, int ssize,
23 unsigned int psize)
24{
25 unsigned int index, valid;
26 unsigned char *hpte_slot_array;
27 unsigned long rflags, pa, hidx;
28 unsigned long old_pmd, new_pmd;
29 int ret, lpsize = MMU_PAGE_16M;
30 unsigned long vpn, hash, shift, slot;
31
32 /*
33 * atomically mark the linux large page PMD busy and dirty
34 */
35 do {
36 old_pmd = pmd_val(*pmdp);
37 /* If PMD busy, retry the access */
38 if (unlikely(old_pmd & _PAGE_BUSY))
39 return 0;
40 /* If PMD is trans splitting retry the access */
41 if (unlikely(old_pmd & _PAGE_SPLITTING))
42 return 0;
43 /* If PMD permissions don't match, take page fault */
44 if (unlikely(access & ~old_pmd))
45 return 1;
46 /*
47 * Try to lock the PTE, add ACCESSED and DIRTY if it was
48 * a write access
49 */
50 new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
51 if (access & _PAGE_RW)
52 new_pmd |= _PAGE_DIRTY;
53 } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
54 old_pmd, new_pmd));
55 /*
56 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
57 * need to add in 0x1 if it's a read-only user page
58 */
59 rflags = new_pmd & _PAGE_USER;
60 if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
61 (new_pmd & _PAGE_DIRTY)))
62 rflags |= 0x1;
63 /*
64 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
65 */
66 rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
67
68#if 0
69 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
70
71 /*
72 * No CPU has hugepages but lacks no execute, so we
73 * don't need to worry about that case
74 */
75 rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
76 }
77#endif
78 /*
79 * Find the slot index details for this ea, using base page size.
80 */
81 shift = mmu_psize_defs[psize].shift;
82 index = (ea & ~HPAGE_PMD_MASK) >> shift;
83 BUG_ON(index >= 4096);
84
85 vpn = hpt_vpn(ea, vsid, ssize);
86 hash = hpt_hash(vpn, shift, ssize);
87 hpte_slot_array = get_hpte_slot_array(pmdp);
88
89 valid = hpte_valid(hpte_slot_array, index);
90 if (valid) {
91 /* update the hpte bits */
92 hidx = hpte_hash_index(hpte_slot_array, index);
93 if (hidx & _PTEIDX_SECONDARY)
94 hash = ~hash;
95 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
96 slot += hidx & _PTEIDX_GROUP_IX;
97
98 ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
99 psize, lpsize, ssize, local);
100 /*
101 * We failed to update, try to insert a new entry.
102 */
103 if (ret == -1) {
104 /*
105 * The large pte is marked busy, so we can be sure
106 * nobody is looking at hpte_slot_array; hence we can
107 * safely update it here.
108 */
109 valid = 0;
110 new_pmd &= ~_PAGE_HPTEFLAGS;
111 hpte_slot_array[index] = 0;
112 } else
113 /* clear the busy bits and set the hash pte bits */
114 new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
115 }
116
117 if (!valid) {
118 unsigned long hpte_group;
119
120 /* insert new entry */
121 pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
122repeat:
123 hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
124
125 /* clear the busy bits and set the hash pte bits */
126 new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
127
128 /* Add in WIMG bits */
129 rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
130 _PAGE_COHERENT | _PAGE_GUARDED));
131
132 /* Insert into the hash table, primary slot */
133 slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
134 psize, lpsize, ssize);
135 /*
136 * Primary is full, try the secondary
137 */
138 if (unlikely(slot == -1)) {
139 hpte_group = ((~hash & htab_hash_mask) *
140 HPTES_PER_GROUP) & ~0x7UL;
141 slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
142 rflags, HPTE_V_SECONDARY,
143 psize, lpsize, ssize);
144 if (slot == -1) {
145 if (mftb() & 0x1)
146 hpte_group = ((hash & htab_hash_mask) *
147 HPTES_PER_GROUP) & ~0x7UL;
148
149 ppc_md.hpte_remove(hpte_group);
150 goto repeat;
151 }
152 }
153 /*
154 * Hypervisor failure. Restore old pmd and return -1
155 * similar to __hash_page_*
156 */
157 if (unlikely(slot == -2)) {
158 *pmdp = __pmd(old_pmd);
159 hash_failure_debug(ea, access, vsid, trap, ssize,
160 psize, lpsize, old_pmd);
161 return -1;
162 }
163 /*
164 * The large pte is marked busy, so we can be sure
165 * nobody is looking at hpte_slot_array; hence we can
166 * safely update it here.
167 */
168 mark_hpte_slot_valid(hpte_slot_array, index, slot);
169 }
170 /*
171 * No need to use ldarx/stdcx here
172 */
173 *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
174 return 0;
175}
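__hash_page_thp() opens with the usual lock-free "mark busy, then publish" idiom: loop on a 64-bit cmpxchg until _PAGE_BUSY and _PAGE_ACCESSED are set atomically, bailing out so the access is retried if another CPU already holds the busy bit. A compacted user-space sketch of that loop using the GCC/Clang atomic builtin (the flag values are placeholders, not the kernel's):

    #include <stdio.h>

    #define PAGE_BUSY     0x800UL /* placeholder flag values */
    #define PAGE_ACCESSED 0x008UL

    /* Returns 0 if the entry was already busy (caller retries the
     * access), 1 once we own it with ACCESSED set. */
    static int mark_busy(unsigned long *pmdp)
    {
        unsigned long old, new;

        do {
            old = *pmdp;
            if (old & PAGE_BUSY)
                return 0;
            new = old | PAGE_BUSY | PAGE_ACCESSED;
        } while (!__atomic_compare_exchange_n(pmdp, &old, new, 0,
                                              __ATOMIC_SEQ_CST,
                                              __ATOMIC_SEQ_CST));
        return 1;
    }

    int main(void)
    {
        unsigned long pmd = 0;

        printf("%d %#lx\n", mark_busy(&pmd), pmd); /* 1 0x808 */
        return 0;
    }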
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 0f1d94a1fb82..0b7fb6761015 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -81,7 +81,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
81 slot += (old_pte & _PAGE_F_GIX) >> 12; 81 slot += (old_pte & _PAGE_F_GIX) >> 12;
82 82
83 if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, 83 if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
84 ssize, local) == -1) 84 mmu_psize, ssize, local) == -1)
85 old_pte &= ~_PAGE_HPTEFLAGS; 85 old_pte &= ~_PAGE_HPTEFLAGS;
86 } 86 }
87 87
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 4210549ac95e..834ca8eb38f2 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -21,6 +21,9 @@
21#include <asm/pgalloc.h> 21#include <asm/pgalloc.h>
22#include <asm/tlb.h> 22#include <asm/tlb.h>
23#include <asm/setup.h> 23#include <asm/setup.h>
24#include <asm/hugetlb.h>
25
26#ifdef CONFIG_HUGETLB_PAGE
24 27
25#define PAGE_SHIFT_64K 16 28#define PAGE_SHIFT_64K 16
26#define PAGE_SHIFT_16M 24 29#define PAGE_SHIFT_16M 24
@@ -100,68 +103,9 @@ int pgd_huge(pgd_t pgd)
100} 103}
101#endif 104#endif
102 105
103/*
104 * We have 4 cases for pgds and pmds:
105 * (1) invalid (all zeroes)
106 * (2) pointer to next table, as normal; bottom 6 bits == 0
107 * (3) leaf pte for huge page, bottom two bits != 00
108 * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
109 */
110pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
111{
112 pgd_t *pg;
113 pud_t *pu;
114 pmd_t *pm;
115 pte_t *ret_pte;
116 hugepd_t *hpdp = NULL;
117 unsigned pdshift = PGDIR_SHIFT;
118
119 if (shift)
120 *shift = 0;
121
122 pg = pgdir + pgd_index(ea);
123
124 if (pgd_huge(*pg)) {
125 ret_pte = (pte_t *) pg;
126 goto out;
127 } else if (is_hugepd(pg))
128 hpdp = (hugepd_t *)pg;
129 else if (!pgd_none(*pg)) {
130 pdshift = PUD_SHIFT;
131 pu = pud_offset(pg, ea);
132
133 if (pud_huge(*pu)) {
134 ret_pte = (pte_t *) pu;
135 goto out;
136 } else if (is_hugepd(pu))
137 hpdp = (hugepd_t *)pu;
138 else if (!pud_none(*pu)) {
139 pdshift = PMD_SHIFT;
140 pm = pmd_offset(pu, ea);
141
142 if (pmd_huge(*pm)) {
143 ret_pte = (pte_t *) pm;
144 goto out;
145 } else if (is_hugepd(pm))
146 hpdp = (hugepd_t *)pm;
147 else if (!pmd_none(*pm))
148 return pte_offset_kernel(pm, ea);
149 }
150 }
151 if (!hpdp)
152 return NULL;
153
154 ret_pte = hugepte_offset(hpdp, ea, pdshift);
155 pdshift = hugepd_shift(*hpdp);
156out:
157 if (shift)
158 *shift = pdshift;
159 return ret_pte;
160}
161EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
162
163pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 106pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
164{ 107{
108 /* Only called for hugetlbfs pages, hence we can ignore THP */
165 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); 109 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
166} 110}
167 111
@@ -736,11 +680,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
736 struct page *page; 680 struct page *page;
737 unsigned shift; 681 unsigned shift;
738 unsigned long mask; 682 unsigned long mask;
739 683 /*
684 * Transparent hugepages are handled by generic code. We can skip them
685 * here.
686 */
740 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); 687 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
741 688
742 /* Verify it is a huge page else bail. */ 689 /* Verify it is a huge page else bail. */
743 if (!ptep || !shift) 690 if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
744 return ERR_PTR(-EINVAL); 691 return ERR_PTR(-EINVAL);
745 692
746 mask = (1UL << shift) - 1; 693 mask = (1UL << shift) - 1;
@@ -759,69 +706,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
759 return NULL; 706 return NULL;
760} 707}
761 708
762int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
763 unsigned long end, int write, struct page **pages, int *nr)
764{
765 unsigned long mask;
766 unsigned long pte_end;
767 struct page *head, *page, *tail;
768 pte_t pte;
769 int refs;
770
771 pte_end = (addr + sz) & ~(sz-1);
772 if (pte_end < end)
773 end = pte_end;
774
775 pte = *ptep;
776 mask = _PAGE_PRESENT | _PAGE_USER;
777 if (write)
778 mask |= _PAGE_RW;
779
780 if ((pte_val(pte) & mask) != mask)
781 return 0;
782
783 /* hugepages are never "special" */
784 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
785
786 refs = 0;
787 head = pte_page(pte);
788
789 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
790 tail = page;
791 do {
792 VM_BUG_ON(compound_head(page) != head);
793 pages[*nr] = page;
794 (*nr)++;
795 page++;
796 refs++;
797 } while (addr += PAGE_SIZE, addr != end);
798
799 if (!page_cache_add_speculative(head, refs)) {
800 *nr -= refs;
801 return 0;
802 }
803
804 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
805 /* Could be optimized better */
806 *nr -= refs;
807 while (refs--)
808 put_page(head);
809 return 0;
810 }
811
812 /*
813 * Any tail page need their mapcount reference taken before we
814 * return.
815 */
816 while (refs--) {
817 if (PageTail(tail))
818 get_huge_page_tail(tail);
819 tail++;
820 }
821
822 return 1;
823}
824
825static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, 709static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
826 unsigned long sz) 710 unsigned long sz)
827{ 711{
@@ -1038,3 +922,168 @@ void flush_dcache_icache_hugepage(struct page *page)
1038 } 922 }
1039 } 923 }
1040} 924}
925
926#endif /* CONFIG_HUGETLB_PAGE */
927
928/*
929 * We have 4 cases for pgds and pmds:
930 * (1) invalid (all zeroes)
931 * (2) pointer to next table, as normal; bottom 6 bits == 0
932 * (3) leaf pte for huge page, bottom two bits != 00
933 * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
934 *
935 * So long as we atomically load page table pointers we are safe against teardown,
936 * we can follow the address down to the the page and take a ref on it.
937 */
938
939pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
940{
941 pgd_t pgd, *pgdp;
942 pud_t pud, *pudp;
943 pmd_t pmd, *pmdp;
944 pte_t *ret_pte;
945 hugepd_t *hpdp = NULL;
946 unsigned pdshift = PGDIR_SHIFT;
947
948 if (shift)
949 *shift = 0;
950
951 pgdp = pgdir + pgd_index(ea);
952 pgd = ACCESS_ONCE(*pgdp);
953 /*
 954 * Always operate on the local stack value. This makes sure the
 955 * value doesn't get updated by a parallel THP split/collapse,
 956 * page fault or page unmap. The returned pte_t * is still not
 957 * stable, so callers must re-check it for the above conditions.
958 */
959 if (pgd_none(pgd))
960 return NULL;
961 else if (pgd_huge(pgd)) {
962 ret_pte = (pte_t *) pgdp;
963 goto out;
964 } else if (is_hugepd(&pgd))
965 hpdp = (hugepd_t *)&pgd;
966 else {
967 /*
968 * Even if we end up with an unmap, the pgtable will not
 969 * be freed, because page tables are freed via RCU and we
 970 * are running here with irqs disabled.
971 */
972 pdshift = PUD_SHIFT;
973 pudp = pud_offset(&pgd, ea);
974 pud = ACCESS_ONCE(*pudp);
975
976 if (pud_none(pud))
977 return NULL;
978 else if (pud_huge(pud)) {
979 ret_pte = (pte_t *) pudp;
980 goto out;
981 } else if (is_hugepd(&pud))
982 hpdp = (hugepd_t *)&pud;
983 else {
984 pdshift = PMD_SHIFT;
985 pmdp = pmd_offset(&pud, ea);
986 pmd = ACCESS_ONCE(*pmdp);
987 /*
988 * A hugepage collapse is captured by pmd_none, because
 989 * collapse marks the pmd none and does an hpte invalidate.
 990 *
 991 * A hugepage split is captured by pmd_trans_splitting,
 992 * because we mark the pmd trans splitting and do an
 993 * hpte invalidate.
994 *
995 */
996 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
997 return NULL;
998
999 if (pmd_huge(pmd) || pmd_large(pmd)) {
1000 ret_pte = (pte_t *) pmdp;
1001 goto out;
1002 } else if (is_hugepd(&pmd))
1003 hpdp = (hugepd_t *)&pmd;
1004 else
1005 return pte_offset_kernel(&pmd, ea);
1006 }
1007 }
1008 if (!hpdp)
1009 return NULL;
1010
1011 ret_pte = hugepte_offset(hpdp, ea, pdshift);
1012 pdshift = hugepd_shift(*hpdp);
1013out:
1014 if (shift)
1015 *shift = pdshift;
1016 return ret_pte;
1017}
1018EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
1019
1020int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
1021 unsigned long end, int write, struct page **pages, int *nr)
1022{
1023 unsigned long mask;
1024 unsigned long pte_end;
1025 struct page *head, *page, *tail;
1026 pte_t pte;
1027 int refs;
1028
1029 pte_end = (addr + sz) & ~(sz-1);
1030 if (pte_end < end)
1031 end = pte_end;
1032
1033 pte = ACCESS_ONCE(*ptep);
1034 mask = _PAGE_PRESENT | _PAGE_USER;
1035 if (write)
1036 mask |= _PAGE_RW;
1037
1038 if ((pte_val(pte) & mask) != mask)
1039 return 0;
1040
1041#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1042 /*
1043 * check for splitting here
1044 */
1045 if (pmd_trans_splitting(pte_pmd(pte)))
1046 return 0;
1047#endif
1048
1049 /* hugepages are never "special" */
1050 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1051
1052 refs = 0;
1053 head = pte_page(pte);
1054
1055 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
1056 tail = page;
1057 do {
1058 VM_BUG_ON(compound_head(page) != head);
1059 pages[*nr] = page;
1060 (*nr)++;
1061 page++;
1062 refs++;
1063 } while (addr += PAGE_SIZE, addr != end);
1064
1065 if (!page_cache_add_speculative(head, refs)) {
1066 *nr -= refs;
1067 return 0;
1068 }
1069
1070 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1071 /* Could be optimized better */
1072 *nr -= refs;
1073 while (refs--)
1074 put_page(head);
1075 return 0;
1076 }
1077
1078 /*
1079 * Any tail page need their mapcount reference taken before we
1080 * return.
1081 */
1082 while (refs--) {
1083 if (PageTail(tail))
1084 get_huge_page_tail(tail);
1085 tail++;
1086 }
1087
1088 return 1;
1089}
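The rewritten walker is only safe while the page tables cannot be freed underneath it; in this series that means calling it with interrupts disabled, since page-table pages are freed via RCU and the irq-off region serves as the read-side section (compare the hash_preload() change earlier). The updated callers share a common shape, sketched here as a kernel-style fragment:

    unsigned int shift;
    unsigned long flags;
    pte_t *ptep;

    local_irq_save(flags);
    /* a non-zero shift means a huge mapping of 1 << shift bytes */
    ptep = find_linux_pte_or_hugepte(mm->pgd, addr, &shift);
    if (ptep) {
        /* work on a snapshot of *ptep, re-checking it for the
         * busy/splitting conditions noted in the walker's comment */
    }
    local_irq_restore(flags);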
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a90b9c458990..d0cd9e4c6837 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -88,7 +88,11 @@ static void pgd_ctor(void *addr)
88 88
89static void pmd_ctor(void *addr) 89static void pmd_ctor(void *addr)
90{ 90{
91#ifdef CONFIG_TRANSPARENT_HUGEPAGE
92 memset(addr, 0, PMD_TABLE_SIZE * 2);
93#else
91 memset(addr, 0, PMD_TABLE_SIZE); 94 memset(addr, 0, PMD_TABLE_SIZE);
95#endif
92} 96}
93 97
94struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; 98struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
@@ -137,10 +141,9 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
137void pgtable_cache_init(void) 141void pgtable_cache_init(void)
138{ 142{
139 pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); 143 pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
140 pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor); 144 pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
141 if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE)) 145 if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
142 panic("Couldn't allocate pgtable caches"); 146 panic("Couldn't allocate pgtable caches");
143
144 /* In all current configs, when the PUD index exists it's the 147 /* In all current configs, when the PUD index exists it's the
145 * same size as either the pgd or pmd index. Verify that the 148 * same size as either the pgd or pmd index. Verify that the
146 * initialization above has also created a PUD cache. This 149 * initialization above has also created a PUD cache. This
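The doubled memset in pmd_ctor() implies that a THP-enabled pmd page is allocated with a second half for per-pmd bookkeeping, which is how paths above such as get_hpte_slot_array() find the hash-slot tracking for a hugepage. The layout sketch below is an assumption drawn from that constructor change, not lifted from the headers:

    #include <string.h>

    #define PMD_TABLE_SIZE 4096UL /* placeholder, not the kernel value */

    /* Assumed layout of a THP-enabled pmd page:
     *   [0, PMD_TABLE_SIZE)       pmd entries, as before
     *   [PMD_TABLE_SIZE, 2 * ..)  per-pmd tracking, e.g. what
     *                             get_hpte_slot_array() reaches
     * hence the constructor zeroes both halves. */
    static void pmd_ctor_thp_sketch(void *addr)
    {
        memset(addr, 0, PMD_TABLE_SIZE * 2);
    }

    int main(void)
    {
        char page[2 * PMD_TABLE_SIZE];

        pmd_ctor_thp_sketch(page);
        return page[PMD_TABLE_SIZE]; /* 0: tracking half cleared too */
    }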
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 1cb1ea133a2c..7f4bea162026 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -461,6 +461,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
461 pte_t *ptep) 461 pte_t *ptep)
462{ 462{
463#ifdef CONFIG_PPC_STD_MMU 463#ifdef CONFIG_PPC_STD_MMU
464 /*
465 * We don't need to worry about _PAGE_PRESENT here because we are
 466 * called with either mm->page_table_lock or the pte lock held.
467 */
464 unsigned long access = 0, trap; 468 unsigned long access = 0, trap;
465 469
466 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ 470 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap.c
index 67a42ed0d2fc..67a42ed0d2fc 100644
--- a/arch/powerpc/mm/mmap_64.c
+++ b/arch/powerpc/mm/mmap.c
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index e779642c25e5..af3d78e19302 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -112,8 +112,10 @@ static unsigned int steal_context_smp(unsigned int id)
112 */ 112 */
113 for_each_cpu(cpu, mm_cpumask(mm)) { 113 for_each_cpu(cpu, mm_cpumask(mm)) {
114 for (i = cpu_first_thread_sibling(cpu); 114 for (i = cpu_first_thread_sibling(cpu);
115 i <= cpu_last_thread_sibling(cpu); i++) 115 i <= cpu_last_thread_sibling(cpu); i++) {
116 __set_bit(id, stale_map[i]); 116 if (stale_map[i])
117 __set_bit(id, stale_map[i]);
118 }
117 cpu = i - 1; 119 cpu = i - 1;
118 } 120 }
119 return id; 121 return id;
@@ -272,7 +274,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
272 /* XXX This clear should ultimately be part of local_flush_tlb_mm */ 274 /* XXX This clear should ultimately be part of local_flush_tlb_mm */
273 for (i = cpu_first_thread_sibling(cpu); 275 for (i = cpu_first_thread_sibling(cpu);
274 i <= cpu_last_thread_sibling(cpu); i++) { 276 i <= cpu_last_thread_sibling(cpu); i++) {
275 __clear_bit(id, stale_map[i]); 277 if (stale_map[i])
278 __clear_bit(id, stale_map[i]);
276 } 279 }
277 } 280 }
278 281
@@ -329,8 +332,8 @@ void destroy_context(struct mm_struct *mm)
329 332
330#ifdef CONFIG_SMP 333#ifdef CONFIG_SMP
331 334
332static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self, 335static int mmu_context_cpu_notify(struct notifier_block *self,
333 unsigned long action, void *hcpu) 336 unsigned long action, void *hcpu)
334{ 337{
335 unsigned int cpu = (unsigned int)(long)hcpu; 338 unsigned int cpu = (unsigned int)(long)hcpu;
336 339
@@ -363,7 +366,7 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
363 return NOTIFY_OK; 366 return NOTIFY_OK;
364} 367}
365 368
366static struct notifier_block __cpuinitdata mmu_context_cpu_nb = { 369static struct notifier_block mmu_context_cpu_nb = {
367 .notifier_call = mmu_context_cpu_notify, 370 .notifier_call = mmu_context_cpu_notify,
368}; 371};
369 372
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 88c0425dc0a8..08397217e8ac 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -516,7 +516,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
516 * Figure out to which domain a cpu belongs and stick it there. 516 * Figure out to which domain a cpu belongs and stick it there.
517 * Return the id of the domain used. 517 * Return the id of the domain used.
518 */ 518 */
519static int __cpuinit numa_setup_cpu(unsigned long lcpu) 519static int numa_setup_cpu(unsigned long lcpu)
520{ 520{
521 int nid = 0; 521 int nid = 0;
522 struct device_node *cpu = of_get_cpu_node(lcpu, NULL); 522 struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
@@ -538,8 +538,7 @@ out:
538 return nid; 538 return nid;
539} 539}
540 540
541static int __cpuinit cpu_numa_callback(struct notifier_block *nfb, 541static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
542 unsigned long action,
543 void *hcpu) 542 void *hcpu)
544{ 543{
545 unsigned long lcpu = (unsigned long)hcpu; 544 unsigned long lcpu = (unsigned long)hcpu;
@@ -919,7 +918,7 @@ static void __init *careful_zallocation(int nid, unsigned long size,
919 return ret; 918 return ret;
920} 919}
921 920
922static struct notifier_block __cpuinitdata ppc64_numa_nb = { 921static struct notifier_block ppc64_numa_nb = {
923 .notifier_call = cpu_numa_callback, 922 .notifier_call = cpu_numa_callback,
924 .priority = 1 /* Must run before sched domains notifier. */ 923 .priority = 1 /* Must run before sched domains notifier. */
925}; 924};
@@ -1433,11 +1432,9 @@ static int update_cpu_topology(void *data)
1433 if (cpu != update->cpu) 1432 if (cpu != update->cpu)
1434 continue; 1433 continue;
1435 1434
1436 unregister_cpu_under_node(update->cpu, update->old_nid);
1437 unmap_cpu_from_node(update->cpu); 1435 unmap_cpu_from_node(update->cpu);
1438 map_cpu_to_node(update->cpu, update->new_nid); 1436 map_cpu_to_node(update->cpu, update->new_nid);
1439 vdso_getcpu_init(); 1437 vdso_getcpu_init();
1440 register_cpu_under_node(update->cpu, update->new_nid);
1441 } 1438 }
1442 1439
1443 return 0; 1440 return 0;
@@ -1485,6 +1482,9 @@ int arch_update_cpu_topology(void)
1485 stop_machine(update_cpu_topology, &updates[0], &updated_cpus); 1482 stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
1486 1483
1487 for (ud = &updates[0]; ud; ud = ud->next) { 1484 for (ud = &updates[0]; ud; ud = ud->next) {
1485 unregister_cpu_under_node(ud->cpu, ud->old_nid);
1486 register_cpu_under_node(ud->cpu, ud->new_nid);
1487
1488 dev = get_cpu_device(ud->cpu); 1488 dev = get_cpu_device(ud->cpu);
1489 if (dev) 1489 if (dev)
1490 kobject_uevent(&dev->kobj, KOBJ_CHANGE); 1490 kobject_uevent(&dev->kobj, KOBJ_CHANGE);
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 214130a4edc6..edda589795c3 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -235,6 +235,14 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
235 pud = pud_offset(pgd, addr); 235 pud = pud_offset(pgd, addr);
236 BUG_ON(pud_none(*pud)); 236 BUG_ON(pud_none(*pud));
237 pmd = pmd_offset(pud, addr); 237 pmd = pmd_offset(pud, addr);
238 /*
 239	 * To collapse normal pages to a hugepage, khugepaged first sets the
 240	 * pmd to none to force page fault/gup to take mmap_sem. After the
 241	 * pmd is set to none, it does a pte_clear, which ends up in this
 242	 * assertion; so if we find the pmd none, return.
243 */
244 if (pmd_none(*pmd))
245 return;
238 BUG_ON(!pmd_present(*pmd)); 246 BUG_ON(!pmd_present(*pmd));
239 assert_spin_locked(pte_lockptr(mm, pmd)); 247 assert_spin_locked(pte_lockptr(mm, pmd));
240} 248}
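The early pmd_none() return above exists purely for the khugepaged collapse ordering described in the new comment. A minimal standalone sketch of the intended check order (a simplified model with an assumed bit layout, not kernel code):

#include <assert.h>
#include <stdio.h>

typedef unsigned long pmd_t;

/* Bit 0 stands in for _PAGE_PRESENT in this toy model. */
static int pmd_none(pmd_t p)    { return p == 0; }
static int pmd_present(pmd_t p) { return p & 1; }

static void assert_pte_locked_model(pmd_t pmd)
{
        if (pmd_none(pmd))
                return;                 /* khugepaged cleared it: bail out */
        assert(pmd_present(pmd));       /* otherwise the old BUG_ON still applies */
        printf("pmd %#lx: lock assertion would proceed\n", pmd);
}

int main(void)
{
        assert_pte_locked_model(0);     /* cleared by khugepaged: early return */
        assert_pte_locked_model(1);     /* normal present pmd */
        return 0;
}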
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index a854096e1023..536eec72c0f7 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -338,6 +338,19 @@ EXPORT_SYMBOL(iounmap);
338EXPORT_SYMBOL(__iounmap); 338EXPORT_SYMBOL(__iounmap);
339EXPORT_SYMBOL(__iounmap_at); 339EXPORT_SYMBOL(__iounmap_at);
340 340
341/*
 342	 * For a hugepage we have the pfn in the pmd; we use PTE_RPN_SHIFT bits for flags.
343 * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
344 */
345struct page *pmd_page(pmd_t pmd)
346{
347#ifdef CONFIG_TRANSPARENT_HUGEPAGE
348 if (pmd_trans_huge(pmd))
349 return pfn_to_page(pmd_pfn(pmd));
350#endif
351 return virt_to_page(pmd_page_vaddr(pmd));
352}
353
341#ifdef CONFIG_PPC_64K_PAGES 354#ifdef CONFIG_PPC_64K_PAGES
342static pte_t *get_from_cache(struct mm_struct *mm) 355static pte_t *get_from_cache(struct mm_struct *mm)
343{ 356{
@@ -455,3 +468,404 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
455} 468}
456#endif 469#endif
457#endif /* CONFIG_PPC_64K_PAGES */ 470#endif /* CONFIG_PPC_64K_PAGES */
471
472#ifdef CONFIG_TRANSPARENT_HUGEPAGE
473
474/*
475 * This is called when relaxing access to a hugepage. It's also called in the page
 476	 * fault path when we don't hit any of the major fault cases, i.e., a minor
 477	 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will have
 478	 * handled those two for us; we additionally deal with missing execute
 479	 * permission here on some processors.
480 */
481int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
482 pmd_t *pmdp, pmd_t entry, int dirty)
483{
484 int changed;
485#ifdef CONFIG_DEBUG_VM
486 WARN_ON(!pmd_trans_huge(*pmdp));
487 assert_spin_locked(&vma->vm_mm->page_table_lock);
488#endif
489 changed = !pmd_same(*(pmdp), entry);
490 if (changed) {
491 __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
492 /*
493 * Since we are not supporting SW TLB systems, we don't
 494	 * have anything similar to flush_tlb_page_nohash().
495 */
496 }
497 return changed;
498}
499
500unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
501 pmd_t *pmdp, unsigned long clr)
502{
503
504 unsigned long old, tmp;
505
506#ifdef CONFIG_DEBUG_VM
507 WARN_ON(!pmd_trans_huge(*pmdp));
508 assert_spin_locked(&mm->page_table_lock);
509#endif
510
511#ifdef PTE_ATOMIC_UPDATES
512 __asm__ __volatile__(
513 "1: ldarx %0,0,%3\n\
514 andi. %1,%0,%6\n\
515 bne- 1b \n\
516 andc %1,%0,%4 \n\
517 stdcx. %1,0,%3 \n\
518 bne- 1b"
519 : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
520 : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
521 : "cc" );
522#else
523 old = pmd_val(*pmdp);
524 *pmdp = __pmd(old & ~clr);
525#endif
526 if (old & _PAGE_HASHPTE)
527 hpte_do_hugepage_flush(mm, addr, pmdp);
528 return old;
529}
530
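The PTE_ATOMIC_UPDATES path above is a ldarx/stdcx. loop that atomically clears bits while spinning on _PAGE_BUSY. A userspace sketch of the same semantics using C11 atomics (the bit value is a placeholder, not the real _PAGE_BUSY):

#include <stdatomic.h>
#include <stdio.h>

#define PAGE_BUSY 0x8UL         /* placeholder, not the real _PAGE_BUSY */

static unsigned long hugepage_update_model(_Atomic unsigned long *pmdp,
                                           unsigned long clr)
{
        unsigned long old;

        for (;;) {
                old = atomic_load(pmdp);
                if (old & PAGE_BUSY)
                        continue;       /* like "andi. ... bne- 1b": spin while busy */
                /* like "andc ... stdcx. ... bne- 1b": retry on a lost reservation */
                if (atomic_compare_exchange_weak(pmdp, &old, old & ~clr))
                        return old;     /* caller inspects old for _PAGE_HASHPTE */
        }
}

int main(void)
{
        _Atomic unsigned long pmd = 0xf0f0UL;
        unsigned long old = hugepage_update_model(&pmd, 0x00f0UL);

        printf("old=%#lx new=%#lx\n", old, atomic_load(&pmd));
        return 0;
}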
531pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
532 pmd_t *pmdp)
533{
534 pmd_t pmd;
535
536 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
537 if (pmd_trans_huge(*pmdp)) {
538 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
539 } else {
540 /*
 541	 * khugepaged calls this for a normal pmd.
542 */
543 pmd = *pmdp;
544 pmd_clear(pmdp);
545 /*
546 * Wait for all pending hash_page to finish. This is needed
547 * in case of subpage collapse. When we collapse normal pages
548 * to hugepage, we first clear the pmd, then invalidate all
549 * the PTE entries. The assumption here is that any low level
550 * page fault will see a none pmd and take the slow path that
 551	 * will wait on mmap_sem. But we could very well be in a
 552	 * hash_page with a local ptep pointer value. Such a hash_page
 553	 * can result in adding new HPTE entries for normal subpages.
554 * That means we could be modifying the page content as we
555 * copy them to a huge page. So wait for parallel hash_page
556 * to finish before invalidating HPTE entries. We can do this
557 * by sending an IPI to all the cpus and executing a dummy
558 * function there.
559 */
560 kick_all_cpus_sync();
561 /*
562 * Now invalidate the hpte entries in the range
 563	 * covered by the pmd. This makes sure we take a
 564	 * fault and find the pmd as none, which results
 565	 * in a major fault that takes mmap_sem and hence
 566	 * waits for collapse to complete. Without this,
567 * the __collapse_huge_page_copy can result in copying
568 * the old content.
569 */
570 flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
571 }
572 return pmd;
573}
574
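kick_all_cpus_sync() drains in-flight hash_page walkers by running a dummy function on every cpu. A rough userspace analogy (an illustration only, not the kernel mechanism): taking and dropping each worker's lock cannot complete until that worker's current critical section has exited, so all pre-kick sections have drained by the time the loop returns:

#include <pthread.h>
#include <stdio.h>

#define NWORKERS 4

static pthread_mutex_t locks[NWORKERS];

static void *worker(void *arg)
{
        long i = (long)arg;

        pthread_mutex_lock(&locks[i]);
        /* ... a hash_page-like section using a stale local pointer ... */
        pthread_mutex_unlock(&locks[i]);
        return NULL;
}

static void kick_all_workers_sync(void)
{
        for (long i = 0; i < NWORKERS; i++) {
                pthread_mutex_lock(&locks[i]);  /* waits out an in-flight section */
                pthread_mutex_unlock(&locks[i]);
        }
}

int main(void)
{
        pthread_t t[NWORKERS];

        for (long i = 0; i < NWORKERS; i++) {
                pthread_mutex_init(&locks[i], NULL);
                pthread_create(&t[i], NULL, worker, (void *)i);
        }
        kick_all_workers_sync();
        puts("all pre-kick critical sections have drained");
        for (long i = 0; i < NWORKERS; i++)
                pthread_join(t[i], NULL);
        return 0;
}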
575int pmdp_test_and_clear_young(struct vm_area_struct *vma,
576 unsigned long address, pmd_t *pmdp)
577{
578 return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
579}
580
581/*
582 * We currently remove entries from the hashtable regardless of whether
583 * the entry was young or dirty. The generic routines only flush if the
584 * entry was young or dirty which is not good enough.
585 *
586 * We should be more intelligent about this but for the moment we override
587 * these functions and force a tlb flush unconditionally
588 */
589int pmdp_clear_flush_young(struct vm_area_struct *vma,
590 unsigned long address, pmd_t *pmdp)
591{
592 return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
593}
594
595/*
596 * We mark the pmd splitting and invalidate all the hpte
597 * entries for this hugepage.
598 */
599void pmdp_splitting_flush(struct vm_area_struct *vma,
600 unsigned long address, pmd_t *pmdp)
601{
602 unsigned long old, tmp;
603
604 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
605
606#ifdef CONFIG_DEBUG_VM
607 WARN_ON(!pmd_trans_huge(*pmdp));
608 assert_spin_locked(&vma->vm_mm->page_table_lock);
609#endif
610
611#ifdef PTE_ATOMIC_UPDATES
612
613 __asm__ __volatile__(
614 "1: ldarx %0,0,%3\n\
615 andi. %1,%0,%6\n\
616 bne- 1b \n\
617 ori %1,%0,%4 \n\
618 stdcx. %1,0,%3 \n\
619 bne- 1b"
620 : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
621 : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
622 : "cc" );
623#else
624 old = pmd_val(*pmdp);
625 *pmdp = __pmd(old | _PAGE_SPLITTING);
626#endif
627 /*
 628	 * If we didn't have the splitting flag set, go and flush the
629 * HPTE entries.
630 */
631 if (!(old & _PAGE_SPLITTING)) {
632 /* We need to flush the hpte */
633 if (old & _PAGE_HASHPTE)
634 hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
635 }
636}
637
638/*
 639	 * We want to put the pgtable in the pmd and use the pgtable for tracking
 640	 * the base page size hptes.
641 */
642void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
643 pgtable_t pgtable)
644{
645 pgtable_t *pgtable_slot;
646 assert_spin_locked(&mm->page_table_lock);
647 /*
 648	 * We store the pgtable in the second half of the PMD.
649 */
650 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
651 *pgtable_slot = pgtable;
652 /*
 653	 * Expose the deposited pgtable to other cpus
 654	 * before we set the hugepage PTE at the pmd level;
 655	 * the hash fault code looks at the deposited pgtable
 656	 * to store hash index values.
657 */
658 smp_wmb();
659}
660
661pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
662{
663 pgtable_t pgtable;
664 pgtable_t *pgtable_slot;
665
666 assert_spin_locked(&mm->page_table_lock);
667 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
668 pgtable = *pgtable_slot;
669 /*
670 * Once we withdraw, mark the entry NULL.
671 */
672 *pgtable_slot = NULL;
673 /*
 674	 * We store HPTE information in the deposited PTE fragment;
 675	 * zero out the content on withdraw.
676 */
677 memset(pgtable, 0, PTE_FRAG_SIZE);
678 return pgtable;
679}
680
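The deposit/withdraw pair stores the pgtable pointer in the second half of the PMD page, at the slot mirroring the pmd entry's own index. A quick pointer-arithmetic sketch (PTRS_PER_PMD here is a placeholder value, not the kernel's):

#include <stdio.h>

#define PTRS_PER_PMD 1024       /* placeholder, not the kernel's value */

int main(void)
{
        /* First half of the page holds pmds, second half the deposit slots. */
        unsigned long page[2 * PTRS_PER_PMD];
        unsigned long *pmdp = &page[7];
        unsigned long **slot = (unsigned long **)pmdp + PTRS_PER_PMD;

        printf("pmd index %td maps to slot index %td\n",
               pmdp - page, (unsigned long *)slot - page);
        return 0;
}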
681/*
 682	 * Set a new huge pmd. We should not be called for updating
683 * an existing pmd entry. That should go via pmd_hugepage_update.
684 */
685void set_pmd_at(struct mm_struct *mm, unsigned long addr,
686 pmd_t *pmdp, pmd_t pmd)
687{
688#ifdef CONFIG_DEBUG_VM
689 WARN_ON(!pmd_none(*pmdp));
690 assert_spin_locked(&mm->page_table_lock);
691 WARN_ON(!pmd_trans_huge(pmd));
692#endif
693 return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
694}
695
696void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
697 pmd_t *pmdp)
698{
699 pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
700}
701
702/*
703 * A linux hugepage PMD was changed and the corresponding hash table entries
 704	 * need to be flushed.
705 */
706void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
707 pmd_t *pmdp)
708{
709 int ssize, i;
710 unsigned long s_addr;
711 int max_hpte_count;
712 unsigned int psize, valid;
713 unsigned char *hpte_slot_array;
714 unsigned long hidx, vpn, vsid, hash, shift, slot;
715
716 /*
717 * Flush all the hptes mapping this hugepage
718 */
719 s_addr = addr & HPAGE_PMD_MASK;
720 hpte_slot_array = get_hpte_slot_array(pmdp);
721 /*
 722	 * If we try to do a huge PTE update after a withdraw is done,
 723	 * we will find the below NULL. This happens when we do
 724	 * split_huge_page_pmd.
725 */
726 if (!hpte_slot_array)
727 return;
728
729 /* get the base page size */
730 psize = get_slice_psize(mm, s_addr);
731
732 if (ppc_md.hugepage_invalidate)
733 return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
734 s_addr, psize);
735 /*
 736	 * No bulk hpte removal support; invalidate each entry.
737 */
738 shift = mmu_psize_defs[psize].shift;
739 max_hpte_count = HPAGE_PMD_SIZE >> shift;
740 for (i = 0; i < max_hpte_count; i++) {
741 /*
 742	 * 8 bits per hpte entry:
743 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
744 */
745 valid = hpte_valid(hpte_slot_array, i);
746 if (!valid)
747 continue;
748 hidx = hpte_hash_index(hpte_slot_array, i);
749
750 /* get the vpn */
751 addr = s_addr + (i * (1ul << shift));
752 if (!is_kernel_addr(addr)) {
753 ssize = user_segment_size(addr);
754 vsid = get_vsid(mm->context.id, addr, ssize);
755 WARN_ON(vsid == 0);
756 } else {
757 vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
758 ssize = mmu_kernel_ssize;
759 }
760
761 vpn = hpt_vpn(addr, vsid, ssize);
762 hash = hpt_hash(vpn, shift, ssize);
763 if (hidx & _PTEIDX_SECONDARY)
764 hash = ~hash;
765
766 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
767 slot += hidx & _PTEIDX_GROUP_IX;
768 ppc_md.hpte_invalidate(slot, vpn, psize,
769 MMU_PAGE_16M, ssize, 0);
770 }
771}
772
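To make the 8-bit encoding above concrete, here is a standalone decode of one hpte_slot_array entry and the resulting hash-table slot. The layout is an assumption read off the comment and the _PTEIDX_SECONDARY/_PTEIDX_GROUP_IX masks: bit 0 is the valid bit and the next four bits hold hidx, whose top bit selects the secondary group:

#include <stdio.h>

#define PTEIDX_SECONDARY 0x8UL
#define PTEIDX_GROUP_IX  0x7UL
#define HPTES_PER_GROUP  8

int main(void)
{
        unsigned char entry = 0x1b;     /* valid=1, hidx=0b1101: secondary, ix=5 */
        unsigned long htab_hash_mask = 0xffff, hash = 0x1234, hidx, slot;

        if (!(entry & 1))               /* models hpte_valid() */
                return 0;
        hidx = (entry >> 1) & 0xf;      /* models hpte_hash_index() */
        if (hidx & PTEIDX_SECONDARY)
                hash = ~hash;
        slot  = (hash & htab_hash_mask) * HPTES_PER_GROUP;
        slot += hidx & PTEIDX_GROUP_IX;
        printf("hidx=%#lx -> slot=%#lx\n", hidx, slot);
        return 0;
}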
773static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
774{
775 pmd_val(pmd) |= pgprot_val(pgprot);
776 return pmd;
777}
778
779pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
780{
781 pmd_t pmd;
782 /*
783 * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
 784	 * set. We use this to check for a THP page at the pmd level.
 785	 * For the leaf pte of a huge page, the bottom two bits != 00.
786 */
787 pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
788 pmd_val(pmd) |= _PAGE_THP_HUGE;
789 pmd = pmd_set_protbits(pmd, pgprot);
790 return pmd;
791}
792
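pfn_pmd() is just bit packing: the pfn sits above PTE_RPN_SHIFT, with the THP flag and protection bits below it. A minimal model with made-up constants (the real shift and flag values differ):

#include <stdio.h>

#define PTE_RPN_SHIFT 12        /* placeholder shift */
#define PAGE_THP_HUGE 0x2UL     /* placeholder for _PAGE_THP_HUGE */
#define PAGE_PRESENT  0x1UL     /* placeholder prot bit */

int main(void)
{
        unsigned long pfn = 0xabcd;
        unsigned long pmd = (pfn << PTE_RPN_SHIFT) | PAGE_THP_HUGE | PAGE_PRESENT;

        printf("pmd=%#lx pfn=%#lx thp=%d\n",
               pmd, pmd >> PTE_RPN_SHIFT, !!(pmd & PAGE_THP_HUGE));
        return 0;
}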
793pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
794{
795 return pfn_pmd(page_to_pfn(page), pgprot);
796}
797
798pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
799{
800
801 pmd_val(pmd) &= _HPAGE_CHG_MASK;
802 pmd = pmd_set_protbits(pmd, newprot);
803 return pmd;
804}
805
806/*
807 * This is called at the end of handling a user page fault, when the
808 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
809 * We use it to preload an HPTE into the hash table corresponding to
810 * the updated linux HUGE PMD entry.
811 */
812void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
813 pmd_t *pmd)
814{
815 return;
816}
817
818pmd_t pmdp_get_and_clear(struct mm_struct *mm,
819 unsigned long addr, pmd_t *pmdp)
820{
821 pmd_t old_pmd;
822 pgtable_t pgtable;
823 unsigned long old;
824 pgtable_t *pgtable_slot;
825
826 old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
827 old_pmd = __pmd(old);
828 /*
829 * We have pmd == none and we are holding page_table_lock.
830 * So we can safely go and clear the pgtable hash
831 * index info.
832 */
833 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
834 pgtable = *pgtable_slot;
835 /*
 836	 * Let's zero out the old valid and hash index details;
 837	 * the hash fault code looks at them.
838 */
839 memset(pgtable, 0, PTE_FRAG_SIZE);
840 return old_pmd;
841}
842
843int has_transparent_hugepage(void)
844{
845 if (!mmu_has_feature(MMU_FTR_16M_PAGE))
846 return 0;
847 /*
848 * We support THP only if PMD_SIZE is 16MB.
849 */
850 if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
851 return 0;
852 /*
 853	 * We need to make sure that we support a 16MB hugepage in a segment
854 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
855 * of 64K.
856 */
857 /*
 858	 * If we have 64K HPTEs, we will be using those by default.
859 */
860 if (mmu_psize_defs[MMU_PAGE_64K].shift &&
861 (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
862 return 0;
863 /*
 864	 * OK, we only have 4K HPTEs.
865 */
866 if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
867 return 0;
868
869 return 1;
870}
871#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
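The checks in has_transparent_hugepage() reduce to three gates: 16M pages must exist, PMD_SIZE must be 16MB, and the base page size in use must have a 16M penc (page-size encoding). A compact standalone model (field names and values are illustrative assumptions):

#include <stdio.h>

struct psize_def { int shift; int penc_16m; };

/* Index 0: 4K, 1: 64K, 2: 16M -- illustrative stand-ins. */
static struct psize_def defs[] = {
        { .shift = 12, .penc_16m = 0 },         /* 4K: has a 16M penc */
        { .shift = 16, .penc_16m = 0 },         /* 64K: has a 16M penc */
        { .shift = 24, .penc_16m = 0 },         /* 16M pages exist */
};

#define PMD_SHIFT 24

static int has_thp_model(void)
{
        if (!defs[2].shift)                     /* need 16M pages at all */
                return 0;
        if (defs[2].shift != PMD_SHIFT)         /* THP only if PMD_SIZE is 16MB */
                return 0;
        if (defs[1].shift && defs[1].penc_16m == -1)    /* 64K base, no 16M penc */
                return 0;
        if (defs[0].penc_16m == -1)             /* 4K base, no 16M penc */
                return 0;
        return 1;
}

int main(void)
{
        printf("THP supported: %d\n", has_thp_model());
        return 0;
}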
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 7c415ddde948..aa74acb0fdfc 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -130,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
130 up_write(&mm->mmap_sem); 130 up_write(&mm->mmap_sem);
131} 131}
132 132
133#ifdef CONFIG_TRANSPARENT_HUGEPAGE
134static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
135 unsigned long end, struct mm_walk *walk)
136{
137 struct vm_area_struct *vma = walk->private;
138 split_huge_page_pmd(vma, addr, pmd);
139 return 0;
140}
141
142static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
143 unsigned long len)
144{
145 struct vm_area_struct *vma;
146 struct mm_walk subpage_proto_walk = {
147 .mm = mm,
148 .pmd_entry = subpage_walk_pmd_entry,
149 };
150
151 /*
 152	 * We don't try too hard; we just mark all the vmas in that range
153 * VM_NOHUGEPAGE and split them.
154 */
155 vma = find_vma(mm, addr);
156 /*
 157	 * If the range is entirely unmapped, just return.
158 */
159 if (vma && ((addr + len) <= vma->vm_start))
160 return;
161
162 while (vma) {
163 if (vma->vm_start >= (addr + len))
164 break;
165 vma->vm_flags |= VM_NOHUGEPAGE;
166 subpage_proto_walk.private = vma;
167 walk_page_range(vma->vm_start, vma->vm_end,
168 &subpage_proto_walk);
169 vma = vma->vm_next;
170 }
171}
172#else
173static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
174 unsigned long len)
175{
176 return;
177}
178#endif
179
133/* 180/*
134 * Copy in a subpage protection map for an address range. 181 * Copy in a subpage protection map for an address range.
135 * The map has 2 bits per 4k subpage, so 32 bits per 64k page. 182 * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
@@ -168,6 +215,7 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
168 return -EFAULT; 215 return -EFAULT;
169 216
170 down_write(&mm->mmap_sem); 217 down_write(&mm->mmap_sem);
218 subpage_mark_vma_nohuge(mm, addr, len);
171 for (limit = addr + len; addr < limit; addr = next) { 219 for (limit = addr + len; addr < limit; addr = next) {
172 next = pmd_addr_end(addr, limit); 220 next = pmd_addr_end(addr, limit);
173 err = -ENOMEM; 221 err = -ENOMEM;
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 7df1c5edda87..36e44b4260eb 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -189,6 +189,7 @@ void tlb_flush(struct mmu_gather *tlb)
189void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, 189void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
190 unsigned long end) 190 unsigned long end)
191{ 191{
192 int hugepage_shift;
192 unsigned long flags; 193 unsigned long flags;
193 194
194 start = _ALIGN_DOWN(start, PAGE_SIZE); 195 start = _ALIGN_DOWN(start, PAGE_SIZE);
@@ -206,7 +207,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
206 local_irq_save(flags); 207 local_irq_save(flags);
207 arch_enter_lazy_mmu_mode(); 208 arch_enter_lazy_mmu_mode();
208 for (; start < end; start += PAGE_SIZE) { 209 for (; start < end; start += PAGE_SIZE) {
209 pte_t *ptep = find_linux_pte(mm->pgd, start); 210 pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
211 &hugepage_shift);
210 unsigned long pte; 212 unsigned long pte;
211 213
212 if (ptep == NULL) 214 if (ptep == NULL)
@@ -214,7 +216,37 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
214 pte = pte_val(*ptep); 216 pte = pte_val(*ptep);
215 if (!(pte & _PAGE_HASHPTE)) 217 if (!(pte & _PAGE_HASHPTE))
216 continue; 218 continue;
217 hpte_need_flush(mm, start, ptep, pte, 0); 219 if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
220 hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
221 else
222 hpte_need_flush(mm, start, ptep, pte, 0);
223 }
224 arch_leave_lazy_mmu_mode();
225 local_irq_restore(flags);
226}
227
228void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
229{
230 pte_t *pte;
231 pte_t *start_pte;
232 unsigned long flags;
233
234 addr = _ALIGN_DOWN(addr, PMD_SIZE);
235 /* Note: Normally, we should only ever use a batch within a
236 * PTE locked section. This violates the rule, but will work
 237	 * since we don't actually modify the PTEs; we just flush the
 238	 * hash while leaving the PTEs intact (including their reference
 239	 * to being hashed). This is not the most performance-oriented
 240	 * way to do things, but is fine for our needs here.
241 */
242 local_irq_save(flags);
243 arch_enter_lazy_mmu_mode();
244 start_pte = pte_offset_map(pmd, addr);
245 for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
246 unsigned long pteval = pte_val(*pte);
247 if (pteval & _PAGE_HASHPTE)
248 hpte_need_flush(mm, addr, pte, pteval, 0);
249 addr += PAGE_SIZE;
218 } 250 }
219 arch_leave_lazy_mmu_mode(); 251 arch_leave_lazy_mmu_mode();
220 local_irq_restore(flags); 252 local_irq_restore(flags);
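For scale, with a 64K base page size the loop above issues one hpte_need_flush() per base page across the whole PMD-aligned hugepage. A quick arithmetic check (the page-size and PTRS_PER_PTE values are assumptions for a 64K/16MB configuration):

#include <stdio.h>

#define PAGE_SIZE    (64UL * 1024)              /* assumed 64K base pages */
#define PTRS_PER_PTE 256UL                      /* assumed: 16MB / 64K */
#define PMD_SIZE     (PAGE_SIZE * PTRS_PER_PTE) /* 16MB */

int main(void)
{
        unsigned long addr = 0x12345678UL & ~(PMD_SIZE - 1); /* _ALIGN_DOWN */
        unsigned long end  = addr + PTRS_PER_PTE * PAGE_SIZE;

        printf("flush [%#lx, %#lx): %lu pages, %lu MB\n",
               addr, end, (end - addr) / PAGE_SIZE, (end - addr) >> 20);
        return 0;
}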
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 6888cad5103d..41cd68dee681 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -648,7 +648,7 @@ void __init early_init_mmu(void)
648 __early_init_mmu(1); 648 __early_init_mmu(1);
649} 649}
650 650
651void __cpuinit early_init_mmu_secondary(void) 651void early_init_mmu_secondary(void)
652{ 652{
653 __early_init_mmu(0); 653 __early_init_mmu(0);
654} 654}
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 29c6482890c8..a3985aee77fe 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -75,6 +75,11 @@ static unsigned int freeze_events_kernel = MMCR0_FCS;
75 75
76#define MMCR0_FCHV 0 76#define MMCR0_FCHV 0
77#define MMCR0_PMCjCE MMCR0_PMCnCE 77#define MMCR0_PMCjCE MMCR0_PMCnCE
78#define MMCR0_FC56 0
79#define MMCR0_PMAO 0
80#define MMCR0_EBE 0
81#define MMCR0_PMCC 0
82#define MMCR0_PMCC_U6 0
78 83
79#define SPRN_MMCRA SPRN_MMCR2 84#define SPRN_MMCRA SPRN_MMCR2
80#define MMCRA_SAMPLE_ENABLE 0 85#define MMCRA_SAMPLE_ENABLE 0
@@ -102,6 +107,15 @@ static inline int siar_valid(struct pt_regs *regs)
102 return 1; 107 return 1;
103} 108}
104 109
110static bool is_ebb_event(struct perf_event *event) { return false; }
111static int ebb_event_check(struct perf_event *event) { return 0; }
112static void ebb_event_add(struct perf_event *event) { }
113static void ebb_switch_out(unsigned long mmcr0) { }
114static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0)
115{
116 return mmcr0;
117}
118
105static inline void power_pmu_bhrb_enable(struct perf_event *event) {} 119static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
106static inline void power_pmu_bhrb_disable(struct perf_event *event) {} 120static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
107void power_pmu_flush_branch_stack(void) {} 121void power_pmu_flush_branch_stack(void) {}
@@ -462,6 +476,89 @@ void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw)
462 return; 476 return;
463} 477}
464 478
479static bool is_ebb_event(struct perf_event *event)
480{
481 /*
482 * This could be a per-PMU callback, but we'd rather avoid the cost. We
483 * check that the PMU supports EBB, meaning those that don't can still
484 * use bit 63 of the event code for something else if they wish.
485 */
486 return (ppmu->flags & PPMU_EBB) &&
487 ((event->attr.config >> EVENT_CONFIG_EBB_SHIFT) & 1);
488}
489
490static int ebb_event_check(struct perf_event *event)
491{
492 struct perf_event *leader = event->group_leader;
493
494 /* Event and group leader must agree on EBB */
495 if (is_ebb_event(leader) != is_ebb_event(event))
496 return -EINVAL;
497
498 if (is_ebb_event(event)) {
499 if (!(event->attach_state & PERF_ATTACH_TASK))
500 return -EINVAL;
501
502 if (!leader->attr.pinned || !leader->attr.exclusive)
503 return -EINVAL;
504
505 if (event->attr.inherit || event->attr.sample_period ||
506 event->attr.enable_on_exec || event->attr.freq)
507 return -EINVAL;
508 }
509
510 return 0;
511}
512
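For reference, a hedged userspace sketch of an event that would satisfy ebb_event_check(): per-task, a pinned and exclusive leader, none of the sampling fields set, and the EBB bit in the raw config (bit 63, per the is_ebb_event() comment above; the event code itself is a placeholder):

#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size      = sizeof(attr);
        attr.type      = PERF_TYPE_RAW;
        attr.config    = 0x1001e | (1ULL << 63);  /* placeholder event | EBB bit */
        attr.pinned    = 1;             /* leader must be pinned ... */
        attr.exclusive = 1;             /* ... and exclusive */
        /* no inherit, sample_period, enable_on_exec or freq, per the checks */

        fd = syscall(__NR_perf_event_open, &attr, 0 /* this task */,
                     -1 /* any cpu */, -1 /* no group */, 0);
        if (fd < 0)
                perror("perf_event_open");
        else
                puts("EBB event opened");
        return 0;
}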
513static void ebb_event_add(struct perf_event *event)
514{
515 if (!is_ebb_event(event) || current->thread.used_ebb)
516 return;
517
518 /*
519 * IFF this is the first time we've added an EBB event, set
520 * PMXE in the user MMCR0 so we can detect when it's cleared by
521 * userspace. We need this so that we can context switch while
522 * userspace is in the EBB handler (where PMXE is 0).
523 */
524 current->thread.used_ebb = 1;
525 current->thread.mmcr0 |= MMCR0_PMXE;
526}
527
528static void ebb_switch_out(unsigned long mmcr0)
529{
530 if (!(mmcr0 & MMCR0_EBE))
531 return;
532
533 current->thread.siar = mfspr(SPRN_SIAR);
534 current->thread.sier = mfspr(SPRN_SIER);
535 current->thread.sdar = mfspr(SPRN_SDAR);
536 current->thread.mmcr0 = mmcr0 & MMCR0_USER_MASK;
537 current->thread.mmcr2 = mfspr(SPRN_MMCR2) & MMCR2_USER_MASK;
538}
539
540static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0)
541{
542 if (!ebb)
543 goto out;
544
545 /* Enable EBB and read/write to all 6 PMCs for userspace */
546 mmcr0 |= MMCR0_EBE | MMCR0_PMCC_U6;
547
548 /* Add any bits from the user reg, FC or PMAO */
549 mmcr0 |= current->thread.mmcr0;
550
551 /* Be careful not to set PMXE if userspace had it cleared */
552 if (!(current->thread.mmcr0 & MMCR0_PMXE))
553 mmcr0 &= ~MMCR0_PMXE;
554
555 mtspr(SPRN_SIAR, current->thread.siar);
556 mtspr(SPRN_SIER, current->thread.sier);
557 mtspr(SPRN_SDAR, current->thread.sdar);
558 mtspr(SPRN_MMCR2, current->thread.mmcr2);
559out:
560 return mmcr0;
561}
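ebb_switch_in() is a small bit-merge: force on the EBB enable and user PMC access bits, fold in the user's saved MMCR0, but never re-set PMXE if userspace cleared it inside its EBB handler. A toy model (the bit values are placeholders, not the real SPR layout):

#include <stdio.h>

#define MMCR0_EBE     0x100UL   /* placeholder bit values */
#define MMCR0_PMCC_U6 0x080UL
#define MMCR0_PMXE    0x040UL

static unsigned long ebb_switch_in_model(unsigned long mmcr0,
                                         unsigned long user_mmcr0)
{
        mmcr0 |= MMCR0_EBE | MMCR0_PMCC_U6;     /* enable EBB + user PMC access */
        mmcr0 |= user_mmcr0;                    /* add the user's FC/PMAO bits */
        if (!(user_mmcr0 & MMCR0_PMXE))
                mmcr0 &= ~MMCR0_PMXE;           /* user is in the EBB handler */
        return mmcr0;
}

int main(void)
{
        printf("%#lx\n", ebb_switch_in_model(MMCR0_PMXE, 0));   /* PMXE dropped */
        printf("%#lx\n", ebb_switch_in_model(0, MMCR0_PMXE));   /* PMXE kept */
        return 0;
}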
465#endif /* CONFIG_PPC64 */ 562#endif /* CONFIG_PPC64 */
466 563
467static void perf_event_interrupt(struct pt_regs *regs); 564static void perf_event_interrupt(struct pt_regs *regs);
@@ -732,6 +829,13 @@ static void power_pmu_read(struct perf_event *event)
732 829
733 if (!event->hw.idx) 830 if (!event->hw.idx)
734 return; 831 return;
832
833 if (is_ebb_event(event)) {
834 val = read_pmc(event->hw.idx);
835 local64_set(&event->hw.prev_count, val);
836 return;
837 }
838
735 /* 839 /*
736 * Performance monitor interrupts come even when interrupts 840 * Performance monitor interrupts come even when interrupts
737 * are soft-disabled, as long as interrupts are hard-enabled. 841 * are soft-disabled, as long as interrupts are hard-enabled.
@@ -852,7 +956,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
852static void power_pmu_disable(struct pmu *pmu) 956static void power_pmu_disable(struct pmu *pmu)
853{ 957{
854 struct cpu_hw_events *cpuhw; 958 struct cpu_hw_events *cpuhw;
855 unsigned long flags; 959 unsigned long flags, mmcr0, val;
856 960
857 if (!ppmu) 961 if (!ppmu)
858 return; 962 return;
@@ -860,9 +964,6 @@ static void power_pmu_disable(struct pmu *pmu)
860 cpuhw = &__get_cpu_var(cpu_hw_events); 964 cpuhw = &__get_cpu_var(cpu_hw_events);
861 965
862 if (!cpuhw->disabled) { 966 if (!cpuhw->disabled) {
863 cpuhw->disabled = 1;
864 cpuhw->n_added = 0;
865
866 /* 967 /*
867 * Check if we ever enabled the PMU on this cpu. 968 * Check if we ever enabled the PMU on this cpu.
868 */ 969 */
@@ -872,6 +973,21 @@ static void power_pmu_disable(struct pmu *pmu)
872 } 973 }
873 974
874 /* 975 /*
976 * Set the 'freeze counters' bit, clear EBE/PMCC/PMAO/FC56.
977 */
978 val = mmcr0 = mfspr(SPRN_MMCR0);
979 val |= MMCR0_FC;
980 val &= ~(MMCR0_EBE | MMCR0_PMCC | MMCR0_PMAO | MMCR0_FC56);
981
982 /*
983 * The barrier is to make sure the mtspr has been
984 * executed and the PMU has frozen the events etc.
985 * before we return.
986 */
987 write_mmcr0(cpuhw, val);
988 mb();
989
990 /*
875 * Disable instruction sampling if it was enabled 991 * Disable instruction sampling if it was enabled
876 */ 992 */
877 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { 993 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
@@ -880,15 +996,12 @@ static void power_pmu_disable(struct pmu *pmu)
880 mb(); 996 mb();
881 } 997 }
882 998
883 /* 999 cpuhw->disabled = 1;
884 * Set the 'freeze counters' bit. 1000 cpuhw->n_added = 0;
885 * The barrier is to make sure the mtspr has been 1001
886 * executed and the PMU has frozen the events 1002 ebb_switch_out(mmcr0);
887 * before we return.
888 */
889 write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
890 mb();
891 } 1003 }
1004
892 local_irq_restore(flags); 1005 local_irq_restore(flags);
893} 1006}
894 1007
@@ -903,23 +1016,36 @@ static void power_pmu_enable(struct pmu *pmu)
903 struct cpu_hw_events *cpuhw; 1016 struct cpu_hw_events *cpuhw;
904 unsigned long flags; 1017 unsigned long flags;
905 long i; 1018 long i;
906 unsigned long val; 1019 unsigned long val, mmcr0;
907 s64 left; 1020 s64 left;
908 unsigned int hwc_index[MAX_HWEVENTS]; 1021 unsigned int hwc_index[MAX_HWEVENTS];
909 int n_lim; 1022 int n_lim;
910 int idx; 1023 int idx;
1024 bool ebb;
911 1025
912 if (!ppmu) 1026 if (!ppmu)
913 return; 1027 return;
914 local_irq_save(flags); 1028 local_irq_save(flags);
1029
915 cpuhw = &__get_cpu_var(cpu_hw_events); 1030 cpuhw = &__get_cpu_var(cpu_hw_events);
916 if (!cpuhw->disabled) { 1031 if (!cpuhw->disabled)
917 local_irq_restore(flags); 1032 goto out;
918 return; 1033
1034 if (cpuhw->n_events == 0) {
1035 ppc_set_pmu_inuse(0);
1036 goto out;
919 } 1037 }
1038
920 cpuhw->disabled = 0; 1039 cpuhw->disabled = 0;
921 1040
922 /* 1041 /*
1042 * EBB requires an exclusive group and all events must have the EBB
1043 * flag set, or not set, so we can just check a single event. Also we
1044 * know we have at least one event.
1045 */
1046 ebb = is_ebb_event(cpuhw->event[0]);
1047
1048 /*
923 * If we didn't change anything, or only removed events, 1049 * If we didn't change anything, or only removed events,
924 * no need to recalculate MMCR* settings and reset the PMCs. 1050 * no need to recalculate MMCR* settings and reset the PMCs.
925 * Just reenable the PMU with the current MMCR* settings 1051 * Just reenable the PMU with the current MMCR* settings
@@ -928,8 +1054,6 @@ static void power_pmu_enable(struct pmu *pmu)
928 if (!cpuhw->n_added) { 1054 if (!cpuhw->n_added) {
929 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); 1055 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
930 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); 1056 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
931 if (cpuhw->n_events == 0)
932 ppc_set_pmu_inuse(0);
933 goto out_enable; 1057 goto out_enable;
934 } 1058 }
935 1059
@@ -996,25 +1120,34 @@ static void power_pmu_enable(struct pmu *pmu)
996 ++n_lim; 1120 ++n_lim;
997 continue; 1121 continue;
998 } 1122 }
999 val = 0; 1123
1000 if (event->hw.sample_period) { 1124 if (ebb)
1001 left = local64_read(&event->hw.period_left); 1125 val = local64_read(&event->hw.prev_count);
1002 if (left < 0x80000000L) 1126 else {
1003 val = 0x80000000L - left; 1127 val = 0;
1128 if (event->hw.sample_period) {
1129 left = local64_read(&event->hw.period_left);
1130 if (left < 0x80000000L)
1131 val = 0x80000000L - left;
1132 }
1133 local64_set(&event->hw.prev_count, val);
1004 } 1134 }
1005 local64_set(&event->hw.prev_count, val); 1135
1006 event->hw.idx = idx; 1136 event->hw.idx = idx;
1007 if (event->hw.state & PERF_HES_STOPPED) 1137 if (event->hw.state & PERF_HES_STOPPED)
1008 val = 0; 1138 val = 0;
1009 write_pmc(idx, val); 1139 write_pmc(idx, val);
1140
1010 perf_event_update_userpage(event); 1141 perf_event_update_userpage(event);
1011 } 1142 }
1012 cpuhw->n_limited = n_lim; 1143 cpuhw->n_limited = n_lim;
1013 cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; 1144 cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
1014 1145
1015 out_enable: 1146 out_enable:
1147 mmcr0 = ebb_switch_in(ebb, cpuhw->mmcr[0]);
1148
1016 mb(); 1149 mb();
1017 write_mmcr0(cpuhw, cpuhw->mmcr[0]); 1150 write_mmcr0(cpuhw, mmcr0);
1018 1151
1019 /* 1152 /*
1020 * Enable instruction sampling if necessary 1153 * Enable instruction sampling if necessary
@@ -1112,6 +1245,8 @@ static int power_pmu_add(struct perf_event *event, int ef_flags)
1112 event->hw.config = cpuhw->events[n0]; 1245 event->hw.config = cpuhw->events[n0];
1113 1246
1114nocheck: 1247nocheck:
1248 ebb_event_add(event);
1249
1115 ++cpuhw->n_events; 1250 ++cpuhw->n_events;
1116 ++cpuhw->n_added; 1251 ++cpuhw->n_added;
1117 1252
@@ -1472,6 +1607,11 @@ static int power_pmu_event_init(struct perf_event *event)
1472 } 1607 }
1473 } 1608 }
1474 1609
1610 /* Extra checks for EBB */
1611 err = ebb_event_check(event);
1612 if (err)
1613 return err;
1614
1475 /* 1615 /*
1476 * If this is in a group, check if it can go on with all the 1616 * If this is in a group, check if it can go on with all the
1477 * other hardware events in the group. We assume the event 1617 * other hardware events in the group. We assume the event
@@ -1511,6 +1651,13 @@ static int power_pmu_event_init(struct perf_event *event)
1511 local64_set(&event->hw.period_left, event->hw.last_period); 1651 local64_set(&event->hw.period_left, event->hw.last_period);
1512 1652
1513 /* 1653 /*
1654 * For EBB events we just context switch the PMC value, we don't do any
1655 * of the sample_period logic. We use hw.prev_count for this.
1656 */
1657 if (is_ebb_event(event))
1658 local64_set(&event->hw.prev_count, 0);
1659
1660 /*
1514 * See if we need to reserve the PMU. 1661 * See if we need to reserve the PMU.
1515 * If no events are currently in use, then we have to take a 1662 * If no events are currently in use, then we have to take a
1516 * mutex to ensure that we don't race with another task doing 1663 * mutex to ensure that we don't race with another task doing
@@ -1786,7 +1933,7 @@ static void power_pmu_setup(int cpu)
1786 cpuhw->mmcr[0] = MMCR0_FC; 1933 cpuhw->mmcr[0] = MMCR0_FC;
1787} 1934}
1788 1935
1789static int __cpuinit 1936static int
1790power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) 1937power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1791{ 1938{
1792 unsigned int cpu = (long)hcpu; 1939 unsigned int cpu = (long)hcpu;
@@ -1803,7 +1950,7 @@ power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu
1803 return NOTIFY_OK; 1950 return NOTIFY_OK;
1804} 1951}
1805 1952
1806int __cpuinit register_power_pmu(struct power_pmu *pmu) 1953int register_power_pmu(struct power_pmu *pmu)
1807{ 1954{
1808 if (ppmu) 1955 if (ppmu)
1809 return -EBUSY; /* something's already registered */ 1956 return -EBUSY; /* something's already registered */
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index f7d1c4fff303..96a64d6a8bdf 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -31,9 +31,9 @@
31 * 31 *
32 * 60 56 52 48 44 40 36 32 32 * 60 56 52 48 44 40 36 32
33 * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | 33 * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
34 * [ thresh_cmp ] [ thresh_ctl ] 34 * | [ thresh_cmp ] [ thresh_ctl ]
35 * | 35 * | |
36 * thresh start/stop OR FAB match -* 36 * *- EBB (Linux) thresh start/stop OR FAB match -*
37 * 37 *
38 * 28 24 20 16 12 8 4 0 38 * 28 24 20 16 12 8 4 0
39 * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | 39 * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
@@ -85,6 +85,7 @@
85 * 85 *
86 */ 86 */
87 87
88#define EVENT_EBB_MASK 1ull
88#define EVENT_THR_CMP_SHIFT 40 /* Threshold CMP value */ 89#define EVENT_THR_CMP_SHIFT 40 /* Threshold CMP value */
89#define EVENT_THR_CMP_MASK 0x3ff 90#define EVENT_THR_CMP_MASK 0x3ff
90#define EVENT_THR_CTL_SHIFT 32 /* Threshold control value (start/stop) */ 91#define EVENT_THR_CTL_SHIFT 32 /* Threshold control value (start/stop) */
@@ -109,6 +110,17 @@
109#define EVENT_IS_MARKED (EVENT_MARKED_MASK << EVENT_MARKED_SHIFT) 110#define EVENT_IS_MARKED (EVENT_MARKED_MASK << EVENT_MARKED_SHIFT)
110#define EVENT_PSEL_MASK 0xff /* PMCxSEL value */ 111#define EVENT_PSEL_MASK 0xff /* PMCxSEL value */
111 112
113#define EVENT_VALID_MASK \
114 ((EVENT_THRESH_MASK << EVENT_THRESH_SHIFT) | \
115 (EVENT_SAMPLE_MASK << EVENT_SAMPLE_SHIFT) | \
116 (EVENT_CACHE_SEL_MASK << EVENT_CACHE_SEL_SHIFT) | \
117 (EVENT_PMC_MASK << EVENT_PMC_SHIFT) | \
118 (EVENT_UNIT_MASK << EVENT_UNIT_SHIFT) | \
119 (EVENT_COMBINE_MASK << EVENT_COMBINE_SHIFT) | \
120 (EVENT_MARKED_MASK << EVENT_MARKED_SHIFT) | \
121 (EVENT_EBB_MASK << EVENT_CONFIG_EBB_SHIFT) | \
122 EVENT_PSEL_MASK)
123
112/* MMCRA IFM bits - POWER8 */ 124/* MMCRA IFM bits - POWER8 */
113#define POWER8_MMCRA_IFM1 0x0000000040000000UL 125#define POWER8_MMCRA_IFM1 0x0000000040000000UL
114#define POWER8_MMCRA_IFM2 0x0000000080000000UL 126#define POWER8_MMCRA_IFM2 0x0000000080000000UL
@@ -130,10 +142,10 @@
130 * 142 *
131 * 28 24 20 16 12 8 4 0 143 * 28 24 20 16 12 8 4 0
132 * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | 144 * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
133 * [ ] [ sample ] [ ] [6] [5] [4] [3] [2] [1] 145 * | [ ] [ sample ] [ ] [6] [5] [4] [3] [2] [1]
134 * | | 146 * EBB -* | |
135 * L1 I/D qualifier -* | Count of events for each PMC. 147 * | | Count of events for each PMC.
136 * | p1, p2, p3, p4, p5, p6. 148 * L1 I/D qualifier -* | p1, p2, p3, p4, p5, p6.
137 * nc - number of counters -* 149 * nc - number of counters -*
138 * 150 *
139 * The PMC fields P1..P6, and NC, are adder fields. As we accumulate constraints 151 * The PMC fields P1..P6, and NC, are adder fields. As we accumulate constraints
@@ -149,6 +161,9 @@
149#define CNST_THRESH_VAL(v) (((v) & EVENT_THRESH_MASK) << 32) 161#define CNST_THRESH_VAL(v) (((v) & EVENT_THRESH_MASK) << 32)
150#define CNST_THRESH_MASK CNST_THRESH_VAL(EVENT_THRESH_MASK) 162#define CNST_THRESH_MASK CNST_THRESH_VAL(EVENT_THRESH_MASK)
151 163
164#define CNST_EBB_VAL(v) (((v) & EVENT_EBB_MASK) << 24)
165#define CNST_EBB_MASK CNST_EBB_VAL(EVENT_EBB_MASK)
166
152#define CNST_L1_QUAL_VAL(v) (((v) & 3) << 22) 167#define CNST_L1_QUAL_VAL(v) (((v) & 3) << 22)
153#define CNST_L1_QUAL_MASK CNST_L1_QUAL_VAL(3) 168#define CNST_L1_QUAL_MASK CNST_L1_QUAL_VAL(3)
154 169
@@ -207,14 +222,21 @@ static inline bool event_is_fab_match(u64 event)
207 222
208static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp) 223static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
209{ 224{
210 unsigned int unit, pmc, cache; 225 unsigned int unit, pmc, cache, ebb;
211 unsigned long mask, value; 226 unsigned long mask, value;
212 227
213 mask = value = 0; 228 mask = value = 0;
214 229
215 pmc = (event >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK; 230 if (event & ~EVENT_VALID_MASK)
216 unit = (event >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK; 231 return -1;
217 cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK; 232
233 pmc = (event >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK;
234 unit = (event >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK;
235 cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK;
236 ebb = (event >> EVENT_CONFIG_EBB_SHIFT) & EVENT_EBB_MASK;
237
238 /* Clear the EBB bit in the event, so event checks work below */
239 event &= ~(EVENT_EBB_MASK << EVENT_CONFIG_EBB_SHIFT);
218 240
219 if (pmc) { 241 if (pmc) {
220 if (pmc > 6) 242 if (pmc > 6)
@@ -284,6 +306,18 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long
284 value |= CNST_THRESH_VAL(event >> EVENT_THRESH_SHIFT); 306 value |= CNST_THRESH_VAL(event >> EVENT_THRESH_SHIFT);
285 } 307 }
286 308
309 if (!pmc && ebb)
310 /* EBB events must specify the PMC */
311 return -1;
312
313 /*
314 * All events must agree on EBB, either all request it or none.
315 * EBB events are pinned & exclusive, so this should never actually
316 * hit, but we leave it as a fallback in case.
317 */
318 mask |= CNST_EBB_VAL(ebb);
319 value |= CNST_EBB_MASK;
320
287 *maskp = mask; 321 *maskp = mask;
288 *valp = value; 322 *valp = value;
289 323
@@ -378,6 +412,10 @@ static int power8_compute_mmcr(u64 event[], int n_ev,
378 if (pmc_inuse & 0x7c) 412 if (pmc_inuse & 0x7c)
379 mmcr[0] |= MMCR0_PMCjCE; 413 mmcr[0] |= MMCR0_PMCjCE;
380 414
415 /* If we're not using PMC 5 or 6, freeze them */
416 if (!(pmc_inuse & 0x60))
417 mmcr[0] |= MMCR0_FC56;
418
381 mmcr[1] = mmcr1; 419 mmcr[1] = mmcr1;
382 mmcr[2] = mmcra; 420 mmcr[2] = mmcra;
383 421
@@ -574,7 +612,7 @@ static struct power_pmu power8_pmu = {
574 .get_constraint = power8_get_constraint, 612 .get_constraint = power8_get_constraint,
575 .get_alternatives = power8_get_alternatives, 613 .get_alternatives = power8_get_alternatives,
576 .disable_pmc = power8_disable_pmc, 614 .disable_pmc = power8_disable_pmc,
577 .flags = PPMU_HAS_SSLOT | PPMU_HAS_SIER | PPMU_BHRB, 615 .flags = PPMU_HAS_SSLOT | PPMU_HAS_SIER | PPMU_BHRB | PPMU_EBB,
578 .n_generic = ARRAY_SIZE(power8_generic_events), 616 .n_generic = ARRAY_SIZE(power8_generic_events),
579 .generic_events = power8_generic_events, 617 .generic_events = power8_generic_events,
580 .attr_groups = power8_pmu_attr_groups, 618 .attr_groups = power8_pmu_attr_groups,
diff --git a/arch/powerpc/platforms/44x/currituck.c b/arch/powerpc/platforms/44x/currituck.c
index ecd3890c40d7..7f1b71a01c6a 100644
--- a/arch/powerpc/platforms/44x/currituck.c
+++ b/arch/powerpc/platforms/44x/currituck.c
@@ -91,12 +91,12 @@ static void __init ppc47x_init_irq(void)
91} 91}
92 92
93#ifdef CONFIG_SMP 93#ifdef CONFIG_SMP
94static void __cpuinit smp_ppc47x_setup_cpu(int cpu) 94static void smp_ppc47x_setup_cpu(int cpu)
95{ 95{
96 mpic_setup_this_cpu(); 96 mpic_setup_this_cpu();
97} 97}
98 98
99static int __cpuinit smp_ppc47x_kick_cpu(int cpu) 99static int smp_ppc47x_kick_cpu(int cpu)
100{ 100{
101 struct device_node *cpunode = of_get_cpu_node(cpu, NULL); 101 struct device_node *cpunode = of_get_cpu_node(cpu, NULL);
102 const u64 *spin_table_addr_prop; 102 const u64 *spin_table_addr_prop;
@@ -176,13 +176,48 @@ static int __init ppc47x_probe(void)
176 return 1; 176 return 1;
177} 177}
178 178
179static int board_rev = -1;
180static int __init ppc47x_get_board_rev(void)
181{
182 u8 fpga_reg0;
183 void *fpga;
184 struct device_node *np;
185
186 np = of_find_compatible_node(NULL, NULL, "ibm,currituck-fpga");
187 if (!np)
188 goto fail;
189
190 fpga = of_iomap(np, 0);
191 of_node_put(np);
192 if (!fpga)
193 goto fail;
194
195 fpga_reg0 = ioread8(fpga);
196 board_rev = fpga_reg0 & 0x03;
197 pr_info("%s: Found board revision %d\n", __func__, board_rev);
198 iounmap(fpga);
199 return 0;
200
201fail:
202 pr_info("%s: Unable to find board revision\n", __func__);
203 return 0;
204}
205machine_arch_initcall(ppc47x, ppc47x_get_board_rev);
206
 179/* The USB controller should have been hardware swizzled but it wasn't :( */ 207
180static void ppc47x_pci_irq_fixup(struct pci_dev *dev) 208static void ppc47x_pci_irq_fixup(struct pci_dev *dev)
181{ 209{
182 if (dev->vendor == 0x1033 && (dev->device == 0x0035 || 210 if (dev->vendor == 0x1033 && (dev->device == 0x0035 ||
183 dev->device == 0x00e0)) { 211 dev->device == 0x00e0)) {
184 dev->irq = irq_create_mapping(NULL, 47); 212 if (board_rev == 0) {
185 pr_info("%s: Mapping irq 47 %d\n", __func__, dev->irq); 213 dev->irq = irq_create_mapping(NULL, 47);
214 pr_info("%s: Mapping irq %d\n", __func__, dev->irq);
215 } else if (board_rev == 2) {
216 dev->irq = irq_create_mapping(NULL, 49);
217 pr_info("%s: Mapping irq %d\n", __func__, dev->irq);
218 } else {
219 pr_alert("%s: Unknown board revision\n", __func__);
220 }
186 } 221 }
187} 222}
188 223
diff --git a/arch/powerpc/platforms/44x/iss4xx.c b/arch/powerpc/platforms/44x/iss4xx.c
index a28a8629727e..4241bc825800 100644
--- a/arch/powerpc/platforms/44x/iss4xx.c
+++ b/arch/powerpc/platforms/44x/iss4xx.c
@@ -81,12 +81,12 @@ static void __init iss4xx_init_irq(void)
81} 81}
82 82
83#ifdef CONFIG_SMP 83#ifdef CONFIG_SMP
84static void __cpuinit smp_iss4xx_setup_cpu(int cpu) 84static void smp_iss4xx_setup_cpu(int cpu)
85{ 85{
86 mpic_setup_this_cpu(); 86 mpic_setup_this_cpu();
87} 87}
88 88
89static int __cpuinit smp_iss4xx_kick_cpu(int cpu) 89static int smp_iss4xx_kick_cpu(int cpu)
90{ 90{
91 struct device_node *cpunode = of_get_cpu_node(cpu, NULL); 91 struct device_node *cpunode = of_get_cpu_node(cpu, NULL);
92 const u64 *spin_table_addr_prop; 92 const u64 *spin_table_addr_prop;
diff --git a/arch/powerpc/platforms/512x/mpc5121_ads.c b/arch/powerpc/platforms/512x/mpc5121_ads.c
index 0a134e0469ef..3e90ece10ae9 100644
--- a/arch/powerpc/platforms/512x/mpc5121_ads.c
+++ b/arch/powerpc/platforms/512x/mpc5121_ads.c
@@ -43,9 +43,7 @@ static void __init mpc5121_ads_setup_arch(void)
43 mpc83xx_add_bridge(np); 43 mpc83xx_add_bridge(np);
44#endif 44#endif
45 45
46#if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE) 46 mpc512x_setup_arch();
47 mpc512x_setup_diu();
48#endif
49} 47}
50 48
51static void __init mpc5121_ads_init_IRQ(void) 49static void __init mpc5121_ads_init_IRQ(void)
@@ -69,7 +67,7 @@ define_machine(mpc5121_ads) {
69 .probe = mpc5121_ads_probe, 67 .probe = mpc5121_ads_probe,
70 .setup_arch = mpc5121_ads_setup_arch, 68 .setup_arch = mpc5121_ads_setup_arch,
71 .init = mpc512x_init, 69 .init = mpc512x_init,
72 .init_early = mpc512x_init_diu, 70 .init_early = mpc512x_init_early,
73 .init_IRQ = mpc5121_ads_init_IRQ, 71 .init_IRQ = mpc5121_ads_init_IRQ,
74 .get_irq = ipic_get_irq, 72 .get_irq = ipic_get_irq,
75 .calibrate_decr = generic_calibrate_decr, 73 .calibrate_decr = generic_calibrate_decr,
diff --git a/arch/powerpc/platforms/512x/mpc512x.h b/arch/powerpc/platforms/512x/mpc512x.h
index 0a8e60023944..cc97f022d028 100644
--- a/arch/powerpc/platforms/512x/mpc512x.h
+++ b/arch/powerpc/platforms/512x/mpc512x.h
@@ -12,18 +12,12 @@
12#ifndef __MPC512X_H__ 12#ifndef __MPC512X_H__
13#define __MPC512X_H__ 13#define __MPC512X_H__
14extern void __init mpc512x_init_IRQ(void); 14extern void __init mpc512x_init_IRQ(void);
15extern void __init mpc512x_init_early(void);
15extern void __init mpc512x_init(void); 16extern void __init mpc512x_init(void);
17extern void __init mpc512x_setup_arch(void);
16extern int __init mpc5121_clk_init(void); 18extern int __init mpc5121_clk_init(void);
17void __init mpc512x_declare_of_platform_devices(void);
18extern const char *mpc512x_select_psc_compat(void); 19extern const char *mpc512x_select_psc_compat(void);
20extern const char *mpc512x_select_reset_compat(void);
19extern void mpc512x_restart(char *cmd); 21extern void mpc512x_restart(char *cmd);
20 22
21#if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE)
22void mpc512x_init_diu(void);
23void mpc512x_setup_diu(void);
24#else
25#define mpc512x_init_diu NULL
26#define mpc512x_setup_diu NULL
27#endif
28
29#endif /* __MPC512X_H__ */ 23#endif /* __MPC512X_H__ */
diff --git a/arch/powerpc/platforms/512x/mpc512x_generic.c b/arch/powerpc/platforms/512x/mpc512x_generic.c
index 5fb919b30924..ce71408781a0 100644
--- a/arch/powerpc/platforms/512x/mpc512x_generic.c
+++ b/arch/powerpc/platforms/512x/mpc512x_generic.c
@@ -45,8 +45,8 @@ define_machine(mpc512x_generic) {
45 .name = "MPC512x generic", 45 .name = "MPC512x generic",
46 .probe = mpc512x_generic_probe, 46 .probe = mpc512x_generic_probe,
47 .init = mpc512x_init, 47 .init = mpc512x_init,
48 .init_early = mpc512x_init_diu, 48 .init_early = mpc512x_init_early,
49 .setup_arch = mpc512x_setup_diu, 49 .setup_arch = mpc512x_setup_arch,
50 .init_IRQ = mpc512x_init_IRQ, 50 .init_IRQ = mpc512x_init_IRQ,
51 .get_irq = ipic_get_irq, 51 .get_irq = ipic_get_irq,
52 .calibrate_decr = generic_calibrate_decr, 52 .calibrate_decr = generic_calibrate_decr,
diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c
index 6eb94ab99d39..a82a41b4fd91 100644
--- a/arch/powerpc/platforms/512x/mpc512x_shared.c
+++ b/arch/powerpc/platforms/512x/mpc512x_shared.c
@@ -35,8 +35,10 @@ static struct mpc512x_reset_module __iomem *reset_module_base;
35static void __init mpc512x_restart_init(void) 35static void __init mpc512x_restart_init(void)
36{ 36{
37 struct device_node *np; 37 struct device_node *np;
38 const char *reset_compat;
38 39
39 np = of_find_compatible_node(NULL, NULL, "fsl,mpc5121-reset"); 40 reset_compat = mpc512x_select_reset_compat();
41 np = of_find_compatible_node(NULL, NULL, reset_compat);
40 if (!np) 42 if (!np)
41 return; 43 return;
42 44
@@ -58,7 +60,7 @@ void mpc512x_restart(char *cmd)
58 ; 60 ;
59} 61}
60 62
61#if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE) 63#if IS_ENABLED(CONFIG_FB_FSL_DIU)
62 64
63struct fsl_diu_shared_fb { 65struct fsl_diu_shared_fb {
64 u8 gamma[0x300]; /* 32-bit aligned! */ 66 u8 gamma[0x300]; /* 32-bit aligned! */
@@ -355,6 +357,17 @@ const char *mpc512x_select_psc_compat(void)
355 return NULL; 357 return NULL;
356} 358}
357 359
360const char *mpc512x_select_reset_compat(void)
361{
362 if (of_machine_is_compatible("fsl,mpc5121"))
363 return "fsl,mpc5121-reset";
364
365 if (of_machine_is_compatible("fsl,mpc5125"))
366 return "fsl,mpc5125-reset";
367
368 return NULL;
369}
370
358static unsigned int __init get_fifo_size(struct device_node *np, 371static unsigned int __init get_fifo_size(struct device_node *np,
359 char *prop_name) 372 char *prop_name)
360{ 373{
@@ -436,14 +449,26 @@ void __init mpc512x_psc_fifo_init(void)
436 } 449 }
437} 450}
438 451
452void __init mpc512x_init_early(void)
453{
454 mpc512x_restart_init();
455 if (IS_ENABLED(CONFIG_FB_FSL_DIU))
456 mpc512x_init_diu();
457}
458
439void __init mpc512x_init(void) 459void __init mpc512x_init(void)
440{ 460{
441 mpc5121_clk_init(); 461 mpc5121_clk_init();
442 mpc512x_declare_of_platform_devices(); 462 mpc512x_declare_of_platform_devices();
443 mpc512x_restart_init();
444 mpc512x_psc_fifo_init(); 463 mpc512x_psc_fifo_init();
445} 464}
446 465
466void __init mpc512x_setup_arch(void)
467{
468 if (IS_ENABLED(CONFIG_FB_FSL_DIU))
469 mpc512x_setup_diu();
470}
471
447/** 472/**
448 * mpc512x_cs_config - Setup chip select configuration 473 * mpc512x_cs_config - Setup chip select configuration
449 * @cs: chip select number 474 * @cs: chip select number
diff --git a/arch/powerpc/platforms/512x/pdm360ng.c b/arch/powerpc/platforms/512x/pdm360ng.c
index 0575e858291c..24b314d7bd5f 100644
--- a/arch/powerpc/platforms/512x/pdm360ng.c
+++ b/arch/powerpc/platforms/512x/pdm360ng.c
@@ -119,9 +119,9 @@ static int __init pdm360ng_probe(void)
119define_machine(pdm360ng) { 119define_machine(pdm360ng) {
120 .name = "PDM360NG", 120 .name = "PDM360NG",
121 .probe = pdm360ng_probe, 121 .probe = pdm360ng_probe,
122 .setup_arch = mpc512x_setup_diu, 122 .setup_arch = mpc512x_setup_arch,
123 .init = pdm360ng_init, 123 .init = pdm360ng_init,
124 .init_early = mpc512x_init_diu, 124 .init_early = mpc512x_init_early,
125 .init_IRQ = mpc512x_init_IRQ, 125 .init_IRQ = mpc512x_init_IRQ,
126 .get_irq = ipic_get_irq, 126 .get_irq = ipic_get_irq,
127 .calibrate_decr = generic_calibrate_decr, 127 .calibrate_decr = generic_calibrate_decr,
diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
index 624cb51d19c9..7bc315822935 100644
--- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
+++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
@@ -231,17 +231,7 @@ static struct i2c_driver mcu_driver = {
231 .id_table = mcu_ids, 231 .id_table = mcu_ids,
232}; 232};
233 233
234static int __init mcu_init(void) 234module_i2c_driver(mcu_driver);
235{
236 return i2c_add_driver(&mcu_driver);
237}
238module_init(mcu_init);
239
240static void __exit mcu_exit(void)
241{
242 i2c_del_driver(&mcu_driver);
243}
244module_exit(mcu_exit);
245 235
246MODULE_DESCRIPTION("Power Management and GPIO expander driver for " 236MODULE_DESCRIPTION("Power Management and GPIO expander driver for "
247 "MPC8349E-mITX-compatible MCU"); 237 "MPC8349E-mITX-compatible MCU");
diff --git a/arch/powerpc/platforms/85xx/p5020_ds.c b/arch/powerpc/platforms/85xx/p5020_ds.c
index 753a42c29d4d..39cfa4044e6c 100644
--- a/arch/powerpc/platforms/85xx/p5020_ds.c
+++ b/arch/powerpc/platforms/85xx/p5020_ds.c
@@ -75,12 +75,7 @@ define_machine(p5020_ds) {
75#ifdef CONFIG_PCI 75#ifdef CONFIG_PCI
76 .pcibios_fixup_bus = fsl_pcibios_fixup_bus, 76 .pcibios_fixup_bus = fsl_pcibios_fixup_bus,
77#endif 77#endif
78/* coreint doesn't play nice with lazy EE, use legacy mpic for now */
79#ifdef CONFIG_PPC64
80 .get_irq = mpic_get_irq,
81#else
82 .get_irq = mpic_get_coreint_irq, 78 .get_irq = mpic_get_coreint_irq,
83#endif
84 .restart = fsl_rstcr_restart, 79 .restart = fsl_rstcr_restart,
85 .calibrate_decr = generic_calibrate_decr, 80 .calibrate_decr = generic_calibrate_decr,
86 .progress = udbg_progress, 81 .progress = udbg_progress,
diff --git a/arch/powerpc/platforms/85xx/p5040_ds.c b/arch/powerpc/platforms/85xx/p5040_ds.c
index 11381851828e..f70e74cddf97 100644
--- a/arch/powerpc/platforms/85xx/p5040_ds.c
+++ b/arch/powerpc/platforms/85xx/p5040_ds.c
@@ -66,12 +66,7 @@ define_machine(p5040_ds) {
66#ifdef CONFIG_PCI 66#ifdef CONFIG_PCI
67 .pcibios_fixup_bus = fsl_pcibios_fixup_bus, 67 .pcibios_fixup_bus = fsl_pcibios_fixup_bus,
68#endif 68#endif
69/* coreint doesn't play nice with lazy EE, use legacy mpic for now */
70#ifdef CONFIG_PPC64
71 .get_irq = mpic_get_irq,
72#else
73 .get_irq = mpic_get_coreint_irq, 69 .get_irq = mpic_get_coreint_irq,
74#endif
75 .restart = fsl_rstcr_restart, 70 .restart = fsl_rstcr_restart,
76 .calibrate_decr = generic_calibrate_decr, 71 .calibrate_decr = generic_calibrate_decr,
77 .progress = udbg_progress, 72 .progress = udbg_progress,
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 6a1759939c6b..5ced4f5bb2b2 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -99,7 +99,7 @@ static void mpc85xx_take_timebase(void)
99} 99}
100 100
101#ifdef CONFIG_HOTPLUG_CPU 101#ifdef CONFIG_HOTPLUG_CPU
102static void __cpuinit smp_85xx_mach_cpu_die(void) 102static void smp_85xx_mach_cpu_die(void)
103{ 103{
104 unsigned int cpu = smp_processor_id(); 104 unsigned int cpu = smp_processor_id();
105 u32 tmp; 105 u32 tmp;
@@ -141,7 +141,7 @@ static inline u32 read_spin_table_addr_l(void *spin_table)
141 return in_be32(&((struct epapr_spin_table *)spin_table)->addr_l); 141 return in_be32(&((struct epapr_spin_table *)spin_table)->addr_l);
142} 142}
143 143
144static int __cpuinit smp_85xx_kick_cpu(int nr) 144static int smp_85xx_kick_cpu(int nr)
145{ 145{
146 unsigned long flags; 146 unsigned long flags;
147 const u64 *cpu_rel_addr; 147 const u64 *cpu_rel_addr;
@@ -362,7 +362,7 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image)
362} 362}
363#endif /* CONFIG_KEXEC */ 363#endif /* CONFIG_KEXEC */
364 364
365static void __cpuinit smp_85xx_setup_cpu(int cpu_nr) 365static void smp_85xx_setup_cpu(int cpu_nr)
366{ 366{
367 if (smp_85xx_ops.probe == smp_mpic_probe) 367 if (smp_85xx_ops.probe == smp_mpic_probe)
368 mpic_setup_this_cpu(); 368 mpic_setup_this_cpu();
diff --git a/arch/powerpc/platforms/85xx/t4240_qds.c b/arch/powerpc/platforms/85xx/t4240_qds.c
index 5998e9f33304..91ead6b1b8af 100644
--- a/arch/powerpc/platforms/85xx/t4240_qds.c
+++ b/arch/powerpc/platforms/85xx/t4240_qds.c
@@ -75,12 +75,7 @@ define_machine(t4240_qds) {
75#ifdef CONFIG_PCI 75#ifdef CONFIG_PCI
76 .pcibios_fixup_bus = fsl_pcibios_fixup_bus, 76 .pcibios_fixup_bus = fsl_pcibios_fixup_bus,
77#endif 77#endif
78/* coreint doesn't play nice with lazy EE, use legacy mpic for now */
79#ifdef CONFIG_PPC64
80 .get_irq = mpic_get_irq,
81#else
82 .get_irq = mpic_get_coreint_irq, 78 .get_irq = mpic_get_coreint_irq,
83#endif
84 .restart = fsl_rstcr_restart, 79 .restart = fsl_rstcr_restart,
85 .calibrate_decr = generic_calibrate_decr, 80 .calibrate_decr = generic_calibrate_decr,
86 .progress = udbg_progress, 81 .progress = udbg_progress,
diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c
index 1e121088826f..587a2828b06c 100644
--- a/arch/powerpc/platforms/8xx/m8xx_setup.c
+++ b/arch/powerpc/platforms/8xx/m8xx_setup.c
@@ -43,6 +43,7 @@ static irqreturn_t timebase_interrupt(int irq, void *dev)
43 43
44static struct irqaction tbint_irqaction = { 44static struct irqaction tbint_irqaction = {
45 .handler = timebase_interrupt, 45 .handler = timebase_interrupt,
46 .flags = IRQF_NO_THREAD,
46 .name = "tbint", 47 .name = "tbint",
47}; 48};
48 49
@@ -218,19 +219,12 @@ void mpc8xx_restart(char *cmd)
218 219
219static void cpm_cascade(unsigned int irq, struct irq_desc *desc) 220static void cpm_cascade(unsigned int irq, struct irq_desc *desc)
220{ 221{
221 struct irq_chip *chip; 222 struct irq_chip *chip = irq_desc_get_chip(desc);
222 int cascade_irq; 223 int cascade_irq = cpm_get_irq();
223
224 if ((cascade_irq = cpm_get_irq()) >= 0) {
225 struct irq_desc *cdesc = irq_to_desc(cascade_irq);
226 224
225 if (cascade_irq >= 0)
227 generic_handle_irq(cascade_irq); 226 generic_handle_irq(cascade_irq);
228 227
229 chip = irq_desc_get_chip(cdesc);
230 chip->irq_eoi(&cdesc->irq_data);
231 }
232
233 chip = irq_desc_get_chip(desc);
234 chip->irq_eoi(&desc->irq_data); 228 chip->irq_eoi(&desc->irq_data);
235} 229}
236 230
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index e17cdfc5ba40..d703775bda30 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -86,6 +86,27 @@ config MPIC
 	bool
 	default n
 
+config MPIC_TIMER
+	bool "MPIC Global Timer"
+	depends on MPIC && FSL_SOC
+	default n
+	help
+	  The MPIC global timer is a hardware timer inside the
+	  Freescale PIC complying with the OpenPIC standard. When the
+	  specified interval times out, the hardware timer generates
+	  an interrupt. The driver has currently only been tested on
+	  Freescale chips, but it can potentially support other
+	  global timers complying with the OpenPIC standard.
+
+config FSL_MPIC_TIMER_WAKEUP
+	tristate "Freescale MPIC global timer wakeup driver"
+	depends on FSL_SOC && MPIC_TIMER && PM
+	default n
+	help
+	  The driver provides a way to wake up the system by MPIC
+	  timer.
+	  e.g. "echo 5 > /sys/devices/system/mpic/timer_wakeup"
+
 config PPC_EPAPR_HV_PIC
 	bool
 	default n
@@ -164,6 +185,11 @@ config IBMEBUS
 	help
 	  Bus device driver for GX bus based adapters.
 
+config EEH
+	bool
+	depends on (PPC_POWERNV || PPC_PSERIES) && PCI
+	default y
+
 config PPC_MPC106
 	bool
 	default n
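The MPIC_TIMER and FSL_MPIC_TIMER_WAKEUP options added above pair with the MPIC global timer API declared in arch/powerpc/include/asm/mpic_timer.h (added elsewhere in this series). A minimal sketch of arming a one-shot timer, assuming that interface; the handler name and the 5-second interval are illustrative only:

#include <linux/interrupt.h>
#include <linux/time.h>
#include <asm/mpic_timer.h>

/* Illustrative handler; runs when the global timer interval expires */
static irqreturn_t demo_timeout(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int demo_arm_mpic_timer(void)
{
	struct timeval interval = { .tv_sec = 5, .tv_usec = 0 };
	struct mpic_timer *t;

	/* Allocate a free global timer and bind the handler to it */
	t = mpic_request_timer(demo_timeout, NULL, &interval);
	if (!t)
		return -EINVAL;

	mpic_start_timer(t);	/* interrupt fires after ~5 seconds */
	return 0;
}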
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 7819c40a6bc3..47d9a03dd415 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -71,6 +71,7 @@ config PPC_BOOK3S_64
 	select PPC_FPU
 	select PPC_HAVE_PMU_SUPPORT
 	select SYS_SUPPORTS_HUGETLBFS
+	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
 
 config PPC_BOOK3E_64
 	bool "Embedded processors"
diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c
index 246e1d8b3af3..c34ee4e60873 100644
--- a/arch/powerpc/platforms/cell/beat_htab.c
+++ b/arch/powerpc/platforms/cell/beat_htab.c
@@ -185,7 +185,8 @@ static void beat_lpar_hptab_clear(void)
 static long beat_lpar_hpte_updatepp(unsigned long slot,
 				    unsigned long newpp,
 				    unsigned long vpn,
-				    int psize, int ssize, int local)
+				    int psize, int apsize,
+				    int ssize, int local)
 {
 	unsigned long lpar_rc;
 	u64 dummy0, dummy1;
@@ -274,7 +275,8 @@ static void beat_lpar_hpte_updateboltedpp(unsigned long newpp,
 }
 
 static void beat_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
-				      int psize, int ssize, int local)
+				      int psize, int apsize,
+				      int ssize, int local)
 {
 	unsigned long want_v;
 	unsigned long lpar_rc;
@@ -364,9 +366,10 @@ static long beat_lpar_hpte_insert_v3(unsigned long hpte_group,
  * already zero. For now I am paranoid.
  */
 static long beat_lpar_hpte_updatepp_v3(unsigned long slot,
 					unsigned long newpp,
 					unsigned long vpn,
-					int psize, int ssize, int local)
+					int psize, int apsize,
+					int ssize, int local)
 {
 	unsigned long lpar_rc;
 	unsigned long want_v;
@@ -394,7 +397,8 @@ static long beat_lpar_hpte_updatepp_v3(unsigned long slot,
 }
 
 static void beat_lpar_hpte_invalidate_v3(unsigned long slot, unsigned long vpn,
-					 int psize, int ssize, int local)
+					 int psize, int apsize,
+					 int ssize, int local)
 {
 	unsigned long want_v;
 	unsigned long lpar_rc;
diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c
index d35dbbc8ec79..f75f6fcac729 100644
--- a/arch/powerpc/platforms/cell/smp.c
+++ b/arch/powerpc/platforms/cell/smp.c
@@ -142,7 +142,7 @@ static int smp_cell_cpu_bootable(unsigned int nr)
 	 * during boot if the user requests it.  Odd-numbered
 	 * cpus are assumed to be secondary threads.
 	 */
-	if (system_state < SYSTEM_RUNNING &&
+	if (system_state == SYSTEM_BOOTING &&
 	    cpu_has_feature(CPU_FTR_SMT) &&
 	    !smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
 		return 0;
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index bdb738a69e41..49c9f9501c21 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -885,7 +885,7 @@ static int smp_core99_cpu_notify(struct notifier_block *self,
 		return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata smp_core99_cpu_nb = {
+static struct notifier_block smp_core99_cpu_nb = {
 	.notifier_call	= smp_core99_cpu_notify,
 };
 #endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index bcc3cb48a44e..7fe595152478 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -3,3 +3,4 @@ obj-y += opal-rtc.o opal-nvram.o
 
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
+obj-$(CONFIG_EEH)	+= eeh-ioda.o eeh-powernv.o
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
new file mode 100644
index 000000000000..0cd1c4a71755
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -0,0 +1,916 @@
1/*
2 * This file implements the functions needed by EEH on IODA-compliant
3 * chips. Most of the EEH functionality here is built on top of the
4 * OPAL APIs.
5 *
6 * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 */
13
14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/delay.h>
17#include <linux/init.h>
18#include <linux/io.h>
19#include <linux/irq.h>
20#include <linux/kernel.h>
21#include <linux/msi.h>
22#include <linux/notifier.h>
23#include <linux/pci.h>
24#include <linux/string.h>
25
26#include <asm/eeh.h>
27#include <asm/eeh_event.h>
28#include <asm/io.h>
29#include <asm/iommu.h>
30#include <asm/msi_bitmap.h>
31#include <asm/opal.h>
32#include <asm/pci-bridge.h>
33#include <asm/ppc-pci.h>
34#include <asm/tce.h>
35
36#include "powernv.h"
37#include "pci.h"
38
39/* Debugging option */
40#ifdef IODA_EEH_DBG_ON
41#define IODA_EEH_DBG(args...) pr_info(args)
42#else
43#define IODA_EEH_DBG(args...)
44#endif
45
46static char *hub_diag = NULL;
47static int ioda_eeh_nb_init = 0;
48
49static int ioda_eeh_event(struct notifier_block *nb,
50 unsigned long events, void *change)
51{
52 uint64_t changed_evts = (uint64_t)change;
53
54 /* We simply send special EEH event */
55 if ((changed_evts & OPAL_EVENT_PCI_ERROR) &&
56 (events & OPAL_EVENT_PCI_ERROR))
57 eeh_send_failure_event(NULL);
58
59 return 0;
60}
61
62static struct notifier_block ioda_eeh_nb = {
63 .notifier_call = ioda_eeh_event,
64 .next = NULL,
65 .priority = 0
66};
67
68#ifdef CONFIG_DEBUG_FS
69static int ioda_eeh_dbgfs_set(void *data, u64 val)
70{
71 struct pci_controller *hose = data;
72 struct pnv_phb *phb = hose->private_data;
73
74 out_be64(phb->regs + 0xD10, val);
75 return 0;
76}
77
78static int ioda_eeh_dbgfs_get(void *data, u64 *val)
79{
80 struct pci_controller *hose = data;
81 struct pnv_phb *phb = hose->private_data;
82
83 *val = in_be64(phb->regs + 0xD10);
84 return 0;
85}
86
87DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_dbgfs_ops, ioda_eeh_dbgfs_get,
88 ioda_eeh_dbgfs_set, "0x%llx\n");
89#endif /* CONFIG_DEBUG_FS */
90
91/**
92 * ioda_eeh_post_init - Chip dependent post initialization
93 * @hose: PCI controller
94 *
95 * The function will be called after the EEH PEs and devices
96 * have been built, which means EEH is ready to supply
97 * service with its I/O cache in place.
98 */
99static int ioda_eeh_post_init(struct pci_controller *hose)
100{
101 struct pnv_phb *phb = hose->private_data;
102 int ret;
103
104 /* Register OPAL event notifier */
105 if (!ioda_eeh_nb_init) {
106 ret = opal_notifier_register(&ioda_eeh_nb);
107 if (ret) {
108 pr_err("%s: Can't register OPAL event notifier (%d)\n",
109 __func__, ret);
110 return ret;
111 }
112
113 ioda_eeh_nb_init = 1;
114 }
115
116 /* FIXME: Enable it for PHB3 later */
117 if (phb->type == PNV_PHB_IODA1) {
118 if (!hub_diag) {
119 hub_diag = (char *)__get_free_page(GFP_KERNEL |
120 __GFP_ZERO);
121 if (!hub_diag) {
122 pr_err("%s: Out of memory !\n",
123 __func__);
124 return -ENOMEM;
125 }
126 }
127
128#ifdef CONFIG_DEBUG_FS
129 if (phb->dbgfs)
130 debugfs_create_file("err_injct", 0600,
131 phb->dbgfs, hose,
132 &ioda_eeh_dbgfs_ops);
133#endif
134
135 phb->eeh_state |= PNV_EEH_STATE_ENABLED;
136 }
137
138 return 0;
139}
140
141/**
142 * ioda_eeh_set_option - Set EEH operation or I/O setting
143 * @pe: EEH PE
144 * @option: options
145 *
146 * Enable or disable EEH option for the indicated PE. The
147 * function also can be used to enable I/O or DMA for the
148 * PE.
149 */
150static int ioda_eeh_set_option(struct eeh_pe *pe, int option)
151{
152 s64 ret;
153 u32 pe_no;
154 struct pci_controller *hose = pe->phb;
155 struct pnv_phb *phb = hose->private_data;
156
157 /* Check on PE number */
158 if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) {
159 pr_err("%s: PE address %x out of range [0, %x] "
160 "on PHB#%x\n",
161 __func__, pe->addr, phb->ioda.total_pe,
162 hose->global_number);
163 return -EINVAL;
164 }
165
166 pe_no = pe->addr;
167 switch (option) {
168 case EEH_OPT_DISABLE:
169 ret = -EEXIST;
170 break;
171 case EEH_OPT_ENABLE:
172 ret = 0;
173 break;
174 case EEH_OPT_THAW_MMIO:
175 ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
176 OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO);
177 if (ret) {
178 pr_warning("%s: Failed to enable MMIO for "
179 "PHB#%x-PE#%x, err=%lld\n",
180 __func__, hose->global_number, pe_no, ret);
181 return -EIO;
182 }
183
184 break;
185 case EEH_OPT_THAW_DMA:
186 ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
187 OPAL_EEH_ACTION_CLEAR_FREEZE_DMA);
188 if (ret) {
189 pr_warning("%s: Failed to enable DMA for "
190 "PHB#%x-PE#%x, err=%lld\n",
191 __func__, hose->global_number, pe_no, ret);
192 return -EIO;
193 }
194
195 break;
196 default:
197 pr_warning("%s: Invalid option %d\n", __func__, option);
198 return -EINVAL;
199 }
200
201 return ret;
202}
203
204/**
205 * ioda_eeh_get_state - Retrieve the state of PE
206 * @pe: EEH PE
207 *
208 * The PE's state should be retrieved from the PEEV and PEST
209 * IODA tables. Since OPAL exports a function to do exactly
210 * that, we use it here.
211 */
212static int ioda_eeh_get_state(struct eeh_pe *pe)
213{
214 s64 ret = 0;
215 u8 fstate;
216 u16 pcierr;
217 u32 pe_no;
218 int result;
219 struct pci_controller *hose = pe->phb;
220 struct pnv_phb *phb = hose->private_data;
221
222 /*
223 * Sanity check on PE address. The PHB PE address should
224 * be zero.
225 */
226 if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) {
227 pr_err("%s: PE address %x out of range [0, %x] "
228 "on PHB#%x\n",
229 __func__, pe->addr, phb->ioda.total_pe,
230 hose->global_number);
231 return EEH_STATE_NOT_SUPPORT;
232 }
233
234 /* Retrieve PE status through OPAL */
235 pe_no = pe->addr;
236 ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
237 &fstate, &pcierr, NULL);
238 if (ret) {
239 pr_err("%s: Failed to get EEH status on "
240 "PHB#%x-PE#%x\n, err=%lld\n",
241 __func__, hose->global_number, pe_no, ret);
242 return EEH_STATE_NOT_SUPPORT;
243 }
244
245 /* Check PHB status */
246 if (pe->type & EEH_PE_PHB) {
247 result = 0;
248 result &= ~EEH_STATE_RESET_ACTIVE;
249
250 if (pcierr != OPAL_EEH_PHB_ERROR) {
251 result |= EEH_STATE_MMIO_ACTIVE;
252 result |= EEH_STATE_DMA_ACTIVE;
253 result |= EEH_STATE_MMIO_ENABLED;
254 result |= EEH_STATE_DMA_ENABLED;
255 }
256
257 return result;
258 }
259
260 /* Parse result out */
261 result = 0;
262 switch (fstate) {
263 case OPAL_EEH_STOPPED_NOT_FROZEN:
264 result &= ~EEH_STATE_RESET_ACTIVE;
265 result |= EEH_STATE_MMIO_ACTIVE;
266 result |= EEH_STATE_DMA_ACTIVE;
267 result |= EEH_STATE_MMIO_ENABLED;
268 result |= EEH_STATE_DMA_ENABLED;
269 break;
270 case OPAL_EEH_STOPPED_MMIO_FREEZE:
271 result &= ~EEH_STATE_RESET_ACTIVE;
272 result |= EEH_STATE_DMA_ACTIVE;
273 result |= EEH_STATE_DMA_ENABLED;
274 break;
275 case OPAL_EEH_STOPPED_DMA_FREEZE:
276 result &= ~EEH_STATE_RESET_ACTIVE;
277 result |= EEH_STATE_MMIO_ACTIVE;
278 result |= EEH_STATE_MMIO_ENABLED;
279 break;
280 case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE:
281 result &= ~EEH_STATE_RESET_ACTIVE;
282 break;
283 case OPAL_EEH_STOPPED_RESET:
284 result |= EEH_STATE_RESET_ACTIVE;
285 break;
286 case OPAL_EEH_STOPPED_TEMP_UNAVAIL:
287 result |= EEH_STATE_UNAVAILABLE;
288 break;
289 case OPAL_EEH_STOPPED_PERM_UNAVAIL:
290 result |= EEH_STATE_NOT_SUPPORT;
291 break;
292 default:
293 pr_warning("%s: Unexpected EEH status 0x%x "
294 "on PHB#%x-PE#%x\n",
295 __func__, fstate, hose->global_number, pe_no);
296 }
297
298 return result;
299}
300
301static int ioda_eeh_pe_clear(struct eeh_pe *pe)
302{
303 struct pci_controller *hose;
304 struct pnv_phb *phb;
305 u32 pe_no;
306 u8 fstate;
307 u16 pcierr;
308 s64 ret;
309
310 pe_no = pe->addr;
311 hose = pe->phb;
312 phb = pe->phb->private_data;
313
314 /* Clear the EEH error on the PE */
315 ret = opal_pci_eeh_freeze_clear(phb->opal_id,
316 pe_no, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
317 if (ret) {
318 pr_err("%s: Failed to clear EEH error for "
319 "PHB#%x-PE#%x, err=%lld\n",
320 __func__, hose->global_number, pe_no, ret);
321 return -EIO;
322 }
323
324 /*
325 * Read the PE state back and verify that the frozen
326 * state has been removed.
327 */
328 ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
329 &fstate, &pcierr, NULL);
330 if (ret) {
331 pr_err("%s: Failed to get EEH status on "
332 "PHB#%x-PE#%x\n, err=%lld\n",
333 __func__, hose->global_number, pe_no, ret);
334 return -EIO;
335 }
336
337 if (fstate != OPAL_EEH_STOPPED_NOT_FROZEN) {
338 pr_err("%s: Frozen state not cleared on "
339 "PHB#%x-PE#%x, sts=%x\n",
340 __func__, hose->global_number, pe_no, fstate);
341 return -EIO;
342 }
343
344 return 0;
345}
346
347static s64 ioda_eeh_phb_poll(struct pnv_phb *phb)
348{
349 s64 rc = OPAL_HARDWARE;
350
351 while (1) {
352 rc = opal_pci_poll(phb->opal_id);
353 if (rc <= 0)
354 break;
355
356 msleep(rc);
357 }
358
359 return rc;
360}
361
362static int ioda_eeh_phb_reset(struct pci_controller *hose, int option)
363{
364 struct pnv_phb *phb = hose->private_data;
365 s64 rc = OPAL_HARDWARE;
366
367 pr_debug("%s: Reset PHB#%x, option=%d\n",
368 __func__, hose->global_number, option);
369
370 /* Issue PHB complete reset request */
371 if (option == EEH_RESET_FUNDAMENTAL ||
372 option == EEH_RESET_HOT)
373 rc = opal_pci_reset(phb->opal_id,
374 OPAL_PHB_COMPLETE,
375 OPAL_ASSERT_RESET);
376 else if (option == EEH_RESET_DEACTIVATE)
377 rc = opal_pci_reset(phb->opal_id,
378 OPAL_PHB_COMPLETE,
379 OPAL_DEASSERT_RESET);
380 if (rc < 0)
381 goto out;
382
383 /*
384 * Poll state of the PHB until the request is done
385 * successfully.
386 */
387 rc = ioda_eeh_phb_poll(phb);
388out:
389 if (rc != OPAL_SUCCESS)
390 return -EIO;
391
392 return 0;
393}
394
395static int ioda_eeh_root_reset(struct pci_controller *hose, int option)
396{
397 struct pnv_phb *phb = hose->private_data;
398 s64 rc = OPAL_SUCCESS;
399
400 pr_debug("%s: Reset PHB#%x, option=%d\n",
401 __func__, hose->global_number, option);
402
403 /*
404 * During the reset deassert time, we needn't care
405 * the reset scope because the firmware does nothing
406 * for fundamental or hot reset during deassert phase.
407 */
408 if (option == EEH_RESET_FUNDAMENTAL)
409 rc = opal_pci_reset(phb->opal_id,
410 OPAL_PCI_FUNDAMENTAL_RESET,
411 OPAL_ASSERT_RESET);
412 else if (option == EEH_RESET_HOT)
413 rc = opal_pci_reset(phb->opal_id,
414 OPAL_PCI_HOT_RESET,
415 OPAL_ASSERT_RESET);
416 else if (option == EEH_RESET_DEACTIVATE)
417 rc = opal_pci_reset(phb->opal_id,
418 OPAL_PCI_HOT_RESET,
419 OPAL_DEASSERT_RESET);
420 if (rc < 0)
421 goto out;
422
423 /* Poll state of the PHB until the request is done */
424 rc = ioda_eeh_phb_poll(phb);
425out:
426 if (rc != OPAL_SUCCESS)
427 return -EIO;
428
429 return 0;
430}
431
432static int ioda_eeh_bridge_reset(struct pci_controller *hose,
433 struct pci_dev *dev, int option)
434{
435 u16 ctrl;
436
437 pr_debug("%s: Reset device %04x:%02x:%02x.%01x with option %d\n",
438 __func__, hose->global_number, dev->bus->number,
439 PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), option);
440
441 switch (option) {
442 case EEH_RESET_FUNDAMENTAL:
443 case EEH_RESET_HOT:
444 pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
445 ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
446 pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
447 break;
448 case EEH_RESET_DEACTIVATE:
449 pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
450 ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
451 pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
452 break;
453 }
454
455 return 0;
456}
457
458/**
459 * ioda_eeh_reset - Reset the indicated PE
460 * @pe: EEH PE
461 * @option: reset option
462 *
463 * Do reset on the indicated PE. For PCI bus sensitive PE,
464 * we need to reset the parent p2p bridge. The PHB has to
465 * be reinitialized if the p2p bridge is root bridge. For
466 * PCI device sensitive PE, we will try to reset the device
467 * through FLR. For now, we don't have OPAL APIs to do HARD
468 * reset yet, so all reset would be SOFT (HOT) reset.
469 */
470static int ioda_eeh_reset(struct eeh_pe *pe, int option)
471{
472 struct pci_controller *hose = pe->phb;
473 struct eeh_dev *edev;
474 struct pci_dev *dev;
475 int ret;
476
477 /*
478 * Anyway, we have to clear the problematic state for the
479 * corresponding PE. However, we needn't do it if the PE
480 * is PHB associated. That means the PHB is having fatal
481 * errors and needs a reset. Furthermore, the AIB interface
482 * isn't reliable any more.
483 */
484 if (!(pe->type & EEH_PE_PHB) &&
485 (option == EEH_RESET_HOT ||
486 option == EEH_RESET_FUNDAMENTAL)) {
487 ret = ioda_eeh_pe_clear(pe);
488 if (ret)
489 return -EIO;
490 }
491
492 /*
493 * The rules applied to reset, either fundamental or hot reset:
494 *
495 * We always reset the direct upstream bridge of the PE. If the
496 * direct upstream bridge isn't root bridge, we always take hot
497 * reset no matter what option (fundamental or hot) is. Otherwise,
498 * we should do the reset according to the required option.
499 */
500 if (pe->type & EEH_PE_PHB) {
501 ret = ioda_eeh_phb_reset(hose, option);
502 } else {
503 if (pe->type & EEH_PE_DEVICE) {
504 /*
505 * If it's device PE, we didn't refer to the parent
506 * PCI bus yet. So we have to figure it out indirectly.
507 */
508 edev = list_first_entry(&pe->edevs,
509 struct eeh_dev, list);
510 dev = eeh_dev_to_pci_dev(edev);
511 dev = dev->bus->self;
512 } else {
513 /*
514 * If it's bus PE, the parent PCI bus is already there
515 * and just pick it up.
516 */
517 dev = pe->bus->self;
518 }
519
520 /*
521 * Do reset based on the fact that the direct upstream bridge
522 * is root bridge (port) or not.
523 */
524 if (dev->bus->number == 0)
525 ret = ioda_eeh_root_reset(hose, option);
526 else
527 ret = ioda_eeh_bridge_reset(hose, dev, option);
528 }
529
530 return ret;
531}
532
533/**
534 * ioda_eeh_get_log - Retrieve error log
535 * @pe: EEH PE
536 * @severity: Severity level of the log
537 * @drv_log: buffer to store the log
538 * @len: space of the log buffer
539 *
540 * The function is used to retrieve error log from P7IOC.
541 */
542static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
543 char *drv_log, unsigned long len)
544{
545 s64 ret;
546 unsigned long flags;
547 struct pci_controller *hose = pe->phb;
548 struct pnv_phb *phb = hose->private_data;
549
550 spin_lock_irqsave(&phb->lock, flags);
551
552 ret = opal_pci_get_phb_diag_data2(phb->opal_id,
553 phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
554 if (ret) {
555 spin_unlock_irqrestore(&phb->lock, flags);
556 pr_warning("%s: Failed to get log for PHB#%x-PE#%x\n",
557 __func__, hose->global_number, pe->addr);
558 return -EIO;
559 }
560
561 /*
562 * FIXME: We probably need to log the error somewhere.
563 * Let's sort that out in the future.
564 */
565 /* pr_info("%s", phb->diag.blob); */
566
567 spin_unlock_irqrestore(&phb->lock, flags);
568
569 return 0;
570}
571
572/**
573 * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE
574 * @pe: EEH PE
575 *
576 * A particular PE might include PCI bridges. In order
577 * to make the PE work properly, those PCI bridges should be configured
578 * correctly. However, we need to do nothing on P7IOC since the reset
579 * function already covers everything this function would need to do.
580 */
581static int ioda_eeh_configure_bridge(struct eeh_pe *pe)
582{
583 return 0;
584}
585
586static void ioda_eeh_hub_diag_common(struct OpalIoP7IOCErrorData *data)
587{
588 /* GEM */
589 pr_info(" GEM XFIR: %016llx\n", data->gemXfir);
590 pr_info(" GEM RFIR: %016llx\n", data->gemRfir);
591 pr_info(" GEM RIRQFIR: %016llx\n", data->gemRirqfir);
592 pr_info(" GEM Mask: %016llx\n", data->gemMask);
593 pr_info(" GEM RWOF: %016llx\n", data->gemRwof);
594
595 /* LEM */
596 pr_info(" LEM FIR: %016llx\n", data->lemFir);
597 pr_info(" LEM Error Mask: %016llx\n", data->lemErrMask);
598 pr_info(" LEM Action 0: %016llx\n", data->lemAction0);
599 pr_info(" LEM Action 1: %016llx\n", data->lemAction1);
600 pr_info(" LEM WOF: %016llx\n", data->lemWof);
601}
602
603static void ioda_eeh_hub_diag(struct pci_controller *hose)
604{
605 struct pnv_phb *phb = hose->private_data;
606 struct OpalIoP7IOCErrorData *data;
607 long rc;
608
609 data = (struct OpalIoP7IOCErrorData *)hub_diag; /* page allocated in ioda_eeh_post_init() */
610 rc = opal_pci_get_hub_diag_data(phb->hub_id, data, PAGE_SIZE);
611 if (rc != OPAL_SUCCESS) {
612 pr_warning("%s: Failed to get HUB#%llx diag-data (%ld)\n",
613 __func__, phb->hub_id, rc);
614 return;
615 }
616
617 switch (data->type) {
618 case OPAL_P7IOC_DIAG_TYPE_RGC:
619 pr_info("P7IOC diag-data for RGC\n\n");
620 ioda_eeh_hub_diag_common(data);
621 pr_info(" RGC Status: %016llx\n", data->rgc.rgcStatus);
622 pr_info(" RGC LDCP: %016llx\n", data->rgc.rgcLdcp);
623 break;
624 case OPAL_P7IOC_DIAG_TYPE_BI:
625 pr_info("P7IOC diag-data for BI %s\n\n",
626 data->bi.biDownbound ? "Downbound" : "Upbound");
627 ioda_eeh_hub_diag_common(data);
628 pr_info(" BI LDCP 0: %016llx\n", data->bi.biLdcp0);
629 pr_info(" BI LDCP 1: %016llx\n", data->bi.biLdcp1);
630 pr_info(" BI LDCP 2: %016llx\n", data->bi.biLdcp2);
631 pr_info(" BI Fence Status: %016llx\n", data->bi.biFenceStatus);
632 break;
633 case OPAL_P7IOC_DIAG_TYPE_CI:
634 pr_info("P7IOC diag-data for CI Port %d\\nn",
635 data->ci.ciPort);
636 ioda_eeh_hub_diag_common(data);
637 pr_info(" CI Port Status: %016llx\n", data->ci.ciPortStatus);
638 pr_info(" CI Port LDCP: %016llx\n", data->ci.ciPortLdcp);
639 break;
640 case OPAL_P7IOC_DIAG_TYPE_MISC:
641 pr_info("P7IOC diag-data for MISC\n\n");
642 ioda_eeh_hub_diag_common(data);
643 break;
644 case OPAL_P7IOC_DIAG_TYPE_I2C:
645 pr_info("P7IOC diag-data for I2C\n\n");
646 ioda_eeh_hub_diag_common(data);
647 break;
648 default:
649 pr_warning("%s: Invalid type of HUB#%llx diag-data (%d)\n",
650 __func__, phb->hub_id, data->type);
651 }
652}
653
654static void ioda_eeh_p7ioc_phb_diag(struct pci_controller *hose,
655 struct OpalIoPhbErrorCommon *common)
656{
657 struct OpalIoP7IOCPhbErrorData *data;
658 int i;
659
660 data = (struct OpalIoP7IOCPhbErrorData *)common;
661
662 pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n\n",
663 hose->global_number, common->version);
664
665 pr_info(" brdgCtl: %08x\n", data->brdgCtl);
666
667 pr_info(" portStatusReg: %08x\n", data->portStatusReg);
668 pr_info(" rootCmplxStatus: %08x\n", data->rootCmplxStatus);
669 pr_info(" busAgentStatus: %08x\n", data->busAgentStatus);
670
671 pr_info(" deviceStatus: %08x\n", data->deviceStatus);
672 pr_info(" slotStatus: %08x\n", data->slotStatus);
673 pr_info(" linkStatus: %08x\n", data->linkStatus);
674 pr_info(" devCmdStatus: %08x\n", data->devCmdStatus);
675 pr_info(" devSecStatus: %08x\n", data->devSecStatus);
676
677 pr_info(" rootErrorStatus: %08x\n", data->rootErrorStatus);
678 pr_info(" uncorrErrorStatus: %08x\n", data->uncorrErrorStatus);
679 pr_info(" corrErrorStatus: %08x\n", data->corrErrorStatus);
680 pr_info(" tlpHdr1: %08x\n", data->tlpHdr1);
681 pr_info(" tlpHdr2: %08x\n", data->tlpHdr2);
682 pr_info(" tlpHdr3: %08x\n", data->tlpHdr3);
683 pr_info(" tlpHdr4: %08x\n", data->tlpHdr4);
684 pr_info(" sourceId: %08x\n", data->sourceId);
685
686 pr_info(" errorClass: %016llx\n", data->errorClass);
687 pr_info(" correlator: %016llx\n", data->correlator);
688 pr_info(" p7iocPlssr: %016llx\n", data->p7iocPlssr);
689 pr_info(" p7iocCsr: %016llx\n", data->p7iocCsr);
690 pr_info(" lemFir: %016llx\n", data->lemFir);
691 pr_info(" lemErrorMask: %016llx\n", data->lemErrorMask);
692 pr_info(" lemWOF: %016llx\n", data->lemWOF);
693 pr_info(" phbErrorStatus: %016llx\n", data->phbErrorStatus);
694 pr_info(" phbFirstErrorStatus: %016llx\n", data->phbFirstErrorStatus);
695 pr_info(" phbErrorLog0: %016llx\n", data->phbErrorLog0);
696 pr_info(" phbErrorLog1: %016llx\n", data->phbErrorLog1);
697 pr_info(" mmioErrorStatus: %016llx\n", data->mmioErrorStatus);
698 pr_info(" mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus);
699 pr_info(" mmioErrorLog0: %016llx\n", data->mmioErrorLog0);
700 pr_info(" mmioErrorLog1: %016llx\n", data->mmioErrorLog1);
701 pr_info(" dma0ErrorStatus: %016llx\n", data->dma0ErrorStatus);
702 pr_info(" dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus);
703 pr_info(" dma0ErrorLog0: %016llx\n", data->dma0ErrorLog0);
704 pr_info(" dma0ErrorLog1: %016llx\n", data->dma0ErrorLog1);
705 pr_info(" dma1ErrorStatus: %016llx\n", data->dma1ErrorStatus);
706 pr_info(" dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus);
707 pr_info(" dma1ErrorLog0: %016llx\n", data->dma1ErrorLog0);
708 pr_info(" dma1ErrorLog1: %016llx\n", data->dma1ErrorLog1);
709
710 for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
711 if ((data->pestA[i] >> 63) == 0 &&
712 (data->pestB[i] >> 63) == 0)
713 continue;
714
715 pr_info(" PE[%3d] PESTA: %016llx\n", i, data->pestA[i]);
716 pr_info(" PESTB: %016llx\n", data->pestB[i]);
717 }
718}
719
720static void ioda_eeh_phb_diag(struct pci_controller *hose)
721{
722 struct pnv_phb *phb = hose->private_data;
723 struct OpalIoPhbErrorCommon *common;
724 long rc;
725
726 common = (struct OpalIoPhbErrorCommon *)phb->diag.blob;
727 rc = opal_pci_get_phb_diag_data2(phb->opal_id, common, PAGE_SIZE);
728 if (rc != OPAL_SUCCESS) {
729 pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
730 __func__, hose->global_number, rc);
731 return;
732 }
733
734 switch (common->ioType) {
735 case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
736 ioda_eeh_p7ioc_phb_diag(hose, common);
737 break;
738 default:
739 pr_warning("%s: Unrecognized I/O chip %d\n",
740 __func__, common->ioType);
741 }
742}
743
744static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
745 struct eeh_pe **pe)
746{
747 struct eeh_pe *phb_pe;
748
749 phb_pe = eeh_phb_pe_get(hose);
750 if (!phb_pe) {
751 pr_warning("%s Can't find PE for PHB#%d\n",
752 __func__, hose->global_number);
753 return -EEXIST;
754 }
755
756 *pe = phb_pe;
757 return 0;
758}
759
760static int ioda_eeh_get_pe(struct pci_controller *hose,
761 u16 pe_no, struct eeh_pe **pe)
762{
763 struct eeh_pe *phb_pe, *dev_pe;
764 struct eeh_dev dev;
765
766 /* Find the PHB PE */
767 if (ioda_eeh_get_phb_pe(hose, &phb_pe))
768 return -EEXIST;
769
770 /* Find the PE according to PE# */
771 memset(&dev, 0, sizeof(struct eeh_dev));
772 dev.phb = hose;
773 dev.pe_config_addr = pe_no;
774 dev_pe = eeh_pe_get(&dev);
775 if (!dev_pe) {
776 pr_warning("%s: Can't find PE for PHB#%x - PE#%x\n",
777 __func__, hose->global_number, pe_no);
778 return -EEXIST;
779 }
780
781 *pe = dev_pe;
782 return 0;
783}
784
785/**
786 * ioda_eeh_next_error - Retrieve next error for EEH core to handle
787 * @pe: The affected PE
788 *
789 * The function is expected to be called by the EEH core when it gets
790 * a special EEH event (without a bound PE). It calls the
791 * OPAL APIs for the next error to handle. Informational errors are
792 * handled internally by the platform. However, dead IOC, dead PHB,
793 * fenced PHB and frozen PE events should eventually be handled by the EEH core.
794 */
795static int ioda_eeh_next_error(struct eeh_pe **pe)
796{
797 struct pci_controller *hose, *tmp;
798 struct pnv_phb *phb;
799 u64 frozen_pe_no;
800 u16 err_type, severity;
801 long rc;
802 int ret = 1;
803
804 /*
805 * While running here, it's safe to purge the event queue.
806 * We should also keep the cached OPAL notifier events synchronized
807 * between the kernel and the firmware.
808 */
809 eeh_remove_event(NULL);
810 opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
811
812 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
813 /*
814 * If the subordinate PCI buses of the PHB has been
815 * removed, we needn't take care of it any more.
816 */
817 phb = hose->private_data;
818 if (phb->eeh_state & PNV_EEH_STATE_REMOVED)
819 continue;
820
821 rc = opal_pci_next_error(phb->opal_id,
822 &frozen_pe_no, &err_type, &severity);
823
824 /* If OPAL API returns error, we needn't proceed */
825 if (rc != OPAL_SUCCESS) {
826 IODA_EEH_DBG("%s: Invalid return value on "
827 "PHB#%x (0x%lx) from opal_pci_next_error",
828 __func__, hose->global_number, rc);
829 continue;
830 }
831
832 /* If the PHB doesn't have error, stop processing */
833 if (err_type == OPAL_EEH_NO_ERROR ||
834 severity == OPAL_EEH_SEV_NO_ERROR) {
835 IODA_EEH_DBG("%s: No error found on PHB#%x\n",
836 __func__, hose->global_number);
837 continue;
838 }
839
840 /*
841 * Process the error. When multiple errors are pending
842 * on a specific PHB, we expect the one with the highest
843 * priority to be reported first.
844 */
845 IODA_EEH_DBG("%s: Error (%d, %d, %llu) on PHB#%x\n", __func__,
846 err_type, severity, frozen_pe_no, hose->global_number);
847 switch (err_type) {
848 case OPAL_EEH_IOC_ERROR:
849 if (severity == OPAL_EEH_SEV_IOC_DEAD) {
850 list_for_each_entry_safe(hose, tmp,
851 &hose_list, list_node) {
852 phb = hose->private_data;
853 phb->eeh_state |= PNV_EEH_STATE_REMOVED;
854 }
855
856 pr_err("EEH: dead IOC detected\n");
857 ret = 4;
858 goto out;
859 } else if (severity == OPAL_EEH_SEV_INF) {
860 pr_info("EEH: IOC informative error "
861 "detected\n");
862 ioda_eeh_hub_diag(hose);
863 }
864
865 break;
866 case OPAL_EEH_PHB_ERROR:
867 if (severity == OPAL_EEH_SEV_PHB_DEAD) {
868 if (ioda_eeh_get_phb_pe(hose, pe))
869 break;
870
871 pr_err("EEH: dead PHB#%x detected\n",
872 hose->global_number);
873 phb->eeh_state |= PNV_EEH_STATE_REMOVED;
874 ret = 3;
875 goto out;
876 } else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
877 if (ioda_eeh_get_phb_pe(hose, pe))
878 break;
879
880 pr_err("EEH: fenced PHB#%x detected\n",
881 hose->global_number);
882 ret = 2;
883 goto out;
884 } else if (severity == OPAL_EEH_SEV_INF) {
885 pr_info("EEH: PHB#%x informative error "
886 "detected\n",
887 hose->global_number);
888 ioda_eeh_phb_diag(hose);
889 }
890
891 break;
892 case OPAL_EEH_PE_ERROR:
893 if (ioda_eeh_get_pe(hose, frozen_pe_no, pe))
894 break;
895
896 pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
897 (*pe)->addr, (*pe)->phb->global_number);
898 ret = 1;
899 goto out;
900 }
901 }
902
903 ret = 0;
904out:
905 return ret;
906}
907
908struct pnv_eeh_ops ioda_eeh_ops = {
909 .post_init = ioda_eeh_post_init,
910 .set_option = ioda_eeh_set_option,
911 .get_state = ioda_eeh_get_state,
912 .reset = ioda_eeh_reset,
913 .get_log = ioda_eeh_get_log,
914 .configure_bridge = ioda_eeh_configure_bridge,
915 .next_error = ioda_eeh_next_error
916};
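For reference, the return values assigned in ioda_eeh_next_error() above encode the event class for the EEH core: 4 for a dead IOC, 3 for a dead PHB, 2 for a fenced PHB, 1 for a frozen PE, and 0 when nothing is pending. A hedged sketch of a caller dispatching on them; the demo_* helpers are illustrative, not real kernel symbols:

static void demo_handle_special_event(void)
{
	struct eeh_pe *pe = NULL;

	switch (ioda_eeh_next_error(&pe)) {
	case 4:		/* dead IOC: every PHB behind it was marked removed */
	case 3:		/* dead PHB */
		demo_remove_pci_domain(pe);	/* illustrative */
		break;
	case 2:		/* fenced PHB: needs a complete PHB reset */
	case 1:		/* frozen PE: normal EEH recovery */
		demo_recover_pe(pe);		/* illustrative */
		break;
	default:
		break;				/* 0: nothing pending */
	}
}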
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
new file mode 100644
index 000000000000..969cce73055a
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -0,0 +1,379 @@
1/*
2 * This file implements the platform-dependent EEH operations for the
3 * powernv platform. The powernv platform was created to provide
4 * full hypervisor support.
5 *
6 * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 */
13
14#include <linux/atomic.h>
15#include <linux/delay.h>
16#include <linux/export.h>
17#include <linux/init.h>
18#include <linux/list.h>
19#include <linux/msi.h>
20#include <linux/of.h>
21#include <linux/pci.h>
22#include <linux/proc_fs.h>
23#include <linux/rbtree.h>
24#include <linux/sched.h>
25#include <linux/seq_file.h>
26#include <linux/spinlock.h>
27
28#include <asm/eeh.h>
29#include <asm/eeh_event.h>
30#include <asm/firmware.h>
31#include <asm/io.h>
32#include <asm/iommu.h>
33#include <asm/machdep.h>
34#include <asm/msi_bitmap.h>
35#include <asm/opal.h>
36#include <asm/ppc-pci.h>
37
38#include "powernv.h"
39#include "pci.h"
40
41/**
42 * powernv_eeh_init - EEH platform dependent initialization
43 *
44 * EEH platform dependent initialization on powernv
45 */
46static int powernv_eeh_init(void)
47{
48 /* We require OPALv3 */
49 if (!firmware_has_feature(FW_FEATURE_OPALv3)) {
50 pr_warning("%s: OPALv3 is required !\n", __func__);
51 return -EINVAL;
52 }
53
54 /* Set EEH probe mode */
55 eeh_probe_mode_set(EEH_PROBE_MODE_DEV);
56
57 return 0;
58}
59
60/**
61 * powernv_eeh_post_init - EEH platform dependent post initialization
62 *
63 * EEH platform dependent post initialization on powernv. When
64 * the function is called, the EEH PEs and devices should have
65 * been built. If the I/O cache has been built, EEH is
66 * ready to supply service.
67 */
68static int powernv_eeh_post_init(void)
69{
70 struct pci_controller *hose;
71 struct pnv_phb *phb;
72 int ret = 0;
73
74 list_for_each_entry(hose, &hose_list, list_node) {
75 phb = hose->private_data;
76
77 if (phb->eeh_ops && phb->eeh_ops->post_init) {
78 ret = phb->eeh_ops->post_init(hose);
79 if (ret)
80 break;
81 }
82 }
83
84 return ret;
85}
86
87/**
88 * powernv_eeh_dev_probe - Do probe on PCI device
89 * @dev: PCI device
90 * @flag: unused
91 *
92 * When the EEH module is installed during system boot, all PCI devices
93 * are checked one by one to see if they support EEH. This function
94 * was introduced for that purpose. By default, EEH is enabled
95 * on all PCI devices, so we only need to do the necessary
96 * initialization on the corresponding eeh device and create a PE
97 * accordingly.
98 *
99 * Note that it's unsafe to retrieve the EEH device through
100 * the corresponding PCI device: during PCI device hotplug, which
101 * is possibly triggered by the EEH core, the binding between the EEH
102 * device and the PCI device hasn't been built yet.
103 */
104static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag)
105{
106 struct pci_controller *hose = pci_bus_to_host(dev->bus);
107 struct pnv_phb *phb = hose->private_data;
108 struct device_node *dn = pci_device_to_OF_node(dev);
109 struct eeh_dev *edev = of_node_to_eeh_dev(dn);
110
111 /*
112 * The root bridge doesn't have any subordinate PCI
113 * devices, and we don't have an OF node for it, so
114 * it's not reasonable to continue probing once we
115 * reach it.
116 */
117 if (!dn || !edev)
118 return 0;
119
120 /* Skip for PCI-ISA bridge */
121 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
122 return 0;
123
124 /* Initialize eeh device */
125 edev->class_code = dev->class;
126 edev->mode = 0;
127 edev->config_addr = ((dev->bus->number << 8) | dev->devfn);
128 edev->pe_config_addr = phb->bdfn_to_pe(phb, dev->bus, dev->devfn & 0xff);
129
130 /* Create PE */
131 eeh_add_to_parent_pe(edev);
132
133 /*
134 * Enable EEH explicitly so that we will do EEH check
135 * while accessing I/O stuff
136 *
137 * FIXME: Enable that for PHB3 later
138 */
139 if (phb->type == PNV_PHB_IODA1)
140 eeh_subsystem_enabled = 1;
141
142 /* Save memory bars */
143 eeh_save_bars(edev);
144
145 return 0;
146}
147
148/**
149 * powernv_eeh_set_option - Initialize EEH or MMIO/DMA reenable
150 * @pe: EEH PE
151 * @option: operation to be issued
152 *
153 * The function is used to control the EEH functionality globally.
154 * Currently, the following options are supported according to PAPR:
155 * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
156 */
157static int powernv_eeh_set_option(struct eeh_pe *pe, int option)
158{
159 struct pci_controller *hose = pe->phb;
160 struct pnv_phb *phb = hose->private_data;
161 int ret = -EEXIST;
162
163 /*
164 * What we need do is pass it down for hardware
165 * implementation to handle it.
166 */
167 if (phb->eeh_ops && phb->eeh_ops->set_option)
168 ret = phb->eeh_ops->set_option(pe, option);
169
170 return ret;
171}
172
173/**
174 * powernv_eeh_get_pe_addr - Retrieve PE address
175 * @pe: EEH PE
176 *
177 * Retrieve the PE address according to the given traditional
178 * PCI BDF (Bus/Device/Function) address.
179 */
180static int powernv_eeh_get_pe_addr(struct eeh_pe *pe)
181{
182 return pe->addr;
183}
184
185/**
186 * powernv_eeh_get_state - Retrieve PE state
187 * @pe: EEH PE
188 * @delay: delay while PE state is temporarily unavailable
189 *
190 * Retrieve the state of the specified PE. For an IODA-compatible
191 * platform, it should be retrieved from the IODA table. Therefore,
192 * we prefer passing down to hardware implementation to handle
193 * it.
194 */
195static int powernv_eeh_get_state(struct eeh_pe *pe, int *delay)
196{
197 struct pci_controller *hose = pe->phb;
198 struct pnv_phb *phb = hose->private_data;
199 int ret = EEH_STATE_NOT_SUPPORT;
200
201 if (phb->eeh_ops && phb->eeh_ops->get_state) {
202 ret = phb->eeh_ops->get_state(pe);
203
204 /*
205 * If the PE state is temporarily unavailable,
206 * inform the EEH core to delay for the
207 * default period (1 second)
208 */
209 if (delay) {
210 *delay = 0;
211 if (ret & EEH_STATE_UNAVAILABLE)
212 *delay = 1000;
213 }
214 }
215
216 return ret;
217}
218
219/**
220 * powernv_eeh_reset - Reset the specified PE
221 * @pe: EEH PE
222 * @option: reset option
223 *
224 * Reset the specified PE
225 */
226static int powernv_eeh_reset(struct eeh_pe *pe, int option)
227{
228 struct pci_controller *hose = pe->phb;
229 struct pnv_phb *phb = hose->private_data;
230 int ret = -EEXIST;
231
232 if (phb->eeh_ops && phb->eeh_ops->reset)
233 ret = phb->eeh_ops->reset(pe, option);
234
235 return ret;
236}
237
238/**
239 * powernv_eeh_wait_state - Wait for PE state
240 * @pe: EEH PE
241 * @max_wait: maximal wait period in milliseconds
242 *
243 * Wait for the state of associated PE. It might take some time
244 * to retrieve the PE's state.
245 */
246static int powernv_eeh_wait_state(struct eeh_pe *pe, int max_wait)
247{
248 int ret;
249 int mwait;
250
251 while (1) {
252 ret = powernv_eeh_get_state(pe, &mwait);
253
254 /*
255 * If the PE's state is temporarily unavailable,
256 * we have to wait for the specified time. Otherwise,
257 * the PE's state will be returned immediately.
258 */
259 if (ret != EEH_STATE_UNAVAILABLE)
260 return ret;
261
262 max_wait -= mwait;
263 if (max_wait <= 0) {
264 pr_warning("%s: Timeout getting PE#%x's state (%d)\n",
265 __func__, pe->addr, max_wait);
266 return EEH_STATE_NOT_SUPPORT;
267 }
268
269 msleep(mwait);
270 }
271
272 return EEH_STATE_NOT_SUPPORT;
273}
274
275/**
276 * powernv_eeh_get_log - Retrieve error log
277 * @pe: EEH PE
278 * @severity: temporary or permanent error log
279 * @drv_log: driver log to be combined with retrieved error log
280 * @len: length of driver log
281 *
282 * Retrieve the temporary or permanent error from the PE.
283 */
284static int powernv_eeh_get_log(struct eeh_pe *pe, int severity,
285 char *drv_log, unsigned long len)
286{
287 struct pci_controller *hose = pe->phb;
288 struct pnv_phb *phb = hose->private_data;
289 int ret = -EEXIST;
290
291 if (phb->eeh_ops && phb->eeh_ops->get_log)
292 ret = phb->eeh_ops->get_log(pe, severity, drv_log, len);
293
294 return ret;
295}
296
297/**
298 * powernv_eeh_configure_bridge - Configure PCI bridges in the indicated PE
299 * @pe: EEH PE
300 *
301 * The function will be called to reconfigure the bridges included
302 * in the specified PE so that the malfunctioning PE can be recovered
303 * again.
304 */
305static int powernv_eeh_configure_bridge(struct eeh_pe *pe)
306{
307 struct pci_controller *hose = pe->phb;
308 struct pnv_phb *phb = hose->private_data;
309 int ret = 0;
310
311 if (phb->eeh_ops && phb->eeh_ops->configure_bridge)
312 ret = phb->eeh_ops->configure_bridge(pe);
313
314 return ret;
315}
316
317/**
318 * powernv_eeh_next_error - Retrieve next EEH error to handle
319 * @pe: Affected PE
320 *
321 * Using OPAL API, to retrieve next EEH error for EEH core to handle
322 */
323static int powernv_eeh_next_error(struct eeh_pe **pe)
324{
325 struct pci_controller *hose;
326 struct pnv_phb *phb = NULL;
327
328 list_for_each_entry(hose, &hose_list, list_node) {
329 phb = hose->private_data;
330 break;
331 }
332
333 if (phb && phb->eeh_ops->next_error)
334 return phb->eeh_ops->next_error(pe);
335
336 return -EEXIST;
337}
338
339static struct eeh_ops powernv_eeh_ops = {
340 .name = "powernv",
341 .init = powernv_eeh_init,
342 .post_init = powernv_eeh_post_init,
343 .of_probe = NULL,
344 .dev_probe = powernv_eeh_dev_probe,
345 .set_option = powernv_eeh_set_option,
346 .get_pe_addr = powernv_eeh_get_pe_addr,
347 .get_state = powernv_eeh_get_state,
348 .reset = powernv_eeh_reset,
349 .wait_state = powernv_eeh_wait_state,
350 .get_log = powernv_eeh_get_log,
351 .configure_bridge = powernv_eeh_configure_bridge,
352 .read_config = pnv_pci_cfg_read,
353 .write_config = pnv_pci_cfg_write,
354 .next_error = powernv_eeh_next_error
355};
356
357/**
358 * eeh_powernv_init - Register platform dependent EEH operations
359 *
360 * EEH initialization on powernv platform. This function should be
361 * called before any EEH related functions.
362 */
363static int __init eeh_powernv_init(void)
364{
365 int ret = -EINVAL;
366
367 if (!machine_is(powernv))
368 return ret;
369
370 ret = eeh_ops_register(&powernv_eeh_ops);
371 if (!ret)
372 pr_info("EEH: PowerNV platform initialized\n");
373 else
374 pr_info("EEH: Failed to initialize PowerNV platform (%d)\n", ret);
375
376 return ret;
377}
378
379early_initcall(eeh_powernv_init);
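As a usage note, powernv_eeh_wait_state() above takes max_wait in milliseconds, matching the msleep() in its loop. A minimal sketch, assuming a caller that gives a PE five seconds to leave the temporarily-unavailable state; demo_wait_for_pe() is illustrative:

static int demo_wait_for_pe(struct eeh_pe *pe)
{
	/* Poll for up to 5000ms while the PE reports EEH_STATE_UNAVAILABLE */
	int state = powernv_eeh_wait_state(pe, 5000);

	if (state & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE))
		return 0;	/* PE is usable again; recovery can proceed */

	return -EIO;
}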
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 6fabe92eafb6..e88863ffb135 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -107,4 +107,7 @@ OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR);
 OPAL_CALL(opal_set_slot_led_status,		OPAL_SET_SLOT_LED_STATUS);
 OPAL_CALL(opal_get_epow_status,			OPAL_GET_EPOW_STATUS);
 OPAL_CALL(opal_set_system_attention_led,	OPAL_SET_SYSTEM_ATTENTION_LED);
+OPAL_CALL(opal_pci_next_error,			OPAL_PCI_NEXT_ERROR);
+OPAL_CALL(opal_pci_poll,			OPAL_PCI_POLL);
 OPAL_CALL(opal_pci_msi_eoi,			OPAL_PCI_MSI_EOI);
+OPAL_CALL(opal_pci_get_phb_diag_data2,		OPAL_PCI_GET_PHB_DIAG_DATA2);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 628c564ceadb..106301fd2fa5 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <linux/interrupt.h>
+#include <linux/notifier.h>
 #include <linux/slab.h>
 #include <asm/opal.h>
 #include <asm/firmware.h>
@@ -31,6 +32,10 @@ static DEFINE_SPINLOCK(opal_write_lock);
 extern u64 opal_mc_secondary_handler[];
 static unsigned int *opal_irqs;
 static unsigned int opal_irq_count;
+static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
+static DEFINE_SPINLOCK(opal_notifier_lock);
+static uint64_t last_notified_mask = 0x0ul;
+static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
 
 int __init early_init_dt_scan_opal(unsigned long node,
 			const char *uname, int depth, void *data)
@@ -95,6 +100,68 @@ static int __init opal_register_exception_handlers(void)
 
 early_initcall(opal_register_exception_handlers);
 
+int opal_notifier_register(struct notifier_block *nb)
+{
+	if (!nb) {
+		pr_warning("%s: Invalid argument (%p)\n",
+			   __func__, nb);
+		return -EINVAL;
+	}
+
+	atomic_notifier_chain_register(&opal_notifier_head, nb);
+	return 0;
+}
+
+static void opal_do_notifier(uint64_t events)
+{
+	unsigned long flags;
+	uint64_t changed_mask;
+
+	if (atomic_read(&opal_notifier_hold))
+		return;
+
+	spin_lock_irqsave(&opal_notifier_lock, flags);
+	changed_mask = last_notified_mask ^ events;
+	last_notified_mask = events;
+	spin_unlock_irqrestore(&opal_notifier_lock, flags);
+
+	/*
+	 * Feed the callback with both the event bits and the
+	 * changed bits so it has enough information.
+	 */
+	atomic_notifier_call_chain(&opal_notifier_head,
+				   events, (void *)changed_mask);
+}
+
+void opal_notifier_update_evt(uint64_t evt_mask,
+			      uint64_t evt_val)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&opal_notifier_lock, flags);
+	last_notified_mask &= ~evt_mask;
+	last_notified_mask |= evt_val;
+	spin_unlock_irqrestore(&opal_notifier_lock, flags);
+}
+
+void opal_notifier_enable(void)
+{
+	int64_t rc;
+	uint64_t evt = 0;
+
+	atomic_set(&opal_notifier_hold, 0);
+
+	/* Process pending events */
+	rc = opal_poll_events(&evt);
+	if (rc == OPAL_SUCCESS && evt)
+		opal_do_notifier(evt);
+}
+
+void opal_notifier_disable(void)
+{
+	atomic_set(&opal_notifier_hold, 1);
+}
+
 int opal_get_chars(uint32_t vtermno, char *buf, int count)
 {
 	s64 len, rc;
@@ -297,7 +364,7 @@ static irqreturn_t opal_interrupt(int irq, void *data)
 
 	opal_handle_interrupt(virq_to_hw(irq), &events);
 
-	/* XXX TODO: Do something with the events */
+	opal_do_notifier(events);
 
 	return IRQ_HANDLED;
 }
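The notifier plumbing added above is consumed the way eeh-ioda.c registers ioda_eeh_nb. A minimal sketch of another client, assuming only the interfaces introduced by this patch; the demo_* names are illustrative:

static int demo_opal_event(struct notifier_block *nb,
			   unsigned long events, void *change)
{
	uint64_t changed = (uint64_t)change;

	/* Act only when the PCI error bit is set and actually toggled */
	if ((events & OPAL_EVENT_PCI_ERROR) &&
	    (changed & OPAL_EVENT_PCI_ERROR))
		pr_info("demo: OPAL PCI error event\n");

	return 0;
}

static struct notifier_block demo_opal_nb = {
	.notifier_call	= demo_opal_event,
};

/* e.g. from an initcall: opal_notifier_register(&demo_opal_nb); */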
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9c9d15e4cdf2..49b57b9f835d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -13,6 +13,7 @@
 
 #include <linux/kernel.h>
 #include <linux/pci.h>
+#include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/string.h>
 #include <linux/init.h>
@@ -32,6 +33,7 @@
 #include <asm/iommu.h>
 #include <asm/tce.h>
 #include <asm/xics.h>
+#include <asm/debug.h>
 
 #include "powernv.h"
 #include "pci.h"
@@ -441,6 +443,17 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 	set_iommu_table_base(&pdev->dev, &pe->tce32_table);
 }
 
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		set_iommu_table_base(&dev->dev, &pe->tce32_table);
+		if (dev->subordinate)
+			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+	}
+}
+
 static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
 					 u64 *startp, u64 *endp)
 {
@@ -595,6 +608,12 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 			TCE_PCI_SWINV_PAIR;
 	}
 	iommu_init_table(tbl, phb->hose->node);
+	iommu_register_group(tbl, pci_domain_nr(pe->pbus), pe->pe_number);
+
+	if (pe->pdev)
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+	else
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 
 	return;
  fail:
@@ -667,6 +686,11 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	}
 	iommu_init_table(tbl, phb->hose->node);
 
+	if (pe->pdev)
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+	else
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
+
 	return;
 fail:
 	if (pe->tce32_seg >= 0)
@@ -968,11 +992,38 @@ static void pnv_pci_ioda_setup_DMA(void)
 	}
 }
 
+static void pnv_pci_ioda_create_dbgfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+	struct pci_controller *hose, *tmp;
+	struct pnv_phb *phb;
+	char name[16];
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		sprintf(name, "PCI%04x", hose->global_number);
+		phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
+		if (!phb->dbgfs)
+			pr_warning("%s: Error on creating debugfs on PHB#%x\n",
+				   __func__, hose->global_number);
+	}
+#endif /* CONFIG_DEBUG_FS */
+}
+
 static void pnv_pci_ioda_fixup(void)
 {
 	pnv_pci_ioda_setup_PEs();
 	pnv_pci_ioda_setup_seg();
 	pnv_pci_ioda_setup_DMA();
+
+	pnv_pci_ioda_create_dbgfs();
+
+#ifdef CONFIG_EEH
+	eeh_probe_mode_set(EEH_PROBE_MODE_DEV);
+	eeh_addr_cache_build();
+	eeh_init();
+#endif
 }
 
 /*
978/* 1029/*
@@ -1049,7 +1100,8 @@ static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
 			   OPAL_ASSERT_RESET);
 }
 
-void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
+void __init pnv_pci_init_ioda_phb(struct device_node *np,
+				  u64 hub_id, int ioda_type)
 {
 	struct pci_controller *hose;
 	static int primary = 1;
@@ -1087,6 +1139,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 	hose->first_busno = 0;
 	hose->last_busno = 0xff;
 	hose->private_data = phb;
+	phb->hub_id = hub_id;
 	phb->opal_id = phb_id;
 	phb->type = ioda_type;
 
@@ -1172,6 +1225,9 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 		phb->ioda.io_size, phb->ioda.io_segsize);
 
 	phb->hose->ops = &pnv_pci_ops;
+#ifdef CONFIG_EEH
+	phb->eeh_ops = &ioda_eeh_ops;
+#endif
 
 	/* Setup RID -> PE mapping function */
 	phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
@@ -1212,7 +1268,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 
 void pnv_pci_init_ioda2_phb(struct device_node *np)
 {
-	pnv_pci_init_ioda_phb(np, PNV_PHB_IODA2);
+	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
 }
 
 void __init pnv_pci_init_ioda_hub(struct device_node *np)
@@ -1235,6 +1291,6 @@ void __init pnv_pci_init_ioda_hub(struct device_node *np)
 	for_each_child_of_node(np, phbn) {
 		/* Look for IODA1 PHBs */
 		if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
-			pnv_pci_init_ioda_phb(phbn, PNV_PHB_IODA1);
+			pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
 	}
 }
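With the per-PHB debugfs directories created above (named PCI%04x under powerpc_debugfs_root) and the err_injct attribute registered by eeh-ioda.c for IODA1 PHBs, error injection can be driven from userspace. Assuming debugfs is mounted at /sys/kernel/debug and the target is PHB#0, something like:
  echo 0x0 > /sys/kernel/debug/powerpc/PCI0000/err_injct
writes the value straight to the PHB's 0xD10 error-injection register; the value's meaning is chip-specific and the path shown here is illustrative.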
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 92b37a0186c9..b68db6325c1b 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -86,13 +86,16 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
 static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 					 struct pci_dev *pdev)
 {
-	if (phb->p5ioc2.iommu_table.it_map == NULL)
+	if (phb->p5ioc2.iommu_table.it_map == NULL) {
 		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
+		iommu_register_group(&phb->p5ioc2.iommu_table,
+				pci_domain_nr(phb->hose->bus), phb->opal_id);
+	}
 
 	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
 }
 
-static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np,
+static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 					   void *tce_mem, u64 tce_size)
 {
 	struct pnv_phb *phb;
@@ -133,6 +136,7 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np,
 	phb->hose->first_busno = 0;
 	phb->hose->last_busno = 0xff;
 	phb->hose->private_data = phb;
+	phb->hub_id = hub_id;
 	phb->opal_id = phb_id;
 	phb->type = PNV_PHB_P5IOC2;
 	phb->model = PNV_PHB_MODEL_P5IOC2;
@@ -226,7 +230,8 @@ void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
 	for_each_child_of_node(np, phbn) {
 		if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") ||
 		    of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) {
-			pnv_pci_init_p5ioc2_phb(phbn, tce_mem, tce_per_phb);
+			pnv_pci_init_p5ioc2_phb(phbn, hub_id,
+					tce_mem, tce_per_phb);
 			tce_mem += tce_per_phb;
 		}
 	}
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 277343cc6a3d..a28d3b5e6393 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -32,6 +33,8 @@
 #include <asm/iommu.h>
 #include <asm/tce.h>
 #include <asm/firmware.h>
+#include <asm/eeh_event.h>
+#include <asm/eeh.h>
 
 #include "powernv.h"
 #include "pci.h"
@@ -202,7 +205,8 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
202 205
203 spin_lock_irqsave(&phb->lock, flags); 206 spin_lock_irqsave(&phb->lock, flags);
204 207
205 rc = opal_pci_get_phb_diag_data(phb->opal_id, phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE); 208 rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
209 PNV_PCI_DIAG_BUF_SIZE);
206 has_diag = (rc == OPAL_SUCCESS); 210 has_diag = (rc == OPAL_SUCCESS);
207 211
208 rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, 212 rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
@@ -227,43 +231,50 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
227 spin_unlock_irqrestore(&phb->lock, flags); 231 spin_unlock_irqrestore(&phb->lock, flags);
228} 232}
229 233
230static void pnv_pci_config_check_eeh(struct pnv_phb *phb, struct pci_bus *bus, 234static void pnv_pci_config_check_eeh(struct pnv_phb *phb,
231 u32 bdfn) 235 struct device_node *dn)
232{ 236{
233 s64 rc; 237 s64 rc;
234 u8 fstate; 238 u8 fstate;
235 u16 pcierr; 239 u16 pcierr;
236 u32 pe_no; 240 u32 pe_no;
237 241
238 /* Get PE# if we support IODA */ 242 /*
239 pe_no = phb->bdfn_to_pe ? phb->bdfn_to_pe(phb, bus, bdfn & 0xff) : 0; 243 * Get the PE#. During the PCI probe stage, we might not
 244 * have set that up yet, so all ER errors should be mapped
 245 * to PE#0.
246 */
247 pe_no = PCI_DN(dn)->pe_number;
248 if (pe_no == IODA_INVALID_PE)
249 pe_no = 0;
240 250
241 /* Read freeze status */ 251 /* Read freeze status */
242 rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, &fstate, &pcierr, 252 rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, &fstate, &pcierr,
243 NULL); 253 NULL);
244 if (rc) { 254 if (rc) {
245 pr_warning("PCI %d: Failed to read EEH status for PE#%d," 255 pr_warning("%s: Can't read EEH status (PE#%d) for "
246 " err %lld\n", phb->hose->global_number, pe_no, rc); 256 "%s, err %lld\n",
257 __func__, pe_no, dn->full_name, rc);
247 return; 258 return;
248 } 259 }
249 cfg_dbg(" -> EEH check, bdfn=%04x PE%d fstate=%x\n", 260 cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
250 bdfn, pe_no, fstate); 261 (PCI_DN(dn)->busno << 8) | (PCI_DN(dn)->devfn),
262 pe_no, fstate);
251 if (fstate != 0) 263 if (fstate != 0)
252 pnv_pci_handle_eeh_config(phb, pe_no); 264 pnv_pci_handle_eeh_config(phb, pe_no);
253} 265}
254 266
255static int pnv_pci_read_config(struct pci_bus *bus, 267int pnv_pci_cfg_read(struct device_node *dn,
256 unsigned int devfn, 268 int where, int size, u32 *val)
257 int where, int size, u32 *val)
258{ 269{
259 struct pci_controller *hose = pci_bus_to_host(bus); 270 struct pci_dn *pdn = PCI_DN(dn);
260 struct pnv_phb *phb = hose->private_data; 271 struct pnv_phb *phb = pdn->phb->private_data;
261 u32 bdfn = (((uint64_t)bus->number) << 8) | devfn; 272 u32 bdfn = (pdn->busno << 8) | pdn->devfn;
273#ifdef CONFIG_EEH
274 struct eeh_pe *phb_pe = NULL;
275#endif
262 s64 rc; 276 s64 rc;
263 277
264 if (hose == NULL)
265 return PCIBIOS_DEVICE_NOT_FOUND;
266
267 switch (size) { 278 switch (size) {
268 case 1: { 279 case 1: {
269 u8 v8; 280 u8 v8;
@@ -287,28 +298,43 @@ static int pnv_pci_read_config(struct pci_bus *bus,
287 default: 298 default:
288 return PCIBIOS_FUNC_NOT_SUPPORTED; 299 return PCIBIOS_FUNC_NOT_SUPPORTED;
289 } 300 }
290 cfg_dbg("pnv_pci_read_config bus: %x devfn: %x +%x/%x -> %08x\n", 301 cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
291 bus->number, devfn, where, size, *val); 302 __func__, pdn->busno, pdn->devfn, where, size, *val);
292 303
293 /* Check if the PHB got frozen due to an error (no response) */ 304 /*
294 pnv_pci_config_check_eeh(phb, bus, bdfn); 305 * Check if the specified PE has been put into frozen
 306 * state. There is no need to do that when the PHB
 307 * itself has been put into frozen state because of
 308 * PHB-fatal errors.
309 */
310#ifdef CONFIG_EEH
311 phb_pe = eeh_phb_pe_get(pdn->phb);
312 if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED))
313 return PCIBIOS_SUCCESSFUL;
314
315 if (phb->eeh_state & PNV_EEH_STATE_ENABLED) {
316 if (*val == EEH_IO_ERROR_VALUE(size) &&
317 eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
318 return PCIBIOS_DEVICE_NOT_FOUND;
319 } else {
320 pnv_pci_config_check_eeh(phb, dn);
321 }
322#else
323 pnv_pci_config_check_eeh(phb, dn);
324#endif
295 325
296 return PCIBIOS_SUCCESSFUL; 326 return PCIBIOS_SUCCESSFUL;
297} 327}
298 328
299static int pnv_pci_write_config(struct pci_bus *bus, 329int pnv_pci_cfg_write(struct device_node *dn,
300 unsigned int devfn, 330 int where, int size, u32 val)
301 int where, int size, u32 val)
302{ 331{
303 struct pci_controller *hose = pci_bus_to_host(bus); 332 struct pci_dn *pdn = PCI_DN(dn);
304 struct pnv_phb *phb = hose->private_data; 333 struct pnv_phb *phb = pdn->phb->private_data;
305 u32 bdfn = (((uint64_t)bus->number) << 8) | devfn; 334 u32 bdfn = (pdn->busno << 8) | pdn->devfn;
306
307 if (hose == NULL)
308 return PCIBIOS_DEVICE_NOT_FOUND;
309 335
310 cfg_dbg("pnv_pci_write_config bus: %x devfn: %x +%x/%x -> %08x\n", 336 cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
 311 bus->number, devfn, where, size, val); 337 __func__, pdn->busno, pdn->devfn, where, size, val);
312 switch (size) { 338 switch (size) {
313 case 1: 339 case 1:
314 opal_pci_config_write_byte(phb->opal_id, bdfn, where, val); 340 opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
@@ -322,14 +348,54 @@ static int pnv_pci_write_config(struct pci_bus *bus,
322 default: 348 default:
323 return PCIBIOS_FUNC_NOT_SUPPORTED; 349 return PCIBIOS_FUNC_NOT_SUPPORTED;
324 } 350 }
351
325 /* Check if the PHB got frozen due to an error (no response) */ 352 /* Check if the PHB got frozen due to an error (no response) */
326 pnv_pci_config_check_eeh(phb, bus, bdfn); 353#ifdef CONFIG_EEH
354 if (!(phb->eeh_state & PNV_EEH_STATE_ENABLED))
355 pnv_pci_config_check_eeh(phb, dn);
356#else
357 pnv_pci_config_check_eeh(phb, dn);
358#endif
327 359
328 return PCIBIOS_SUCCESSFUL; 360 return PCIBIOS_SUCCESSFUL;
329} 361}
330 362
363static int pnv_pci_read_config(struct pci_bus *bus,
364 unsigned int devfn,
365 int where, int size, u32 *val)
366{
367 struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
368 struct pci_dn *pdn;
369
370 for (dn = busdn->child; dn; dn = dn->sibling) {
371 pdn = PCI_DN(dn);
372 if (pdn && pdn->devfn == devfn)
373 return pnv_pci_cfg_read(dn, where, size, val);
374 }
375
376 *val = 0xFFFFFFFF;
377 return PCIBIOS_DEVICE_NOT_FOUND;
378
379}
380
381static int pnv_pci_write_config(struct pci_bus *bus,
382 unsigned int devfn,
383 int where, int size, u32 val)
384{
385 struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
386 struct pci_dn *pdn;
387
388 for (dn = busdn->child; dn; dn = dn->sibling) {
389 pdn = PCI_DN(dn);
390 if (pdn && pdn->devfn == devfn)
391 return pnv_pci_cfg_write(dn, where, size, val);
392 }
393
394 return PCIBIOS_DEVICE_NOT_FOUND;
395}
396
331struct pci_ops pnv_pci_ops = { 397struct pci_ops pnv_pci_ops = {
332 .read = pnv_pci_read_config, 398 .read = pnv_pci_read_config,
333 .write = pnv_pci_write_config, 399 .write = pnv_pci_write_config,
334}; 400};
335 401
@@ -412,6 +478,7 @@ static struct iommu_table *pnv_pci_setup_bml_iommu(struct pci_controller *hose)
412 pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)), 478 pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)),
413 be32_to_cpup(sizep), 0); 479 be32_to_cpup(sizep), 0);
414 iommu_init_table(tbl, hose->node); 480 iommu_init_table(tbl, hose->node);
481 iommu_register_group(tbl, pci_domain_nr(hose->bus), 0);
415 482
416 /* Deal with SW invalidated TCEs when needed (BML way) */ 483 /* Deal with SW invalidated TCEs when needed (BML way) */
417 swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info", 484 swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info",
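
The read path above folds EEH detection into every config access: an isolated PHB short-circuits to success, and once EEH is enabled a returned value of all ones for the access size is the trigger for eeh_dev_check_failure(). The standalone sketch below illustrates that all-ones test; eeh_io_error_value() is a local stand-in for the kernel's EEH_IO_ERROR_VALUE() macro, not the macro itself.

#include <stdint.h>
#include <stdio.h>

/* All-ones pattern for a 1-, 2- or 4-byte config read. */
static uint32_t eeh_io_error_value(int size)
{
        return ~0U >> ((4 - size) * 8);   /* 0xFF, 0xFFFF, 0xFFFFFFFF */
}

int main(void)
{
        uint32_t val = 0xFFFF;            /* a 2-byte read that returned all ones */

        /* Only an all-ones result warrants the expensive firmware query;
         * any other value proves the device is still responding. */
        if (val == eeh_io_error_value(2))
                printf("possible frozen PE, ask the EEH core to confirm\n");
        return 0;
}

An all-ones read is only a hint, which is why the code above still calls eeh_dev_check_failure() to confirm against the firmware's freeze state before failing the access.
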
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 25d76c4df50b..d633c64e05a1 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -66,15 +66,43 @@ struct pnv_ioda_pe {
66 struct list_head list; 66 struct list_head list;
67}; 67};
68 68
69/* IOC dependent EEH operations */
70#ifdef CONFIG_EEH
71struct pnv_eeh_ops {
72 int (*post_init)(struct pci_controller *hose);
73 int (*set_option)(struct eeh_pe *pe, int option);
74 int (*get_state)(struct eeh_pe *pe);
75 int (*reset)(struct eeh_pe *pe, int option);
76 int (*get_log)(struct eeh_pe *pe, int severity,
77 char *drv_log, unsigned long len);
78 int (*configure_bridge)(struct eeh_pe *pe);
79 int (*next_error)(struct eeh_pe **pe);
80};
81
82#define PNV_EEH_STATE_ENABLED (1 << 0) /* EEH enabled */
83#define PNV_EEH_STATE_REMOVED (1 << 1) /* PHB removed */
84
85#endif /* CONFIG_EEH */
86
69struct pnv_phb { 87struct pnv_phb {
70 struct pci_controller *hose; 88 struct pci_controller *hose;
71 enum pnv_phb_type type; 89 enum pnv_phb_type type;
72 enum pnv_phb_model model; 90 enum pnv_phb_model model;
91 u64 hub_id;
73 u64 opal_id; 92 u64 opal_id;
74 void __iomem *regs; 93 void __iomem *regs;
75 int initialized; 94 int initialized;
76 spinlock_t lock; 95 spinlock_t lock;
77 96
97#ifdef CONFIG_EEH
98 struct pnv_eeh_ops *eeh_ops;
99 int eeh_state;
100#endif
101
102#ifdef CONFIG_DEBUG_FS
103 struct dentry *dbgfs;
104#endif
105
78#ifdef CONFIG_PCI_MSI 106#ifdef CONFIG_PCI_MSI
79 unsigned int msi_base; 107 unsigned int msi_base;
80 unsigned int msi32_support; 108 unsigned int msi32_support;
@@ -150,7 +178,14 @@ struct pnv_phb {
150}; 178};
151 179
152extern struct pci_ops pnv_pci_ops; 180extern struct pci_ops pnv_pci_ops;
181#ifdef CONFIG_EEH
182extern struct pnv_eeh_ops ioda_eeh_ops;
183#endif
153 184
185int pnv_pci_cfg_read(struct device_node *dn,
186 int where, int size, u32 *val);
187int pnv_pci_cfg_write(struct device_node *dn,
188 int where, int size, u32 val);
154extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, 189extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
155 void *tce_mem, u64 tce_size, 190 void *tce_mem, u64 tce_size,
156 u64 dma_offset); 191 u64 dma_offset);
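
pnv_phb now carries an IOC-specific vtable, so IODA (and any future IOC type) can supply its own EEH primitives while the surrounding platform code stays generic. A hypothetical backend would be wired up roughly as follows; only ioda_eeh_ops exists in this series, and the names and stub bodies below are illustrative:

#ifdef CONFIG_EEH
static int example_eeh_post_init(struct pci_controller *hose)
{
        /* e.g. register OPAL notifiers and flip PNV_EEH_STATE_ENABLED
         * in the owning pnv_phb */
        return 0;
}

static int example_eeh_get_state(struct eeh_pe *pe)
{
        /* query firmware and translate the result to EEH state flags */
        return 0;
}

static struct pnv_eeh_ops example_eeh_ops = {
        .post_init = example_eeh_post_init,
        .get_state = example_eeh_get_state,
        /* .set_option, .reset, .get_log, .configure_bridge and
         * .next_error would be filled in the same way */
};
#endif

A PHB would then point phb->eeh_ops at this table during probe, as the IODA code does with ioda_eeh_ops.
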
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index d4459bfc92f7..84438af96c05 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -93,6 +93,8 @@ static void __noreturn pnv_restart(char *cmd)
93{ 93{
94 long rc = OPAL_BUSY; 94 long rc = OPAL_BUSY;
95 95
96 opal_notifier_disable();
97
96 while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { 98 while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
97 rc = opal_cec_reboot(); 99 rc = opal_cec_reboot();
98 if (rc == OPAL_BUSY_EVENT) 100 if (rc == OPAL_BUSY_EVENT)
@@ -108,6 +110,8 @@ static void __noreturn pnv_power_off(void)
108{ 110{
109 long rc = OPAL_BUSY; 111 long rc = OPAL_BUSY;
110 112
113 opal_notifier_disable();
114
111 while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { 115 while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
112 rc = opal_cec_power_down(0); 116 rc = opal_cec_power_down(0);
113 if (rc == OPAL_BUSY_EVENT) 117 if (rc == OPAL_BUSY_EVENT)
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 88c9459c3e07..89e3857af4e0 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -40,7 +40,7 @@
40#define DBG(fmt...) 40#define DBG(fmt...)
41#endif 41#endif
42 42
43static void __cpuinit pnv_smp_setup_cpu(int cpu) 43static void pnv_smp_setup_cpu(int cpu)
44{ 44{
45 if (cpu != boot_cpuid) 45 if (cpu != boot_cpuid)
46 xics_setup_cpu(); 46 xics_setup_cpu();
@@ -51,7 +51,7 @@ static int pnv_smp_cpu_bootable(unsigned int nr)
51 /* Special case - we inhibit secondary thread startup 51 /* Special case - we inhibit secondary thread startup
52 * during boot if the user requests it. 52 * during boot if the user requests it.
53 */ 53 */
54 if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { 54 if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
55 if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) 55 if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
56 return 0; 56 return 0;
57 if (smt_enabled_at_boot 57 if (smt_enabled_at_boot
diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c
index 177a2f70700c..3e270e3412ae 100644
--- a/arch/powerpc/platforms/ps3/htab.c
+++ b/arch/powerpc/platforms/ps3/htab.c
@@ -109,7 +109,8 @@ static long ps3_hpte_remove(unsigned long hpte_group)
109} 109}
110 110
111static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp, 111static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp,
112 unsigned long vpn, int psize, int ssize, int local) 112 unsigned long vpn, int psize, int apsize,
113 int ssize, int local)
113{ 114{
114 int result; 115 int result;
115 u64 hpte_v, want_v, hpte_rs; 116 u64 hpte_v, want_v, hpte_rs;
@@ -162,7 +163,7 @@ static void ps3_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
162} 163}
163 164
164static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn, 165static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn,
165 int psize, int ssize, int local) 166 int psize, int apsize, int ssize, int local)
166{ 167{
167 unsigned long flags; 168 unsigned long flags;
168 int result; 169 int result;
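
The extra apsize argument threaded through ps3_hpte_updatepp() and ps3_hpte_invalidate() distinguishes the base page size of the segment (psize) from the actual page size backing the individual HPTE, which can differ once hugepages are in play. A backend that never encodes the actual size, like this one, simply ignores the new parameter. A sketch of the resulting hook shape, with a stub body and illustrative name:

static long example_hpte_updatepp(unsigned long slot, unsigned long newpp,
                                  unsigned long vpn, int psize, int apsize,
                                  int ssize, int local)
{
        /* psize:  base page size of the segment
         * apsize: actual page size backing this HPTE; backends that do
         * not encode it (PS3, the LPAR paths below) ignore it, and
         * callers pass 0 where no actual size is tracked. */
        return 0;
}
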
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 4459eff7a75a..1bd3399146ed 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -33,11 +33,6 @@ config PPC_SPLPAR
33 processors, that is, which share physical processors between 33 processors, that is, which share physical processors between
34 two or more partitions. 34 two or more partitions.
35 35
36config EEH
37 bool
38 depends on PPC_PSERIES && PCI
39 default y
40
41config PSERIES_MSI 36config PSERIES_MSI
42 bool 37 bool
43 depends on PCI_MSI && EEH 38 depends on PCI_MSI && EEH
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 53866e537a92..8ae010381316 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -6,9 +6,7 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \
6 firmware.o power.o dlpar.o mobility.o 6 firmware.o power.o dlpar.o mobility.o
7obj-$(CONFIG_SMP) += smp.o 7obj-$(CONFIG_SMP) += smp.o
8obj-$(CONFIG_SCANLOG) += scanlog.o 8obj-$(CONFIG_SCANLOG) += scanlog.o
9obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \ 9obj-$(CONFIG_EEH) += eeh_pseries.o
10 eeh_driver.o eeh_event.o eeh_sysfs.o \
11 eeh_pseries.o
12obj-$(CONFIG_KEXEC) += kexec.o 10obj-$(CONFIG_KEXEC) += kexec.o
13obj-$(CONFIG_PCI) += pci.o pci_dlpar.o 11obj-$(CONFIG_PCI) += pci.o pci_dlpar.o
14obj-$(CONFIG_PSERIES_MSI) += msi.o 12obj-$(CONFIG_PSERIES_MSI) += msi.o
diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c
index ef9d9d84c7d5..5ea88d1541f7 100644
--- a/arch/powerpc/platforms/pseries/io_event_irq.c
+++ b/arch/powerpc/platforms/pseries/io_event_irq.c
@@ -115,7 +115,7 @@ static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog)
115 * by scope or event type alone. For example, Torrent ISR route change 115 * by scope or event type alone. For example, Torrent ISR route change
 116 * event is reported with scope 0x00 (Not Applicable) rather than 116 * event is reported with scope 0x00 (Not Applicable) rather than
 117 * 0x3B (Torrent-hub). It is better to let the clients identify 117 * 0x3B (Torrent-hub). It is better to let the clients identify
118 * who owns the the event. 118 * who owns the event.
119 */ 119 */
120 120
121static irqreturn_t ioei_interrupt(int irq, void *dev_id) 121static irqreturn_t ioei_interrupt(int irq, void *dev_id)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 86ae364900d6..23fc1dcf4434 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -614,6 +614,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
614 614
615 iommu_table_setparms(pci->phb, dn, tbl); 615 iommu_table_setparms(pci->phb, dn, tbl);
616 pci->iommu_table = iommu_init_table(tbl, pci->phb->node); 616 pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
617 iommu_register_group(tbl, pci_domain_nr(bus), 0);
617 618
618 /* Divide the rest (1.75GB) among the children */ 619 /* Divide the rest (1.75GB) among the children */
619 pci->phb->dma_window_size = 0x80000000ul; 620 pci->phb->dma_window_size = 0x80000000ul;
@@ -658,6 +659,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
658 ppci->phb->node); 659 ppci->phb->node);
659 iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); 660 iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
660 ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node); 661 ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
662 iommu_register_group(tbl, pci_domain_nr(bus), 0);
661 pr_debug(" created table: %p\n", ppci->iommu_table); 663 pr_debug(" created table: %p\n", ppci->iommu_table);
662 } 664 }
663} 665}
@@ -684,6 +686,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
684 phb->node); 686 phb->node);
685 iommu_table_setparms(phb, dn, tbl); 687 iommu_table_setparms(phb, dn, tbl);
686 PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); 688 PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
689 iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
687 set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table); 690 set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
688 return; 691 return;
689 } 692 }
@@ -1184,6 +1187,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
1184 pci->phb->node); 1187 pci->phb->node);
1185 iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); 1188 iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
1186 pci->iommu_table = iommu_init_table(tbl, pci->phb->node); 1189 pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
1190 iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
1187 pr_debug(" created table: %p\n", pci->iommu_table); 1191 pr_debug(" created table: %p\n", pci->iommu_table);
1188 } else { 1192 } else {
1189 pr_debug(" found DMA window, table: %p\n", pci->iommu_table); 1193 pr_debug(" found DMA window, table: %p\n", pci->iommu_table);
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 6d62072a7d5a..02d6e21619bb 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -45,6 +45,13 @@
45#include "plpar_wrappers.h" 45#include "plpar_wrappers.h"
46#include "pseries.h" 46#include "pseries.h"
47 47
48/* Flag bits for H_BULK_REMOVE */
49#define HBR_REQUEST 0x4000000000000000UL
50#define HBR_RESPONSE 0x8000000000000000UL
51#define HBR_END 0xc000000000000000UL
52#define HBR_AVPN 0x0200000000000000UL
53#define HBR_ANDCOND 0x0100000000000000UL
54
48 55
49/* in hvCall.S */ 56/* in hvCall.S */
50EXPORT_SYMBOL(plpar_hcall); 57EXPORT_SYMBOL(plpar_hcall);
@@ -64,6 +71,9 @@ void vpa_init(int cpu)
64 if (cpu_has_feature(CPU_FTR_ALTIVEC)) 71 if (cpu_has_feature(CPU_FTR_ALTIVEC))
65 lppaca_of(cpu).vmxregs_in_use = 1; 72 lppaca_of(cpu).vmxregs_in_use = 1;
66 73
74 if (cpu_has_feature(CPU_FTR_ARCH_207S))
75 lppaca_of(cpu).ebb_regs_in_use = 1;
76
67 addr = __pa(&lppaca_of(cpu)); 77 addr = __pa(&lppaca_of(cpu));
68 ret = register_vpa(hwcpu, addr); 78 ret = register_vpa(hwcpu, addr);
69 79
@@ -240,7 +250,8 @@ static void pSeries_lpar_hptab_clear(void)
240static long pSeries_lpar_hpte_updatepp(unsigned long slot, 250static long pSeries_lpar_hpte_updatepp(unsigned long slot,
241 unsigned long newpp, 251 unsigned long newpp,
242 unsigned long vpn, 252 unsigned long vpn,
243 int psize, int ssize, int local) 253 int psize, int apsize,
254 int ssize, int local)
244{ 255{
245 unsigned long lpar_rc; 256 unsigned long lpar_rc;
246 unsigned long flags = (newpp & 7) | H_AVPN; 257 unsigned long flags = (newpp & 7) | H_AVPN;
@@ -328,7 +339,8 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
328} 339}
329 340
330static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, 341static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
331 int psize, int ssize, int local) 342 int psize, int apsize,
343 int ssize, int local)
332{ 344{
333 unsigned long want_v; 345 unsigned long want_v;
334 unsigned long lpar_rc; 346 unsigned long lpar_rc;
@@ -345,6 +357,113 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
345 BUG_ON(lpar_rc != H_SUCCESS); 357 BUG_ON(lpar_rc != H_SUCCESS);
346} 358}
347 359
360/*
361 * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
362 * to make sure that we avoid bouncing the hypervisor tlbie lock.
363 */
364#define PPC64_HUGE_HPTE_BATCH 12
365
366static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
367 unsigned long *vpn, int count,
368 int psize, int ssize)
369{
370 unsigned long param[8];
371 int i = 0, pix = 0, rc;
372 unsigned long flags = 0;
373 int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
374
375 if (lock_tlbie)
376 spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
377
378 for (i = 0; i < count; i++) {
379
380 if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
381 pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
382 ssize, 0);
383 } else {
384 param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
385 param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
386 pix += 2;
387 if (pix == 8) {
388 rc = plpar_hcall9(H_BULK_REMOVE, param,
389 param[0], param[1], param[2],
390 param[3], param[4], param[5],
391 param[6], param[7]);
392 BUG_ON(rc != H_SUCCESS);
393 pix = 0;
394 }
395 }
396 }
397 if (pix) {
398 param[pix] = HBR_END;
399 rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
400 param[2], param[3], param[4], param[5],
401 param[6], param[7]);
402 BUG_ON(rc != H_SUCCESS);
403 }
404
405 if (lock_tlbie)
406 spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
407}
408
409static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
410 unsigned char *hpte_slot_array,
411 unsigned long addr, int psize)
412{
413 int ssize = 0, i, index = 0;
414 unsigned long s_addr = addr;
415 unsigned int max_hpte_count, valid;
416 unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
417 unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
418 unsigned long shift, hidx, vpn = 0, vsid, hash, slot;
419
420 shift = mmu_psize_defs[psize].shift;
421 max_hpte_count = 1U << (PMD_SHIFT - shift);
422
423 for (i = 0; i < max_hpte_count; i++) {
424 valid = hpte_valid(hpte_slot_array, i);
425 if (!valid)
426 continue;
427 hidx = hpte_hash_index(hpte_slot_array, i);
428
429 /* get the vpn */
430 addr = s_addr + (i * (1ul << shift));
431 if (!is_kernel_addr(addr)) {
432 ssize = user_segment_size(addr);
433 vsid = get_vsid(mm->context.id, addr, ssize);
434 WARN_ON(vsid == 0);
435 } else {
436 vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
437 ssize = mmu_kernel_ssize;
438 }
439
440 vpn = hpt_vpn(addr, vsid, ssize);
441 hash = hpt_hash(vpn, shift, ssize);
442 if (hidx & _PTEIDX_SECONDARY)
443 hash = ~hash;
444
445 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
446 slot += hidx & _PTEIDX_GROUP_IX;
447
448 slot_array[index] = slot;
449 vpn_array[index] = vpn;
450 if (index == PPC64_HUGE_HPTE_BATCH - 1) {
451 /*
 452 * Now do a bulk invalidate
453 */
454 __pSeries_lpar_hugepage_invalidate(slot_array,
455 vpn_array,
456 PPC64_HUGE_HPTE_BATCH,
457 psize, ssize);
458 index = 0;
459 } else
460 index++;
461 }
462 if (index)
463 __pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
464 index, psize, ssize);
465}
466
348static void pSeries_lpar_hpte_removebolted(unsigned long ea, 467static void pSeries_lpar_hpte_removebolted(unsigned long ea,
349 int psize, int ssize) 468 int psize, int ssize)
350{ 469{
@@ -356,17 +475,12 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea,
356 475
357 slot = pSeries_lpar_hpte_find(vpn, psize, ssize); 476 slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
358 BUG_ON(slot == -1); 477 BUG_ON(slot == -1);
359 478 /*
360 pSeries_lpar_hpte_invalidate(slot, vpn, psize, ssize, 0); 479 * lpar doesn't use the passed actual page size
480 */
481 pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
361} 482}
362 483
363/* Flag bits for H_BULK_REMOVE */
364#define HBR_REQUEST 0x4000000000000000UL
365#define HBR_RESPONSE 0x8000000000000000UL
366#define HBR_END 0xc000000000000000UL
367#define HBR_AVPN 0x0200000000000000UL
368#define HBR_ANDCOND 0x0100000000000000UL
369
370/* 484/*
371 * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie 485 * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
372 * lock. 486 * lock.
@@ -400,8 +514,11 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
400 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 514 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
401 slot += hidx & _PTEIDX_GROUP_IX; 515 slot += hidx & _PTEIDX_GROUP_IX;
402 if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { 516 if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
517 /*
518 * lpar doesn't use the passed actual page size
519 */
403 pSeries_lpar_hpte_invalidate(slot, vpn, psize, 520 pSeries_lpar_hpte_invalidate(slot, vpn, psize,
404 ssize, local); 521 0, ssize, local);
405 } else { 522 } else {
406 param[pix] = HBR_REQUEST | HBR_AVPN | slot; 523 param[pix] = HBR_REQUEST | HBR_AVPN | slot;
407 param[pix+1] = hpte_encode_avpn(vpn, psize, 524 param[pix+1] = hpte_encode_avpn(vpn, psize,
@@ -452,6 +569,7 @@ void __init hpte_init_lpar(void)
452 ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted; 569 ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted;
453 ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range; 570 ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range;
454 ppc_md.hpte_clear_all = pSeries_lpar_hptab_clear; 571 ppc_md.hpte_clear_all = pSeries_lpar_hptab_clear;
572 ppc_md.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
455} 573}
456 574
457#ifdef CONFIG_PPC_SMLPAR 575#ifdef CONFIG_PPC_SMLPAR
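
The batching arithmetic above is worth spelling out: an H_BULK_REMOVE hypercall carries eight parameter words, each HPTE costs two (an HBR_* control word plus the encoded AVPN), so a full call retires four HPTEs and a PPC64_HUGE_HPTE_BATCH of 12 costs exactly three hypercalls per tlbie-lock hold. Below is a standalone demonstration of the packing, with the hypercall replaced by a printing stub and hpte_encode_avpn() stubbed to 0:

#include <stdint.h>
#include <stdio.h>

#define HBR_REQUEST 0x4000000000000000UL
#define HBR_AVPN    0x0200000000000000UL
#define HBR_END     0xc000000000000000UL

/* Stand-in for plpar_hcall9(H_BULK_REMOVE, param, ...). */
static void bulk_remove(uint64_t *param, int words)
{
        printf("H_BULK_REMOVE: %d parameter words (%d HPTEs)\n",
               words, words / 2);
}

int main(void)
{
        uint64_t param[8];
        uint64_t slot;
        int pix = 0;

        /* 10 HPTEs: two full calls of 4 (slot, avpn) pairs, then a
         * short list of 2 pairs terminated by HBR_END. */
        for (slot = 0; slot < 10; slot++) {
                param[pix]     = HBR_REQUEST | HBR_AVPN | slot;
                param[pix + 1] = 0;   /* hpte_encode_avpn(vpn, psize, ssize) */
                pix += 2;
                if (pix == 8) {
                        bulk_remove(param, pix);
                        pix = 0;
                }
        }
        if (pix) {
                param[pix] = HBR_END;   /* mark the list as short */
                bulk_remove(param, pix);
        }
        return 0;
}
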
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 8733a86ad52e..9f8671a44551 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -18,6 +18,7 @@
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/kmsg_dump.h> 20#include <linux/kmsg_dump.h>
21#include <linux/pstore.h>
21#include <linux/ctype.h> 22#include <linux/ctype.h>
22#include <linux/zlib.h> 23#include <linux/zlib.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
@@ -29,6 +30,13 @@
29/* Max bytes to read/write in one go */ 30/* Max bytes to read/write in one go */
30#define NVRW_CNT 0x20 31#define NVRW_CNT 0x20
31 32
33/*
 34 * Set the oops header version to distinguish between the old and new
 35 * format headers. The lnx,oops-log partition size is at most 4000, so
 36 * a header version > 4000 identifies the new header.
37 */
38#define OOPS_HDR_VERSION 5000
39
32static unsigned int nvram_size; 40static unsigned int nvram_size;
33static int nvram_fetch, nvram_store; 41static int nvram_fetch, nvram_store;
34static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */ 42static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */
@@ -45,20 +53,23 @@ struct nvram_os_partition {
45 int min_size; /* minimum acceptable size (0 means req_size) */ 53 int min_size; /* minimum acceptable size (0 means req_size) */
46 long size; /* size of data portion (excluding err_log_info) */ 54 long size; /* size of data portion (excluding err_log_info) */
47 long index; /* offset of data portion of partition */ 55 long index; /* offset of data portion of partition */
56 bool os_partition; /* partition initialized by OS, not FW */
48}; 57};
49 58
50static struct nvram_os_partition rtas_log_partition = { 59static struct nvram_os_partition rtas_log_partition = {
51 .name = "ibm,rtas-log", 60 .name = "ibm,rtas-log",
52 .req_size = 2079, 61 .req_size = 2079,
53 .min_size = 1055, 62 .min_size = 1055,
54 .index = -1 63 .index = -1,
64 .os_partition = true
55}; 65};
56 66
57static struct nvram_os_partition oops_log_partition = { 67static struct nvram_os_partition oops_log_partition = {
58 .name = "lnx,oops-log", 68 .name = "lnx,oops-log",
59 .req_size = 4000, 69 .req_size = 4000,
60 .min_size = 2000, 70 .min_size = 2000,
61 .index = -1 71 .index = -1,
72 .os_partition = true
62}; 73};
63 74
64static const char *pseries_nvram_os_partitions[] = { 75static const char *pseries_nvram_os_partitions[] = {
@@ -67,6 +78,12 @@ static const char *pseries_nvram_os_partitions[] = {
67 NULL 78 NULL
68}; 79};
69 80
81struct oops_log_info {
82 u16 version;
83 u16 report_length;
84 u64 timestamp;
85} __attribute__((packed));
86
70static void oops_to_nvram(struct kmsg_dumper *dumper, 87static void oops_to_nvram(struct kmsg_dumper *dumper,
71 enum kmsg_dump_reason reason); 88 enum kmsg_dump_reason reason);
72 89
@@ -83,28 +100,28 @@ static unsigned long last_unread_rtas_event; /* timestamp */
83 100
84 * big_oops_buf[] holds the uncompressed text we're capturing. 101 * big_oops_buf[] holds the uncompressed text we're capturing.
85 * 102 *
 86 * oops_buf[] holds the compressed text, preceded by a prefix. 103 * oops_buf[] holds the compressed text, preceded by an oops header.
 87 * The prefix is just a u16 holding the length of the compressed* text. 104 * The oops header has a u16 holding the version of the header (to differentiate
 88 * (*Or uncompressed, if compression fails.) oops_buf[] gets written 105 * between old and new format headers), followed by a u16 holding the length of
 89 * to NVRAM. 106 * the compressed* text (*or uncompressed, if compression fails) and a u64
 107 * holding the timestamp. oops_buf[] gets written to NVRAM.
90 * 108 *
91 * oops_len points to the prefix. oops_data points to the compressed text. 109 * oops_log_info points to the header. oops_data points to the compressed text.
92 * 110 *
93 * +- oops_buf 111 * +- oops_buf
94 * | +- oops_data 112 * | +- oops_data
95 * v v 113 * v v
96 * +------------+-----------------------------------------------+ 114 * +-----------+-----------+-----------+------------------------+
97 * | length | text | 115 * | version | length | timestamp | text |
98 * | (2 bytes) | (oops_data_sz bytes) | 116 * | (2 bytes) | (2 bytes) | (8 bytes) | (oops_data_sz bytes) |
99 * +------------+-----------------------------------------------+ 117 * +-----------+-----------+-----------+------------------------+
100 * ^ 118 * ^
101 * +- oops_len 119 * +- oops_log_info
102 * 120 *
103 * We preallocate these buffers during init to avoid kmalloc during oops/panic. 121 * We preallocate these buffers during init to avoid kmalloc during oops/panic.
104 */ 122 */
105static size_t big_oops_buf_sz; 123static size_t big_oops_buf_sz;
106static char *big_oops_buf, *oops_buf; 124static char *big_oops_buf, *oops_buf;
107static u16 *oops_len;
108static char *oops_data; 125static char *oops_data;
109static size_t oops_data_sz; 126static size_t oops_data_sz;
110 127
@@ -114,6 +131,30 @@ static size_t oops_data_sz;
114#define MEM_LEVEL 4 131#define MEM_LEVEL 4
115static struct z_stream_s stream; 132static struct z_stream_s stream;
116 133
134#ifdef CONFIG_PSTORE
135static struct nvram_os_partition of_config_partition = {
136 .name = "of-config",
137 .index = -1,
138 .os_partition = false
139};
140
141static struct nvram_os_partition common_partition = {
142 .name = "common",
143 .index = -1,
144 .os_partition = false
145};
146
147static enum pstore_type_id nvram_type_ids[] = {
148 PSTORE_TYPE_DMESG,
149 PSTORE_TYPE_PPC_RTAS,
150 PSTORE_TYPE_PPC_OF,
151 PSTORE_TYPE_PPC_COMMON,
152 -1
153};
154static int read_type;
155static unsigned long last_rtas_event;
156#endif
157
117static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) 158static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
118{ 159{
119 unsigned int i; 160 unsigned int i;
@@ -275,48 +316,72 @@ int nvram_write_error_log(char * buff, int length,
275{ 316{
276 int rc = nvram_write_os_partition(&rtas_log_partition, buff, length, 317 int rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
277 err_type, error_log_cnt); 318 err_type, error_log_cnt);
278 if (!rc) 319 if (!rc) {
279 last_unread_rtas_event = get_seconds(); 320 last_unread_rtas_event = get_seconds();
321#ifdef CONFIG_PSTORE
322 last_rtas_event = get_seconds();
323#endif
324 }
325
280 return rc; 326 return rc;
281} 327}
282 328
283/* nvram_read_error_log 329/* nvram_read_partition
284 * 330 *
285 * Reads nvram for error log for at most 'length' 331 * Reads nvram partition for at most 'length'
286 */ 332 */
287int nvram_read_error_log(char * buff, int length, 333int nvram_read_partition(struct nvram_os_partition *part, char *buff,
288 unsigned int * err_type, unsigned int * error_log_cnt) 334 int length, unsigned int *err_type,
335 unsigned int *error_log_cnt)
289{ 336{
290 int rc; 337 int rc;
291 loff_t tmp_index; 338 loff_t tmp_index;
292 struct err_log_info info; 339 struct err_log_info info;
293 340
294 if (rtas_log_partition.index == -1) 341 if (part->index == -1)
295 return -1; 342 return -1;
296 343
297 if (length > rtas_log_partition.size) 344 if (length > part->size)
298 length = rtas_log_partition.size; 345 length = part->size;
299 346
300 tmp_index = rtas_log_partition.index; 347 tmp_index = part->index;
301 348
302 rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index); 349 if (part->os_partition) {
303 if (rc <= 0) { 350 rc = ppc_md.nvram_read((char *)&info,
304 printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); 351 sizeof(struct err_log_info),
305 return rc; 352 &tmp_index);
353 if (rc <= 0) {
354 pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__,
355 rc);
356 return rc;
357 }
306 } 358 }
307 359
308 rc = ppc_md.nvram_read(buff, length, &tmp_index); 360 rc = ppc_md.nvram_read(buff, length, &tmp_index);
309 if (rc <= 0) { 361 if (rc <= 0) {
310 printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); 362 pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, rc);
311 return rc; 363 return rc;
312 } 364 }
313 365
314 *error_log_cnt = info.seq_num; 366 if (part->os_partition) {
315 *err_type = info.error_type; 367 *error_log_cnt = info.seq_num;
368 *err_type = info.error_type;
369 }
316 370
317 return 0; 371 return 0;
318} 372}
319 373
374/* nvram_read_error_log
375 *
376 * Reads nvram for error log for at most 'length'
377 */
378int nvram_read_error_log(char *buff, int length,
379 unsigned int *err_type, unsigned int *error_log_cnt)
380{
381 return nvram_read_partition(&rtas_log_partition, buff, length,
382 err_type, error_log_cnt);
383}
384
320/* This doesn't actually zero anything, but it sets the event_logged 385/* This doesn't actually zero anything, but it sets the event_logged
321 * word to tell that this event is safely in syslog. 386 * word to tell that this event is safely in syslog.
322 */ 387 */
@@ -405,6 +470,349 @@ static int __init pseries_nvram_init_os_partition(struct nvram_os_partition
405 return 0; 470 return 0;
406} 471}
407 472
473/*
474 * Are we using the ibm,rtas-log for oops/panic reports? And if so,
475 * would logging this oops/panic overwrite an RTAS event that rtas_errd
476 * hasn't had a chance to read and process? Return 1 if so, else 0.
477 *
478 * We assume that if rtas_errd hasn't read the RTAS event in
479 * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
480 */
481static int clobbering_unread_rtas_event(void)
482{
483 return (oops_log_partition.index == rtas_log_partition.index
484 && last_unread_rtas_event
485 && get_seconds() - last_unread_rtas_event <=
486 NVRAM_RTAS_READ_TIMEOUT);
487}
488
489/* Derived from logfs_compress() */
490static int nvram_compress(const void *in, void *out, size_t inlen,
491 size_t outlen)
492{
493 int err, ret;
494
495 ret = -EIO;
496 err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
497 MEM_LEVEL, Z_DEFAULT_STRATEGY);
498 if (err != Z_OK)
499 goto error;
500
501 stream.next_in = in;
502 stream.avail_in = inlen;
503 stream.total_in = 0;
504 stream.next_out = out;
505 stream.avail_out = outlen;
506 stream.total_out = 0;
507
508 err = zlib_deflate(&stream, Z_FINISH);
509 if (err != Z_STREAM_END)
510 goto error;
511
512 err = zlib_deflateEnd(&stream);
513 if (err != Z_OK)
514 goto error;
515
516 if (stream.total_out >= stream.total_in)
517 goto error;
518
519 ret = stream.total_out;
520error:
521 return ret;
522}
523
524/* Compress the text from big_oops_buf into oops_buf. */
525static int zip_oops(size_t text_len)
526{
527 struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
528 int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
529 oops_data_sz);
530 if (zipped_len < 0) {
531 pr_err("nvram: compression failed; returned %d\n", zipped_len);
532 pr_err("nvram: logging uncompressed oops/panic report\n");
533 return -1;
534 }
535 oops_hdr->version = OOPS_HDR_VERSION;
536 oops_hdr->report_length = (u16) zipped_len;
537 oops_hdr->timestamp = get_seconds();
538 return 0;
539}
540
541#ifdef CONFIG_PSTORE
542/* Derived from logfs_uncompress */
543int nvram_decompress(void *in, void *out, size_t inlen, size_t outlen)
544{
545 int err, ret;
546
547 ret = -EIO;
548 err = zlib_inflateInit(&stream);
549 if (err != Z_OK)
550 goto error;
551
552 stream.next_in = in;
553 stream.avail_in = inlen;
554 stream.total_in = 0;
555 stream.next_out = out;
556 stream.avail_out = outlen;
557 stream.total_out = 0;
558
559 err = zlib_inflate(&stream, Z_FINISH);
560 if (err != Z_STREAM_END)
561 goto error;
562
563 err = zlib_inflateEnd(&stream);
564 if (err != Z_OK)
565 goto error;
566
567 ret = stream.total_out;
568error:
569 return ret;
570}
571
572static int unzip_oops(char *oops_buf, char *big_buf)
573{
574 struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
575 u64 timestamp = oops_hdr->timestamp;
576 char *big_oops_data = NULL;
577 char *oops_data_buf = NULL;
578 size_t big_oops_data_sz;
579 int unzipped_len;
580
581 big_oops_data = big_buf + sizeof(struct oops_log_info);
582 big_oops_data_sz = big_oops_buf_sz - sizeof(struct oops_log_info);
583 oops_data_buf = oops_buf + sizeof(struct oops_log_info);
584
585 unzipped_len = nvram_decompress(oops_data_buf, big_oops_data,
586 oops_hdr->report_length,
587 big_oops_data_sz);
588
589 if (unzipped_len < 0) {
590 pr_err("nvram: decompression failed; returned %d\n",
591 unzipped_len);
592 return -1;
593 }
594 oops_hdr = (struct oops_log_info *)big_buf;
595 oops_hdr->version = OOPS_HDR_VERSION;
596 oops_hdr->report_length = (u16) unzipped_len;
597 oops_hdr->timestamp = timestamp;
598 return 0;
599}
600
601static int nvram_pstore_open(struct pstore_info *psi)
602{
603 /* Reset the iterator to start reading partitions again */
604 read_type = -1;
605 return 0;
606}
607
608/**
609 * nvram_pstore_write - pstore write callback for nvram
610 * @type: Type of message logged
611 * @reason: reason behind dump (oops/panic)
612 * @id: identifier to indicate the write performed
 613 * @part: pstore writes data to the registered buffer in parts;
 614 * this is the number of the part being written
615 * @count: Indicates oops count
616 * @hsize: Size of header added by pstore
617 * @size: number of bytes written to the registered buffer
618 * @psi: registered pstore_info structure
619 *
620 * Called by pstore_dump() when an oops or panic report is logged in the
621 * printk buffer.
622 * Returns 0 on successful write.
623 */
624static int nvram_pstore_write(enum pstore_type_id type,
625 enum kmsg_dump_reason reason,
626 u64 *id, unsigned int part, int count,
627 size_t hsize, size_t size,
628 struct pstore_info *psi)
629{
630 int rc;
631 unsigned int err_type = ERR_TYPE_KERNEL_PANIC;
632 struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf;
633
634 /* part 1 has the recent messages from printk buffer */
635 if (part > 1 || type != PSTORE_TYPE_DMESG ||
636 clobbering_unread_rtas_event())
637 return -1;
638
639 oops_hdr->version = OOPS_HDR_VERSION;
640 oops_hdr->report_length = (u16) size;
641 oops_hdr->timestamp = get_seconds();
642
643 if (big_oops_buf) {
644 rc = zip_oops(size);
645 /*
646 * If compression fails copy recent log messages from
647 * big_oops_buf to oops_data.
648 */
649 if (rc != 0) {
650 size_t diff = size - oops_data_sz + hsize;
651
652 if (size > oops_data_sz) {
653 memcpy(oops_data, big_oops_buf, hsize);
654 memcpy(oops_data + hsize, big_oops_buf + diff,
655 oops_data_sz - hsize);
656
657 oops_hdr->report_length = (u16) oops_data_sz;
658 } else
659 memcpy(oops_data, big_oops_buf, size);
660 } else
661 err_type = ERR_TYPE_KERNEL_PANIC_GZ;
662 }
663
664 rc = nvram_write_os_partition(&oops_log_partition, oops_buf,
665 (int) (sizeof(*oops_hdr) + oops_hdr->report_length), err_type,
666 count);
667
668 if (rc != 0)
669 return rc;
670
671 *id = part;
672 return 0;
673}
674
675/*
676 * Reads the oops/panic report, rtas, of-config and common partition.
677 * Returns the length of the data we read from each partition.
678 * Returns 0 if we've been called before.
679 */
680static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
681 int *count, struct timespec *time, char **buf,
682 struct pstore_info *psi)
683{
684 struct oops_log_info *oops_hdr;
685 unsigned int err_type, id_no, size = 0;
686 struct nvram_os_partition *part = NULL;
687 char *buff = NULL, *big_buff = NULL;
688 int rc, sig = 0;
689 loff_t p;
690
691read_partition:
692 read_type++;
693
694 switch (nvram_type_ids[read_type]) {
695 case PSTORE_TYPE_DMESG:
696 part = &oops_log_partition;
697 *type = PSTORE_TYPE_DMESG;
698 break;
699 case PSTORE_TYPE_PPC_RTAS:
700 part = &rtas_log_partition;
701 *type = PSTORE_TYPE_PPC_RTAS;
702 time->tv_sec = last_rtas_event;
703 time->tv_nsec = 0;
704 break;
705 case PSTORE_TYPE_PPC_OF:
706 sig = NVRAM_SIG_OF;
707 part = &of_config_partition;
708 *type = PSTORE_TYPE_PPC_OF;
709 *id = PSTORE_TYPE_PPC_OF;
710 time->tv_sec = 0;
711 time->tv_nsec = 0;
712 break;
713 case PSTORE_TYPE_PPC_COMMON:
714 sig = NVRAM_SIG_SYS;
715 part = &common_partition;
716 *type = PSTORE_TYPE_PPC_COMMON;
717 *id = PSTORE_TYPE_PPC_COMMON;
718 time->tv_sec = 0;
719 time->tv_nsec = 0;
720 break;
721 default:
722 return 0;
723 }
724
725 if (!part->os_partition) {
726 p = nvram_find_partition(part->name, sig, &size);
727 if (p <= 0) {
728 pr_err("nvram: Failed to find partition %s, "
729 "err %d\n", part->name, (int)p);
730 return 0;
731 }
732 part->index = p;
733 part->size = size;
734 }
735
736 buff = kmalloc(part->size, GFP_KERNEL);
737
738 if (!buff)
739 return -ENOMEM;
740
741 if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) {
742 kfree(buff);
743 return 0;
744 }
745
746 *count = 0;
747
748 if (part->os_partition)
749 *id = id_no;
750
751 if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) {
752 oops_hdr = (struct oops_log_info *)buff;
753 *buf = buff + sizeof(*oops_hdr);
754
755 if (err_type == ERR_TYPE_KERNEL_PANIC_GZ) {
756 big_buff = kmalloc(big_oops_buf_sz, GFP_KERNEL);
757 if (!big_buff)
758 return -ENOMEM;
759
760 rc = unzip_oops(buff, big_buff);
761
762 if (rc != 0) {
763 kfree(buff);
764 kfree(big_buff);
765 goto read_partition;
766 }
767
768 oops_hdr = (struct oops_log_info *)big_buff;
769 *buf = big_buff + sizeof(*oops_hdr);
770 kfree(buff);
771 }
772
773 time->tv_sec = oops_hdr->timestamp;
774 time->tv_nsec = 0;
775 return oops_hdr->report_length;
776 }
777
778 *buf = buff;
779 return part->size;
780}
781
782static struct pstore_info nvram_pstore_info = {
783 .owner = THIS_MODULE,
784 .name = "nvram",
785 .open = nvram_pstore_open,
786 .read = nvram_pstore_read,
787 .write = nvram_pstore_write,
788};
789
790static int nvram_pstore_init(void)
791{
792 int rc = 0;
793
794 if (big_oops_buf) {
795 nvram_pstore_info.buf = big_oops_buf;
796 nvram_pstore_info.bufsize = big_oops_buf_sz;
797 } else {
798 nvram_pstore_info.buf = oops_data;
799 nvram_pstore_info.bufsize = oops_data_sz;
800 }
801
802 rc = pstore_register(&nvram_pstore_info);
803 if (rc != 0)
804 pr_err("nvram: pstore_register() failed, defaults to "
805 "kmsg_dump; returned %d\n", rc);
806
807 return rc;
808}
809#else
810static int nvram_pstore_init(void)
811{
812 return -1;
813}
814#endif
815
408static void __init nvram_init_oops_partition(int rtas_partition_exists) 816static void __init nvram_init_oops_partition(int rtas_partition_exists)
409{ 817{
410 int rc; 818 int rc;
@@ -425,9 +833,8 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
425 oops_log_partition.name); 833 oops_log_partition.name);
426 return; 834 return;
427 } 835 }
428 oops_len = (u16*) oops_buf; 836 oops_data = oops_buf + sizeof(struct oops_log_info);
429 oops_data = oops_buf + sizeof(u16); 837 oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info);
430 oops_data_sz = oops_log_partition.size - sizeof(u16);
431 838
432 /* 839 /*
433 * Figure compression (preceded by elimination of each line's <n> 840 * Figure compression (preceded by elimination of each line's <n>
@@ -452,6 +859,11 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
452 stream.workspace = NULL; 859 stream.workspace = NULL;
453 } 860 }
454 861
862 rc = nvram_pstore_init();
863
864 if (!rc)
865 return;
866
455 rc = kmsg_dump_register(&nvram_kmsg_dumper); 867 rc = kmsg_dump_register(&nvram_kmsg_dumper);
456 if (rc != 0) { 868 if (rc != 0) {
457 pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc); 869 pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
@@ -501,70 +913,6 @@ int __init pSeries_nvram_init(void)
501 return 0; 913 return 0;
502} 914}
503 915
504/*
505 * Are we using the ibm,rtas-log for oops/panic reports? And if so,
506 * would logging this oops/panic overwrite an RTAS event that rtas_errd
507 * hasn't had a chance to read and process? Return 1 if so, else 0.
508 *
509 * We assume that if rtas_errd hasn't read the RTAS event in
510 * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
511 */
512static int clobbering_unread_rtas_event(void)
513{
514 return (oops_log_partition.index == rtas_log_partition.index
515 && last_unread_rtas_event
516 && get_seconds() - last_unread_rtas_event <=
517 NVRAM_RTAS_READ_TIMEOUT);
518}
519
520/* Derived from logfs_compress() */
521static int nvram_compress(const void *in, void *out, size_t inlen,
522 size_t outlen)
523{
524 int err, ret;
525
526 ret = -EIO;
527 err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
528 MEM_LEVEL, Z_DEFAULT_STRATEGY);
529 if (err != Z_OK)
530 goto error;
531
532 stream.next_in = in;
533 stream.avail_in = inlen;
534 stream.total_in = 0;
535 stream.next_out = out;
536 stream.avail_out = outlen;
537 stream.total_out = 0;
538
539 err = zlib_deflate(&stream, Z_FINISH);
540 if (err != Z_STREAM_END)
541 goto error;
542
543 err = zlib_deflateEnd(&stream);
544 if (err != Z_OK)
545 goto error;
546
547 if (stream.total_out >= stream.total_in)
548 goto error;
549
550 ret = stream.total_out;
551error:
552 return ret;
553}
554
555/* Compress the text from big_oops_buf into oops_buf. */
556static int zip_oops(size_t text_len)
557{
558 int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
559 oops_data_sz);
560 if (zipped_len < 0) {
561 pr_err("nvram: compression failed; returned %d\n", zipped_len);
562 pr_err("nvram: logging uncompressed oops/panic report\n");
563 return -1;
564 }
565 *oops_len = (u16) zipped_len;
566 return 0;
567}
568 916
569/* 917/*
570 * This is our kmsg_dump callback, called after an oops or panic report 918 * This is our kmsg_dump callback, called after an oops or panic report
@@ -576,6 +924,7 @@ static int zip_oops(size_t text_len)
576static void oops_to_nvram(struct kmsg_dumper *dumper, 924static void oops_to_nvram(struct kmsg_dumper *dumper,
577 enum kmsg_dump_reason reason) 925 enum kmsg_dump_reason reason)
578{ 926{
927 struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
579 static unsigned int oops_count = 0; 928 static unsigned int oops_count = 0;
580 static bool panicking = false; 929 static bool panicking = false;
581 static DEFINE_SPINLOCK(lock); 930 static DEFINE_SPINLOCK(lock);
@@ -619,14 +968,17 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
619 } 968 }
620 if (rc != 0) { 969 if (rc != 0) {
621 kmsg_dump_rewind(dumper); 970 kmsg_dump_rewind(dumper);
622 kmsg_dump_get_buffer(dumper, true, 971 kmsg_dump_get_buffer(dumper, false,
623 oops_data, oops_data_sz, &text_len); 972 oops_data, oops_data_sz, &text_len);
624 err_type = ERR_TYPE_KERNEL_PANIC; 973 err_type = ERR_TYPE_KERNEL_PANIC;
625 *oops_len = (u16) text_len; 974 oops_hdr->version = OOPS_HDR_VERSION;
975 oops_hdr->report_length = (u16) text_len;
976 oops_hdr->timestamp = get_seconds();
626 } 977 }
627 978
628 (void) nvram_write_os_partition(&oops_log_partition, oops_buf, 979 (void) nvram_write_os_partition(&oops_log_partition, oops_buf,
629 (int) (sizeof(*oops_len) + *oops_len), err_type, ++oops_count); 980 (int) (sizeof(*oops_hdr) + oops_hdr->report_length), err_type,
981 ++oops_count);
630 982
631 spin_unlock_irqrestore(&lock, flags); 983 spin_unlock_irqrestore(&lock, flags);
632} 984}
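
Since the partition can contain blobs written by either kernel generation, a reader has to cope with both layouts. The trick documented above makes that cheap: the partition is at most 4000 bytes, so any leading u16 greater than 4000 can only be the version field of the new header. A userspace sketch of a consumer applying that rule (native endianness assumed, helper name hypothetical):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define OOPS_HDR_VERSION 5000

struct oops_log_info {
        uint16_t version;
        uint16_t report_length;
        uint64_t timestamp;
} __attribute__((packed));

/* Return a pointer to the report text and its length, accepting both
 * the old bare-u16-length prefix and the new versioned header. */
static const char *oops_text(const void *blob, uint16_t *len)
{
        uint16_t first;

        memcpy(&first, blob, sizeof(first));
        if (first > 4000) {                     /* new format */
                struct oops_log_info hdr;

                memcpy(&hdr, blob, sizeof(hdr));
                *len = hdr.report_length;
                return (const char *)blob + sizeof(hdr);
        }
        *len = first;                           /* old format */
        return (const char *)blob + sizeof(first);
}

int main(void)
{
        unsigned char blob[sizeof(struct oops_log_info) + 5];
        struct oops_log_info hdr = {
                .version = OOPS_HDR_VERSION,
                .report_length = 5,
                .timestamp = 1372636800,
        };
        uint16_t len;
        const char *text;

        memcpy(blob, &hdr, sizeof(hdr));
        memcpy(blob + sizeof(hdr), "Oops!", 5);
        text = oops_text(blob, &len);
        printf("%.*s\n", len, text);
        return 0;
}
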
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
index c91b22be9288..efe61374f6ea 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -64,91 +64,6 @@ pcibios_find_pci_bus(struct device_node *dn)
64} 64}
65EXPORT_SYMBOL_GPL(pcibios_find_pci_bus); 65EXPORT_SYMBOL_GPL(pcibios_find_pci_bus);
66 66
67/**
68 * __pcibios_remove_pci_devices - remove all devices under this bus
69 * @bus: the indicated PCI bus
70 * @purge_pe: destroy the PE on removal of PCI devices
71 *
72 * Remove all of the PCI devices under this bus both from the
73 * linux pci device tree, and from the powerpc EEH address cache.
 74 * By default, the corresponding PE will be destroyed during the
 75 * normal PCI hotplug path. For PCI hotplug during EEH recovery,
 76 * the corresponding PE won't be destroyed and deallocated.
77 */
78void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe)
79{
80 struct pci_dev *dev, *tmp;
81 struct pci_bus *child_bus;
82
83 /* First go down child busses */
84 list_for_each_entry(child_bus, &bus->children, node)
85 __pcibios_remove_pci_devices(child_bus, purge_pe);
86
87 pr_debug("PCI: Removing devices on bus %04x:%02x\n",
88 pci_domain_nr(bus), bus->number);
89 list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
90 pr_debug(" * Removing %s...\n", pci_name(dev));
91 eeh_remove_bus_device(dev, purge_pe);
92 pci_stop_and_remove_bus_device(dev);
93 }
94}
95
96/**
97 * pcibios_remove_pci_devices - remove all devices under this bus
98 *
99 * Remove all of the PCI devices under this bus both from the
100 * linux pci device tree, and from the powerpc EEH address cache.
101 */
102void pcibios_remove_pci_devices(struct pci_bus *bus)
103{
104 __pcibios_remove_pci_devices(bus, 1);
105}
106EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
107
108/**
109 * pcibios_add_pci_devices - adds new pci devices to bus
110 *
111 * This routine will find and fixup new pci devices under
112 * the indicated bus. This routine presumes that there
113 * might already be some devices under this bridge, so
114 * it carefully tries to add only new devices. (And that
115 * is how this routine differs from other, similar pcibios
116 * routines.)
117 */
118void pcibios_add_pci_devices(struct pci_bus * bus)
119{
120 int slotno, num, mode, pass, max;
121 struct pci_dev *dev;
122 struct device_node *dn = pci_bus_to_OF_node(bus);
123
124 eeh_add_device_tree_early(dn);
125
126 mode = PCI_PROBE_NORMAL;
127 if (ppc_md.pci_probe_mode)
128 mode = ppc_md.pci_probe_mode(bus);
129
130 if (mode == PCI_PROBE_DEVTREE) {
131 /* use ofdt-based probe */
132 of_rescan_bus(dn, bus);
133 } else if (mode == PCI_PROBE_NORMAL) {
134 /* use legacy probe */
135 slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
136 num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
137 if (!num)
138 return;
139 pcibios_setup_bus_devices(bus);
140 max = bus->busn_res.start;
141 for (pass=0; pass < 2; pass++)
142 list_for_each_entry(dev, &bus->devices, bus_list) {
143 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
144 dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
145 max = pci_scan_bridge(bus, dev, max, pass);
146 }
147 }
148 pcibios_finish_adding_to_bus(bus);
149}
150EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
151
152struct pci_controller *init_phb_dynamic(struct device_node *dn) 67struct pci_controller *init_phb_dynamic(struct device_node *dn)
153{ 68{
154 struct pci_controller *phb; 69 struct pci_controller *phb;
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index c4dfccd3a3d9..7b3cbde8c783 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -83,7 +83,7 @@ static void handle_system_shutdown(char event_modifier)
83 switch (event_modifier) { 83 switch (event_modifier) {
84 case EPOW_SHUTDOWN_NORMAL: 84 case EPOW_SHUTDOWN_NORMAL:
85 pr_emerg("Firmware initiated power off"); 85 pr_emerg("Firmware initiated power off");
86 orderly_poweroff(1); 86 orderly_poweroff(true);
87 break; 87 break;
88 88
89 case EPOW_SHUTDOWN_ON_UPS: 89 case EPOW_SHUTDOWN_ON_UPS:
@@ -95,13 +95,13 @@ static void handle_system_shutdown(char event_modifier)
95 pr_emerg("Loss of system critical functions reported by " 95 pr_emerg("Loss of system critical functions reported by "
96 "firmware"); 96 "firmware");
97 pr_emerg("Check RTAS error log for details"); 97 pr_emerg("Check RTAS error log for details");
98 orderly_poweroff(1); 98 orderly_poweroff(true);
99 break; 99 break;
100 100
101 case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: 101 case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
102 pr_emerg("Ambient temperature too high reported by firmware"); 102 pr_emerg("Ambient temperature too high reported by firmware");
103 pr_emerg("Check RTAS error log for details"); 103 pr_emerg("Check RTAS error log for details");
104 orderly_poweroff(1); 104 orderly_poweroff(true);
105 break; 105 break;
106 106
107 default: 107 default:
@@ -162,7 +162,7 @@ void rtas_parse_epow_errlog(struct rtas_error_log *log)
162 162
163 case EPOW_SYSTEM_HALT: 163 case EPOW_SYSTEM_HALT:
164 pr_emerg("Firmware initiated power off"); 164 pr_emerg("Firmware initiated power off");
165 orderly_poweroff(1); 165 orderly_poweroff(true);
166 break; 166 break;
167 167
168 case EPOW_MAIN_ENCLOSURE: 168 case EPOW_MAIN_ENCLOSURE:
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 12bc8c3663ad..306643cc9dbc 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -192,7 +192,7 @@ static int smp_pSeries_cpu_bootable(unsigned int nr)
192 /* Special case - we inhibit secondary thread startup 192 /* Special case - we inhibit secondary thread startup
193 * during boot if the user requests it. 193 * during boot if the user requests it.
194 */ 194 */
195 if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { 195 if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
196 if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) 196 if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
197 return 0; 197 return 0;
198 if (smt_enabled_at_boot 198 if (smt_enabled_at_boot
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index 99464a7bdb3b..f67ac900d870 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -4,6 +4,8 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
4 4
5mpic-msi-obj-$(CONFIG_PCI_MSI) += mpic_msi.o mpic_u3msi.o mpic_pasemi_msi.o 5mpic-msi-obj-$(CONFIG_PCI_MSI) += mpic_msi.o mpic_u3msi.o mpic_pasemi_msi.o
6obj-$(CONFIG_MPIC) += mpic.o $(mpic-msi-obj-y) 6obj-$(CONFIG_MPIC) += mpic.o $(mpic-msi-obj-y)
7obj-$(CONFIG_MPIC_TIMER) += mpic_timer.o
8obj-$(CONFIG_FSL_MPIC_TIMER_WAKEUP) += fsl_mpic_timer_wakeup.o
7mpic-msgr-obj-$(CONFIG_MPIC_MSGR) += mpic_msgr.o 9mpic-msgr-obj-$(CONFIG_MPIC_MSGR) += mpic_msgr.o
8obj-$(CONFIG_MPIC) += mpic.o $(mpic-msi-obj-y) $(mpic-msgr-obj-y) 10obj-$(CONFIG_MPIC) += mpic.o $(mpic-msi-obj-y) $(mpic-msgr-obj-y)
9obj-$(CONFIG_PPC_EPAPR_HV_PIC) += ehv_pic.o 11obj-$(CONFIG_PPC_EPAPR_HV_PIC) += ehv_pic.o
diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c
index d4fa03f2b6ac..5e6ff38ea69f 100644
--- a/arch/powerpc/sysdev/cpm1.c
+++ b/arch/powerpc/sysdev/cpm1.c
@@ -120,6 +120,7 @@ static irqreturn_t cpm_error_interrupt(int irq, void *dev)
120 120
121static struct irqaction cpm_error_irqaction = { 121static struct irqaction cpm_error_irqaction = {
122 .handler = cpm_error_interrupt, 122 .handler = cpm_error_interrupt,
123 .flags = IRQF_NO_THREAD,
123 .name = "error", 124 .name = "error",
124}; 125};
125 126
diff --git a/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c b/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c
new file mode 100644
index 000000000000..1707bf04dec6
--- /dev/null
+++ b/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c
@@ -0,0 +1,161 @@
1/*
2 * MPIC timer wakeup driver
3 *
4 * Copyright 2013 Freescale Semiconductor, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; either version 2 of the License, or (at your
9 * option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/errno.h>
15#include <linux/module.h>
16#include <linux/interrupt.h>
17#include <linux/device.h>
18
19#include <asm/mpic_timer.h>
20#include <asm/mpic.h>
21
22struct fsl_mpic_timer_wakeup {
23 struct mpic_timer *timer;
24 struct work_struct free_work;
25};
26
27static struct fsl_mpic_timer_wakeup *fsl_wakeup;
28static DEFINE_MUTEX(sysfs_lock);
29
30static void fsl_free_resource(struct work_struct *ws)
31{
32 struct fsl_mpic_timer_wakeup *wakeup =
33 container_of(ws, struct fsl_mpic_timer_wakeup, free_work);
34
35 mutex_lock(&sysfs_lock);
36
37 if (wakeup->timer) {
38 disable_irq_wake(wakeup->timer->irq);
39 mpic_free_timer(wakeup->timer);
40 }
41
42 wakeup->timer = NULL;
43 mutex_unlock(&sysfs_lock);
44}
45
46static irqreturn_t fsl_mpic_timer_irq(int irq, void *dev_id)
47{
48 struct fsl_mpic_timer_wakeup *wakeup = dev_id;
49
50 schedule_work(&wakeup->free_work);
51
52 return wakeup->timer ? IRQ_HANDLED : IRQ_NONE;
53}
54
55static ssize_t fsl_timer_wakeup_show(struct device *dev,
56 struct device_attribute *attr,
57 char *buf)
58{
59 struct timeval interval;
60 int val = 0;
61
62 mutex_lock(&sysfs_lock);
63 if (fsl_wakeup->timer) {
64 mpic_get_remain_time(fsl_wakeup->timer, &interval);
65 val = interval.tv_sec + 1;
66 }
67 mutex_unlock(&sysfs_lock);
68
69 return sprintf(buf, "%d\n", val);
70}
71
72static ssize_t fsl_timer_wakeup_store(struct device *dev,
73 struct device_attribute *attr,
74 const char *buf,
75 size_t count)
76{
77 struct timeval interval;
78 int ret;
79
80 interval.tv_usec = 0;
81 if (kstrtol(buf, 0, &interval.tv_sec))
82 return -EINVAL;
83
84 mutex_lock(&sysfs_lock);
85
86 if (fsl_wakeup->timer) {
87 disable_irq_wake(fsl_wakeup->timer->irq);
88 mpic_free_timer(fsl_wakeup->timer);
89 fsl_wakeup->timer = NULL;
90 }
91
92 if (!interval.tv_sec) {
93 mutex_unlock(&sysfs_lock);
94 return count;
95 }
96
97 fsl_wakeup->timer = mpic_request_timer(fsl_mpic_timer_irq,
98 fsl_wakeup, &interval);
99 if (!fsl_wakeup->timer) {
100 mutex_unlock(&sysfs_lock);
101 return -EINVAL;
102 }
103
104 ret = enable_irq_wake(fsl_wakeup->timer->irq);
105 if (ret) {
106 mpic_free_timer(fsl_wakeup->timer);
107 fsl_wakeup->timer = NULL;
108 mutex_unlock(&sysfs_lock);
109
110 return ret;
111 }
112
113 mpic_start_timer(fsl_wakeup->timer);
114
115 mutex_unlock(&sysfs_lock);
116
117 return count;
118}
119
120static struct device_attribute mpic_attributes = __ATTR(timer_wakeup, 0644,
121 fsl_timer_wakeup_show, fsl_timer_wakeup_store);
122
123static int __init fsl_wakeup_sys_init(void)
124{
125 int ret;
126
127 fsl_wakeup = kzalloc(sizeof(struct fsl_mpic_timer_wakeup), GFP_KERNEL);
128 if (!fsl_wakeup)
129 return -ENOMEM;
130
131 INIT_WORK(&fsl_wakeup->free_work, fsl_free_resource);
132
133 ret = device_create_file(mpic_subsys.dev_root, &mpic_attributes);
134 if (ret)
135 kfree(fsl_wakeup);
136
137 return ret;
138}
139
140static void __exit fsl_wakeup_sys_exit(void)
141{
142 device_remove_file(mpic_subsys.dev_root, &mpic_attributes);
143
144 mutex_lock(&sysfs_lock);
145
146 if (fsl_wakeup->timer) {
147 disable_irq_wake(fsl_wakeup->timer->irq);
148 mpic_free_timer(fsl_wakeup->timer);
149 }
150
151 kfree(fsl_wakeup);
152
153 mutex_unlock(&sysfs_lock);
154}
155
156module_init(fsl_wakeup_sys_init);
157module_exit(fsl_wakeup_sys_exit);
158
159MODULE_DESCRIPTION("Freescale MPIC global timer wakeup driver");
160MODULE_LICENSE("GPL v2");
161MODULE_AUTHOR("Wang Dongsheng <dongsheng.wang@freescale.com>");
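
The attribute registered by fsl_wakeup_sys_init() should surface as
/sys/devices/system/mpic/timer_wakeup once the "mpic" subsystem (added in the
mpic.c hunk below) is registered. A minimal userspace sketch, with that path
assumed, which arms a 30-second wake timer and reads back the time remaining:

	#include <stdio.h>

	int main(void)
	{
		/* path assumed from subsys_system_register(&mpic_subsys, NULL) */
		const char *attr = "/sys/devices/system/mpic/timer_wakeup";
		FILE *f = fopen(attr, "w");
		int remaining = 0;

		if (!f)
			return 1;
		fprintf(f, "30\n");	/* arm: wake the system in ~30 seconds */
		fclose(f);

		f = fopen(attr, "r");
		if (!f)
			return 1;
		if (fscanf(f, "%d", &remaining) == 1)
			printf("wake timer: %d seconds left\n", remaining);
		fclose(f);
		return 0;
	}

Writing 0 cancels and frees the timer again, mirroring the !interval.tv_sec
branch in fsl_timer_wakeup_store().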
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 3cc2f9159ab1..1be54faf60dd 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -48,6 +48,12 @@
48#define DBG(fmt...) 48#define DBG(fmt...)
49#endif 49#endif
50 50
51struct bus_type mpic_subsys = {
52 .name = "mpic",
53 .dev_name = "mpic",
54};
55EXPORT_SYMBOL_GPL(mpic_subsys);
56
51static struct mpic *mpics; 57static struct mpic *mpics;
52static struct mpic *mpic_primary; 58static struct mpic *mpic_primary;
53static DEFINE_RAW_SPINLOCK(mpic_lock); 59static DEFINE_RAW_SPINLOCK(mpic_lock);
@@ -920,6 +926,22 @@ int mpic_set_irq_type(struct irq_data *d, unsigned int flow_type)
920 return IRQ_SET_MASK_OK_NOCOPY; 926 return IRQ_SET_MASK_OK_NOCOPY;
921} 927}
922 928
929static int mpic_irq_set_wake(struct irq_data *d, unsigned int on)
930{
931 struct irq_desc *desc = container_of(d, struct irq_desc, irq_data);
932 struct mpic *mpic = mpic_from_irq_data(d);
933
934 if (!(mpic->flags & MPIC_FSL))
935 return -ENXIO;
936
937 if (on)
938 desc->action->flags |= IRQF_NO_SUSPEND;
939 else
940 desc->action->flags &= ~IRQF_NO_SUSPEND;
941
942 return 0;
943}
944
923void mpic_set_vector(unsigned int virq, unsigned int vector) 945void mpic_set_vector(unsigned int virq, unsigned int vector)
924{ 946{
925 struct mpic *mpic = mpic_from_irq(virq); 947 struct mpic *mpic = mpic_from_irq(virq);
@@ -957,6 +979,7 @@ static struct irq_chip mpic_irq_chip = {
957 .irq_unmask = mpic_unmask_irq, 979 .irq_unmask = mpic_unmask_irq,
958 .irq_eoi = mpic_end_irq, 980 .irq_eoi = mpic_end_irq,
959 .irq_set_type = mpic_set_irq_type, 981 .irq_set_type = mpic_set_irq_type,
982 .irq_set_wake = mpic_irq_set_wake,
960}; 983};
961 984
962#ifdef CONFIG_SMP 985#ifdef CONFIG_SMP
@@ -971,6 +994,7 @@ static struct irq_chip mpic_tm_chip = {
971 .irq_mask = mpic_mask_tm, 994 .irq_mask = mpic_mask_tm,
972 .irq_unmask = mpic_unmask_tm, 995 .irq_unmask = mpic_unmask_tm,
973 .irq_eoi = mpic_end_irq, 996 .irq_eoi = mpic_end_irq,
997 .irq_set_wake = mpic_irq_set_wake,
974}; 998};
975 999
976#ifdef CONFIG_MPIC_U3_HT_IRQS 1000#ifdef CONFIG_MPIC_U3_HT_IRQS
@@ -1173,10 +1197,33 @@ static struct irq_domain_ops mpic_host_ops = {
1173 .xlate = mpic_host_xlate, 1197 .xlate = mpic_host_xlate,
1174}; 1198};
1175 1199
1200static u32 fsl_mpic_get_version(struct mpic *mpic)
1201{
1202 u32 brr1;
1203
1204 if (!(mpic->flags & MPIC_FSL))
1205 return 0;
1206
1207 brr1 = _mpic_read(mpic->reg_type, &mpic->thiscpuregs,
1208 MPIC_FSL_BRR1);
1209
1210 return brr1 & MPIC_FSL_BRR1_VER;
1211}
1212
1176/* 1213/*
1177 * Exported functions 1214 * Exported functions
1178 */ 1215 */
1179 1216
1217u32 fsl_mpic_primary_get_version(void)
1218{
1219 struct mpic *mpic = mpic_primary;
1220
1221 if (mpic)
1222 return fsl_mpic_get_version(mpic);
1223
1224 return 0;
1225}
1226
1180struct mpic * __init mpic_alloc(struct device_node *node, 1227struct mpic * __init mpic_alloc(struct device_node *node,
1181 phys_addr_t phys_addr, 1228 phys_addr_t phys_addr,
1182 unsigned int flags, 1229 unsigned int flags,
@@ -1323,7 +1370,6 @@ struct mpic * __init mpic_alloc(struct device_node *node,
1323 mpic_map(mpic, mpic->paddr, &mpic->tmregs, MPIC_INFO(TIMER_BASE), 0x1000); 1370 mpic_map(mpic, mpic->paddr, &mpic->tmregs, MPIC_INFO(TIMER_BASE), 0x1000);
1324 1371
1325 if (mpic->flags & MPIC_FSL) { 1372 if (mpic->flags & MPIC_FSL) {
1326 u32 brr1;
1327 int ret; 1373 int ret;
1328 1374
1329 /* 1375 /*
@@ -1334,9 +1380,7 @@ struct mpic * __init mpic_alloc(struct device_node *node,
1334 mpic_map(mpic, mpic->paddr, &mpic->thiscpuregs, 1380 mpic_map(mpic, mpic->paddr, &mpic->thiscpuregs,
1335 MPIC_CPU_THISBASE, 0x1000); 1381 MPIC_CPU_THISBASE, 0x1000);
1336 1382
1337 brr1 = _mpic_read(mpic->reg_type, &mpic->thiscpuregs, 1383 fsl_version = fsl_mpic_get_version(mpic);
1338 MPIC_FSL_BRR1);
1339 fsl_version = brr1 & MPIC_FSL_BRR1_VER;
1340 1384
1341 /* Error interrupt mask register (EIMR) is required for 1385 /* Error interrupt mask register (EIMR) is required for
1342 * handling individual device error interrupts. EIMR 1386 * handling individual device error interrupts. EIMR
@@ -1526,9 +1570,7 @@ void __init mpic_init(struct mpic *mpic)
1526 mpic_cpu_write(MPIC_INFO(CPU_CURRENT_TASK_PRI), 0xf); 1570 mpic_cpu_write(MPIC_INFO(CPU_CURRENT_TASK_PRI), 0xf);
1527 1571
1528 if (mpic->flags & MPIC_FSL) { 1572 if (mpic->flags & MPIC_FSL) {
1529 u32 brr1 = _mpic_read(mpic->reg_type, &mpic->thiscpuregs, 1573 u32 version = fsl_mpic_get_version(mpic);
1530 MPIC_FSL_BRR1);
1531 u32 version = brr1 & MPIC_FSL_BRR1_VER;
1532 1574
1533 /* 1575 /*
1534 * Timer group B is present at the latest in MPIC 3.1 (e.g. 1576 * Timer group B is present at the latest in MPIC 3.1 (e.g.
@@ -1999,6 +2041,8 @@ static struct syscore_ops mpic_syscore_ops = {
1999static int mpic_init_sys(void) 2041static int mpic_init_sys(void)
2000{ 2042{
2001 register_syscore_ops(&mpic_syscore_ops); 2043 register_syscore_ops(&mpic_syscore_ops);
2044 subsys_system_register(&mpic_subsys, NULL);
2045
2002 return 0; 2046 return 0;
2003} 2047}
2004 2048
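
The two open-coded BRR1 reads above collapse into fsl_mpic_get_version(), and
fsl_mpic_primary_get_version() exports the same information beyond mpic.c. A
hypothetical consumer, with the version encoding assumed from the checks in
this file (0x0301 = MPIC 3.1, 0x0401 = MPIC 4.1, 0 = no FSL MPIC present):

	u32 ver = fsl_mpic_primary_get_version();

	if (ver >= 0x0400)	/* assumption: any 4.x controller */
		pr_info("FSL MPIC v%u.%u detected\n", ver >> 8, ver & 0xff);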
diff --git a/arch/powerpc/sysdev/mpic_timer.c b/arch/powerpc/sysdev/mpic_timer.c
new file mode 100644
index 000000000000..c06db92a4fb1
--- /dev/null
+++ b/arch/powerpc/sysdev/mpic_timer.c
@@ -0,0 +1,593 @@
1/*
2 * MPIC timer driver
3 *
4 * Copyright 2013 Freescale Semiconductor, Inc.
5 * Author: Dongsheng Wang <Dongsheng.Wang@freescale.com>
6 * Li Yang <leoli@freescale.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by the
10 * Free Software Foundation; either version 2 of the License, or (at your
11 * option) any later version.
12 */
13
14#include <linux/kernel.h>
15#include <linux/init.h>
16#include <linux/module.h>
17#include <linux/errno.h>
18#include <linux/mm.h>
19#include <linux/interrupt.h>
20#include <linux/slab.h>
21#include <linux/of.h>
22#include <linux/of_device.h>
23#include <linux/syscore_ops.h>
24#include <sysdev/fsl_soc.h>
25#include <asm/io.h>
26
27#include <asm/mpic_timer.h>
28
29#define FSL_GLOBAL_TIMER 0x1
30
31/* Clock Ratio
32 * Divide by 64 0x00000300
33 * Divide by 32 0x00000200
34 * Divide by 16 0x00000100
35 * Divide by 8 0x00000000 (Hardware default div)
36 */
37#define MPIC_TIMER_TCR_CLKDIV 0x00000300
38
39#define MPIC_TIMER_TCR_ROVR_OFFSET 24
40
41#define TIMER_STOP 0x80000000
42#define TIMERS_PER_GROUP 4
43#define MAX_TICKS (~0U >> 1)
44#define MAX_TICKS_CASCADE (~0U)
45#define TIMER_OFFSET(num) (1 << (TIMERS_PER_GROUP - 1 - num))
46
47/* tv_usec should be less than ONE_SECOND, otherwise use tv_sec */
48#define ONE_SECOND 1000000
49
50struct timer_regs {
51 u32 gtccr;
52 u32 res0[3];
53 u32 gtbcr;
54 u32 res1[3];
55 u32 gtvpr;
56 u32 res2[3];
57 u32 gtdr;
58 u32 res3[3];
59};
60
61struct cascade_priv {
62 u32 tcr_value; /* TCR register: CASC & ROVR value */
63 unsigned int cascade_map; /* cascade map */
64 unsigned int timer_num; /* cascade control timer */
65};
66
67struct timer_group_priv {
68 struct timer_regs __iomem *regs;
69 struct mpic_timer timer[TIMERS_PER_GROUP];
70 struct list_head node;
71 unsigned int timerfreq;
72 unsigned int idle;
73 unsigned int flags;
74 spinlock_t lock;
75 void __iomem *group_tcr;
76};
77
78static struct cascade_priv cascade_timer[] = {
79 /* cascade timer 0 and 1 */
80 {0x1, 0xc, 0x1},
81 /* cascade timer 1 and 2 */
82 {0x2, 0x6, 0x2},
83 /* cascade timer 2 and 3 */
84 {0x4, 0x3, 0x3}
85};
86
87static LIST_HEAD(timer_group_list);
88
89static void convert_ticks_to_time(struct timer_group_priv *priv,
90 const u64 ticks, struct timeval *time)
91{
92 u64 tmp_sec;
93
94 time->tv_sec = (__kernel_time_t)div_u64(ticks, priv->timerfreq);
95 tmp_sec = (u64)time->tv_sec * (u64)priv->timerfreq;
96
97 time->tv_usec = (__kernel_suseconds_t)
98 div_u64((ticks - tmp_sec) * 1000000, priv->timerfreq);
99
100 return;
101}
102
103/* the time set by the user is converted to "ticks" */
104static int convert_time_to_ticks(struct timer_group_priv *priv,
105 const struct timeval *time, u64 *ticks)
106{
107 u64 max_value; /* prevent u64 overflow */
108 u64 tmp = 0;
109
110 u64 tmp_sec;
111 u64 tmp_ms;
112 u64 tmp_us;
113
114 max_value = div_u64(ULLONG_MAX, priv->timerfreq);
115
116 if (time->tv_sec > max_value ||
117 (time->tv_sec == max_value && time->tv_usec > 0))
118 return -EINVAL;
119
120 tmp_sec = (u64)time->tv_sec * (u64)priv->timerfreq;
121 tmp += tmp_sec;
122
123 tmp_ms = time->tv_usec / 1000;
124 tmp_ms = div_u64((u64)tmp_ms * (u64)priv->timerfreq, 1000);
125 tmp += tmp_ms;
126
127 tmp_us = time->tv_usec % 1000;
128 tmp_us = div_u64((u64)tmp_us * (u64)priv->timerfreq, 1000000);
129 tmp += tmp_us;
130
131 *ticks = tmp;
132
133 return 0;
134}
135
136/* detect whether there is a cascade timer available */
137static struct mpic_timer *detect_idle_cascade_timer(
138 struct timer_group_priv *priv)
139{
140 struct cascade_priv *casc_priv;
141 unsigned int map;
142 unsigned int array_size = ARRAY_SIZE(cascade_timer);
143 unsigned int num;
144 unsigned int i;
145 unsigned long flags;
146
147 casc_priv = cascade_timer;
148 for (i = 0; i < array_size; i++) {
149 spin_lock_irqsave(&priv->lock, flags);
150 map = casc_priv->cascade_map & priv->idle;
151 if (map == casc_priv->cascade_map) {
152 num = casc_priv->timer_num;
153 priv->timer[num].cascade_handle = casc_priv;
154
155 /* set timer busy */
156 priv->idle &= ~casc_priv->cascade_map;
157 spin_unlock_irqrestore(&priv->lock, flags);
158 return &priv->timer[num];
159 }
160 spin_unlock_irqrestore(&priv->lock, flags);
161 casc_priv++;
162 }
163
164 return NULL;
165}
166
167static int set_cascade_timer(struct timer_group_priv *priv, u64 ticks,
168 unsigned int num)
169{
170 struct cascade_priv *casc_priv;
171 u32 tcr;
172 u32 tmp_ticks;
173 u32 rem_ticks;
174
175 /* set group tcr reg for cascade */
176 casc_priv = priv->timer[num].cascade_handle;
177 if (!casc_priv)
178 return -EINVAL;
179
180 tcr = casc_priv->tcr_value |
181 (casc_priv->tcr_value << MPIC_TIMER_TCR_ROVR_OFFSET);
182 setbits32(priv->group_tcr, tcr);
183
184 tmp_ticks = div_u64_rem(ticks, MAX_TICKS_CASCADE, &rem_ticks);
185
186 out_be32(&priv->regs[num].gtccr, 0);
187 out_be32(&priv->regs[num].gtbcr, tmp_ticks | TIMER_STOP);
188
189 out_be32(&priv->regs[num - 1].gtccr, 0);
190 out_be32(&priv->regs[num - 1].gtbcr, rem_ticks);
191
192 return 0;
193}
194
195static struct mpic_timer *get_cascade_timer(struct timer_group_priv *priv,
196 u64 ticks)
197{
198 struct mpic_timer *allocated_timer;
199
200 /* Two cascade timers: Support the maximum time */
201 const u64 max_ticks = (u64)MAX_TICKS * (u64)MAX_TICKS_CASCADE;
202 int ret;
203
204 if (ticks > max_ticks)
205 return NULL;
206
207 /* detect idle timer */
208 allocated_timer = detect_idle_cascade_timer(priv);
209 if (!allocated_timer)
210 return NULL;
211
212 /* set ticks to timer */
213 ret = set_cascade_timer(priv, ticks, allocated_timer->num);
214 if (ret < 0)
215 return NULL;
216
217 return allocated_timer;
218}
219
220static struct mpic_timer *get_timer(const struct timeval *time)
221{
222 struct timer_group_priv *priv;
223 struct mpic_timer *timer;
224
225 u64 ticks;
226 unsigned int num;
227 unsigned int i;
228 unsigned long flags;
229 int ret;
230
231 list_for_each_entry(priv, &timer_group_list, node) {
232 ret = convert_time_to_ticks(priv, time, &ticks);
233 if (ret < 0)
234 return NULL;
235
236 if (ticks > MAX_TICKS) {
237 if (!(priv->flags & FSL_GLOBAL_TIMER))
238 return NULL;
239
240 timer = get_cascade_timer(priv, ticks);
241 if (!timer)
242 continue;
243
244 return timer;
245 }
246
247 for (i = 0; i < TIMERS_PER_GROUP; i++) {
248 /* one timer: Reverse allocation */
249 num = TIMERS_PER_GROUP - 1 - i;
250 spin_lock_irqsave(&priv->lock, flags);
251 if (priv->idle & (1 << i)) {
252 /* set timer busy */
253 priv->idle &= ~(1 << i);
254 /* set ticks & stop timer */
255 out_be32(&priv->regs[num].gtbcr,
256 ticks | TIMER_STOP);
257 out_be32(&priv->regs[num].gtccr, 0);
258 priv->timer[num].cascade_handle = NULL;
259 spin_unlock_irqrestore(&priv->lock, flags);
260 return &priv->timer[num];
261 }
262 spin_unlock_irqrestore(&priv->lock, flags);
263 }
264 }
265
266 return NULL;
267}
268
269/**
270 * mpic_start_timer - start hardware timer
271 * @handle: the timer to be started.
272 *
 273 * It will invoke the ->fn(->dev) callback from the hardware interrupt
 274 * once the requested interval has elapsed.
275 */
276void mpic_start_timer(struct mpic_timer *handle)
277{
278 struct timer_group_priv *priv = container_of(handle,
279 struct timer_group_priv, timer[handle->num]);
280
281 clrbits32(&priv->regs[handle->num].gtbcr, TIMER_STOP);
282}
283EXPORT_SYMBOL(mpic_start_timer);
284
285/**
286 * mpic_stop_timer - stop hardware timer
 287 * @handle: the timer to be stopped
288 *
 289 * The timer generates interrupts periodically until the user stops it.
290 */
291void mpic_stop_timer(struct mpic_timer *handle)
292{
293 struct timer_group_priv *priv = container_of(handle,
294 struct timer_group_priv, timer[handle->num]);
295 struct cascade_priv *casc_priv;
296
297 setbits32(&priv->regs[handle->num].gtbcr, TIMER_STOP);
298
299 casc_priv = priv->timer[handle->num].cascade_handle;
300 if (casc_priv) {
301 out_be32(&priv->regs[handle->num].gtccr, 0);
302 out_be32(&priv->regs[handle->num - 1].gtccr, 0);
303 } else {
304 out_be32(&priv->regs[handle->num].gtccr, 0);
305 }
306}
307EXPORT_SYMBOL(mpic_stop_timer);
308
309/**
 310 * mpic_get_remain_time - get the timer's remaining time
 311 * @handle: the timer to be queried.
 312 * @time: filled in with the remaining time
 313 *
 314 * Query how much time is left before the timer expires.
315 */
316void mpic_get_remain_time(struct mpic_timer *handle, struct timeval *time)
317{
318 struct timer_group_priv *priv = container_of(handle,
319 struct timer_group_priv, timer[handle->num]);
320 struct cascade_priv *casc_priv;
321
322 u64 ticks;
323 u32 tmp_ticks;
324
325 casc_priv = priv->timer[handle->num].cascade_handle;
326 if (casc_priv) {
327 tmp_ticks = in_be32(&priv->regs[handle->num].gtccr);
328 ticks = ((u64)tmp_ticks & UINT_MAX) * (u64)MAX_TICKS_CASCADE;
329 tmp_ticks = in_be32(&priv->regs[handle->num - 1].gtccr);
330 ticks += tmp_ticks;
331 } else {
332 ticks = in_be32(&priv->regs[handle->num].gtccr);
333 }
334
335 convert_ticks_to_time(priv, ticks, time);
336}
337EXPORT_SYMBOL(mpic_get_remain_time);
338
339/**
340 * mpic_free_timer - free hardware timer
341 * @handle: the timer to be removed.
342 *
343 * Free the timer.
344 *
 345 * Note: cannot be used in interrupt context.
346 */
347void mpic_free_timer(struct mpic_timer *handle)
348{
349 struct timer_group_priv *priv = container_of(handle,
350 struct timer_group_priv, timer[handle->num]);
351
352 struct cascade_priv *casc_priv;
353 unsigned long flags;
354
355 mpic_stop_timer(handle);
356
357 casc_priv = priv->timer[handle->num].cascade_handle;
358
359 free_irq(priv->timer[handle->num].irq, priv->timer[handle->num].dev);
360
361 spin_lock_irqsave(&priv->lock, flags);
362 if (casc_priv) {
363 u32 tcr;
364 tcr = casc_priv->tcr_value | (casc_priv->tcr_value <<
365 MPIC_TIMER_TCR_ROVR_OFFSET);
366 clrbits32(priv->group_tcr, tcr);
367 priv->idle |= casc_priv->cascade_map;
368 priv->timer[handle->num].cascade_handle = NULL;
369 } else {
370 priv->idle |= TIMER_OFFSET(handle->num);
371 }
372 spin_unlock_irqrestore(&priv->lock, flags);
373}
374EXPORT_SYMBOL(mpic_free_timer);
375
376/**
377 * mpic_request_timer - get a hardware timer
378 * @fn: interrupt handler function
379 * @dev: callback function of the data
380 * @time: time for timer
381 *
382 * This executes the "request_irq", returning NULL
383 * else "handle" on success.
384 */
385struct mpic_timer *mpic_request_timer(irq_handler_t fn, void *dev,
386 const struct timeval *time)
387{
388 struct mpic_timer *allocated_timer;
389 int ret;
390
391 if (list_empty(&timer_group_list))
392 return NULL;
393
394 if (!(time->tv_sec + time->tv_usec) ||
395 time->tv_sec < 0 || time->tv_usec < 0)
396 return NULL;
397
398 if (time->tv_usec > ONE_SECOND)
399 return NULL;
400
401 allocated_timer = get_timer(time);
402 if (!allocated_timer)
403 return NULL;
404
405 ret = request_irq(allocated_timer->irq, fn,
406 IRQF_TRIGGER_LOW, "global-timer", dev);
407 if (ret) {
408 mpic_free_timer(allocated_timer);
409 return NULL;
410 }
411
412 allocated_timer->dev = dev;
413
414 return allocated_timer;
415}
416EXPORT_SYMBOL(mpic_request_timer);
417
418static int timer_group_get_freq(struct device_node *np,
419 struct timer_group_priv *priv)
420{
421 u32 div;
422
423 if (priv->flags & FSL_GLOBAL_TIMER) {
424 struct device_node *dn;
425
426 dn = of_find_compatible_node(NULL, NULL, "fsl,mpic");
427 if (dn) {
428 of_property_read_u32(dn, "clock-frequency",
429 &priv->timerfreq);
430 of_node_put(dn);
431 }
432 }
433
434 if (priv->timerfreq <= 0)
435 return -EINVAL;
436
437 if (priv->flags & FSL_GLOBAL_TIMER) {
438 div = (1 << (MPIC_TIMER_TCR_CLKDIV >> 8)) * 8;
439 priv->timerfreq /= div;
440 }
441
442 return 0;
443}
444
445static int timer_group_get_irq(struct device_node *np,
446 struct timer_group_priv *priv)
447{
448 const u32 all_timer[] = { 0, TIMERS_PER_GROUP };
449 const u32 *p;
450 u32 offset;
451 u32 count;
452
453 unsigned int i;
454 unsigned int j;
455 unsigned int irq_index = 0;
456 unsigned int irq;
457 int len;
458
459 p = of_get_property(np, "fsl,available-ranges", &len);
460 if (p && len % (2 * sizeof(u32)) != 0) {
461 pr_err("%s: malformed available-ranges property.\n",
462 np->full_name);
463 return -EINVAL;
464 }
465
466 if (!p) {
467 p = all_timer;
468 len = sizeof(all_timer);
469 }
470
471 len /= 2 * sizeof(u32);
472
473 for (i = 0; i < len; i++) {
474 offset = p[i * 2];
475 count = p[i * 2 + 1];
476 for (j = 0; j < count; j++) {
477 irq = irq_of_parse_and_map(np, irq_index);
478 if (!irq) {
479 pr_err("%s: irq parse and map failed.\n",
480 np->full_name);
481 return -EINVAL;
482 }
483
484 /* Set timer idle */
485 priv->idle |= TIMER_OFFSET((offset + j));
486 priv->timer[offset + j].irq = irq;
487 priv->timer[offset + j].num = offset + j;
488 irq_index++;
489 }
490 }
491
492 return 0;
493}
494
495static void timer_group_init(struct device_node *np)
496{
497 struct timer_group_priv *priv;
498 unsigned int i = 0;
499 int ret;
500
501 priv = kzalloc(sizeof(struct timer_group_priv), GFP_KERNEL);
502 if (!priv) {
503 pr_err("%s: cannot allocate memory for group.\n",
504 np->full_name);
505 return;
506 }
507
508 if (of_device_is_compatible(np, "fsl,mpic-global-timer"))
509 priv->flags |= FSL_GLOBAL_TIMER;
510
511 priv->regs = of_iomap(np, i++);
512 if (!priv->regs) {
513 pr_err("%s: cannot ioremap timer register address.\n",
514 np->full_name);
515 goto out;
516 }
517
518 if (priv->flags & FSL_GLOBAL_TIMER) {
519 priv->group_tcr = of_iomap(np, i++);
520 if (!priv->group_tcr) {
521 pr_err("%s: cannot ioremap tcr address.\n",
522 np->full_name);
523 goto out;
524 }
525 }
526
527 ret = timer_group_get_freq(np, priv);
528 if (ret < 0) {
529 pr_err("%s: cannot get timer frequency.\n", np->full_name);
530 goto out;
531 }
532
533 ret = timer_group_get_irq(np, priv);
534 if (ret < 0) {
535 pr_err("%s: cannot get timer irqs.\n", np->full_name);
536 goto out;
537 }
538
539 spin_lock_init(&priv->lock);
540
541 /* Init FSL timer hardware */
542 if (priv->flags & FSL_GLOBAL_TIMER)
543 setbits32(priv->group_tcr, MPIC_TIMER_TCR_CLKDIV);
544
545 list_add_tail(&priv->node, &timer_group_list);
546
547 return;
548
549out:
550 if (priv->regs)
551 iounmap(priv->regs);
552
553 if (priv->group_tcr)
554 iounmap(priv->group_tcr);
555
556 kfree(priv);
557}
558
559static void mpic_timer_resume(void)
560{
561 struct timer_group_priv *priv;
562
563 list_for_each_entry(priv, &timer_group_list, node) {
564 /* Init FSL timer hardware */
565 if (priv->flags & FSL_GLOBAL_TIMER)
566 setbits32(priv->group_tcr, MPIC_TIMER_TCR_CLKDIV);
567 }
568}
569
570static const struct of_device_id mpic_timer_ids[] = {
571 { .compatible = "fsl,mpic-global-timer", },
572 {},
573};
574
575static struct syscore_ops mpic_timer_syscore_ops = {
576 .resume = mpic_timer_resume,
577};
578
579static int __init mpic_timer_init(void)
580{
581 struct device_node *np = NULL;
582
583 for_each_matching_node(np, mpic_timer_ids)
584 timer_group_init(np);
585
586 register_syscore_ops(&mpic_timer_syscore_ops);
587
588 if (list_empty(&timer_group_list))
589 return -ENODEV;
590
591 return 0;
592}
593subsys_initcall(mpic_timer_init);
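
A sketch of an in-kernel consumer of this API (the demo_* names are
illustrative, not part of the patch):

	#include <linux/interrupt.h>
	#include <asm/mpic_timer.h>

	static struct mpic_timer *demo_timer;

	static irqreturn_t demo_timer_irq(int irq, void *dev_id)
	{
		pr_info("mpic timer fired\n");
		return IRQ_HANDLED;
	}

	static int demo_arm(void)
	{
		struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };

		demo_timer = mpic_request_timer(demo_timer_irq, NULL, &tv);
		if (!demo_timer)
			return -ENODEV;

		mpic_start_timer(demo_timer);	/* interrupts every ~5 seconds */
		return 0;
	}

	static void demo_disarm(void)
	{
		mpic_stop_timer(demo_timer);
		mpic_free_timer(demo_timer);	/* not from interrupt context */
	}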
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0ea4e591fa78..75fb726de91f 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1364,10 +1364,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
1364#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1364#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1365 1365
1366#define __HAVE_ARCH_PGTABLE_DEPOSIT 1366#define __HAVE_ARCH_PGTABLE_DEPOSIT
1367extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); 1367extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1368 pgtable_t pgtable);
1368 1369
1369#define __HAVE_ARCH_PGTABLE_WITHDRAW 1370#define __HAVE_ARCH_PGTABLE_WITHDRAW
1370extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); 1371extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
1371 1372
1372static inline int pmd_trans_splitting(pmd_t pmd) 1373static inline int pmd_trans_splitting(pmd_t pmd)
1373{ 1374{
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 17bf4d3d303a..a8154a1a2c94 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -1165,7 +1165,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
1165 } 1165 }
1166} 1166}
1167 1167
1168void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) 1168void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1169 pgtable_t pgtable)
1169{ 1170{
1170 struct list_head *lh = (struct list_head *) pgtable; 1171 struct list_head *lh = (struct list_head *) pgtable;
1171 1172
@@ -1179,7 +1180,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
1179 mm->pmd_huge_pte = pgtable; 1180 mm->pmd_huge_pte = pgtable;
1180} 1181}
1181 1182
1182pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) 1183pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1183{ 1184{
1184 struct list_head *lh; 1185 struct list_head *lh;
1185 pgtable_t pgtable; 1186 pgtable_t pgtable;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 79c214efa3fe..36760317814f 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -853,10 +853,11 @@ extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
853 pmd_t *pmd); 853 pmd_t *pmd);
854 854
855#define __HAVE_ARCH_PGTABLE_DEPOSIT 855#define __HAVE_ARCH_PGTABLE_DEPOSIT
856extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable); 856extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
857 pgtable_t pgtable);
857 858
858#define __HAVE_ARCH_PGTABLE_WITHDRAW 859#define __HAVE_ARCH_PGTABLE_WITHDRAW
859extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm); 860extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
860#endif 861#endif
861 862
862/* Encode and de-code a swap entry */ 863/* Encode and de-code a swap entry */
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 37e7bc4c95b3..7a91f288c708 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -188,7 +188,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
188 } 188 }
189} 189}
190 190
191void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) 191void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
192 pgtable_t pgtable)
192{ 193{
193 struct list_head *lh = (struct list_head *) pgtable; 194 struct list_head *lh = (struct list_head *) pgtable;
194 195
@@ -202,7 +203,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
202 mm->pmd_huge_pte = pgtable; 203 mm->pmd_huge_pte = pgtable;
203} 204}
204 205
205pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) 206pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
206{ 207{
207 struct list_head *lh; 208 struct list_head *lh;
208 pgtable_t pgtable; 209 pgtable_t pgtable;
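
The s390 and sparc implementations ignore the new pmdp argument; the
signature change exists so an architecture can key the deposited page table
off the PMD entry itself rather than a single per-mm slot. Generic THP
callers change accordingly, roughly (illustrative, following
mm/huge_memory.c):

	pgtable_trans_huge_deposit(mm, pmd, pgtable);	/* when mapping a THP */
	pgtable = pgtable_trans_huge_withdraw(mm, pmd);	/* when splitting/zapping */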
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index f7b3b39e94fc..88d0b0f9f92b 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -935,7 +935,7 @@ static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, int *count,
935 struct timespec *time, char **buf, 935 struct timespec *time, char **buf,
936 struct pstore_info *psi); 936 struct pstore_info *psi);
937static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason, 937static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason,
938 u64 *id, unsigned int part, int count, 938 u64 *id, unsigned int part, int count, size_t hsize,
939 size_t size, struct pstore_info *psi); 939 size_t size, struct pstore_info *psi);
940static int erst_clearer(enum pstore_type_id type, u64 id, int count, 940static int erst_clearer(enum pstore_type_id type, u64 id, int count,
941 struct timespec time, struct pstore_info *psi); 941 struct timespec time, struct pstore_info *psi);
@@ -1055,7 +1055,7 @@ out:
1055} 1055}
1056 1056
1057static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason, 1057static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason,
1058 u64 *id, unsigned int part, int count, 1058 u64 *id, unsigned int part, int count, size_t hsize,
1059 size_t size, struct pstore_info *psi) 1059 size_t size, struct pstore_info *psi)
1060{ 1060{
1061 struct cper_pstore_record *rcd = (struct cper_pstore_record *) 1061 struct cper_pstore_record *rcd = (struct cper_pstore_record *)
diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c
index 91864ad200ff..73de5a9c2247 100644
--- a/drivers/firmware/efi/efi-pstore.c
+++ b/drivers/firmware/efi/efi-pstore.c
@@ -103,7 +103,7 @@ static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type,
103 103
104static int efi_pstore_write(enum pstore_type_id type, 104static int efi_pstore_write(enum pstore_type_id type,
105 enum kmsg_dump_reason reason, u64 *id, 105 enum kmsg_dump_reason reason, u64 *id,
106 unsigned int part, int count, size_t size, 106 unsigned int part, int count, size_t hsize, size_t size,
107 struct pstore_info *psi) 107 struct pstore_info *psi)
108{ 108{
109 char name[DUMP_NAME_LEN]; 109 char name[DUMP_NAME_LEN];
diff --git a/drivers/i2c/busses/i2c-cpm.c b/drivers/i2c/busses/i2c-cpm.c
index 3823623baa48..9e6002108720 100644
--- a/drivers/i2c/busses/i2c-cpm.c
+++ b/drivers/i2c/busses/i2c-cpm.c
@@ -338,6 +338,14 @@ static int cpm_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
338 tptr = 0; 338 tptr = 0;
339 rptr = 0; 339 rptr = 0;
340 340
341 /*
 342 * If there was a collision in the last i2c transaction,
 343 * set I2COM_MASTER again, as it was cleared by the collision.
344 */
345 if (in_be16(&tbdf->cbd_sc) & BD_SC_CL) {
346 out_8(&cpm->i2c_reg->i2com, I2COM_MASTER);
347 }
348
341 while (tptr < num) { 349 while (tptr < num) {
342 pmsg = &msgs[tptr]; 350 pmsg = &msgs[tptr];
343 dev_dbg(&adap->dev, "R: %d T: %d\n", rptr, tptr); 351 dev_dbg(&adap->dev, "R: %d T: %d\n", rptr, tptr);
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index c332fb98480d..01730b2b9954 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -261,4 +261,12 @@ config SHMOBILE_IOMMU_L1SIZE
261 default 256 if SHMOBILE_IOMMU_ADDRSIZE_64MB 261 default 256 if SHMOBILE_IOMMU_ADDRSIZE_64MB
262 default 128 if SHMOBILE_IOMMU_ADDRSIZE_32MB 262 default 128 if SHMOBILE_IOMMU_ADDRSIZE_32MB
263 263
264config SPAPR_TCE_IOMMU
265 bool "sPAPR TCE IOMMU Support"
266 depends on PPC_POWERNV || PPC_PSERIES
267 select IOMMU_API
268 help
 269 Enables the bits of the IOMMU API required by VFIO. The iommu_ops
 270 callbacks are not implemented, as VFIO does not need them.
271
264endif # IOMMU_SUPPORT 272endif # IOMMU_SUPPORT
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index b026896206ca..04a50498f257 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -697,7 +697,7 @@ static ssize_t adb_read(struct file *file, char __user *buf,
697 int ret = 0; 697 int ret = 0;
698 struct adbdev_state *state = file->private_data; 698 struct adbdev_state *state = file->private_data;
699 struct adb_request *req; 699 struct adb_request *req;
700 wait_queue_t wait = __WAITQUEUE_INITIALIZER(wait,current); 700 DECLARE_WAITQUEUE(wait,current);
701 unsigned long flags; 701 unsigned long flags;
702 702
703 if (count < 2) 703 if (count < 2)
diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c
index 6a82388505f0..80d30e8e3389 100644
--- a/drivers/macintosh/mac_hid.c
+++ b/drivers/macintosh/mac_hid.c
@@ -181,7 +181,7 @@ static void mac_hid_stop_emulation(void)
181 mac_hid_destroy_emumouse(); 181 mac_hid_destroy_emumouse();
182} 182}
183 183
184static int mac_hid_toggle_emumouse(ctl_table *table, int write, 184static int mac_hid_toggle_emumouse(struct ctl_table *table, int write,
185 void __user *buffer, size_t *lenp, 185 void __user *buffer, size_t *lenp,
186 loff_t *ppos) 186 loff_t *ppos)
187{ 187{
@@ -214,7 +214,7 @@ static int mac_hid_toggle_emumouse(ctl_table *table, int write,
214} 214}
215 215
216/* file(s) in /proc/sys/dev/mac_hid */ 216/* file(s) in /proc/sys/dev/mac_hid */
217static ctl_table mac_hid_files[] = { 217static struct ctl_table mac_hid_files[] = {
218 { 218 {
219 .procname = "mouse_button_emulation", 219 .procname = "mouse_button_emulation",
220 .data = &mouse_emulate_buttons, 220 .data = &mouse_emulate_buttons,
@@ -240,7 +240,7 @@ static ctl_table mac_hid_files[] = {
240}; 240};
241 241
242/* dir in /proc/sys/dev */ 242/* dir in /proc/sys/dev */
243static ctl_table mac_hid_dir[] = { 243static struct ctl_table mac_hid_dir[] = {
244 { 244 {
245 .procname = "mac_hid", 245 .procname = "mac_hid",
246 .maxlen = 0, 246 .maxlen = 0,
@@ -251,7 +251,7 @@ static ctl_table mac_hid_dir[] = {
251}; 251};
252 252
253/* /proc/sys/dev itself, in case that is not there yet */ 253/* /proc/sys/dev itself, in case that is not there yet */
254static ctl_table mac_hid_root_dir[] = { 254static struct ctl_table mac_hid_root_dir[] = {
255 { 255 {
256 .procname = "dev", 256 .procname = "dev",
257 .maxlen = 0, 257 .maxlen = 0,
diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index 86511c570dd8..d61f271d2207 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -259,7 +259,7 @@ cuda_probe(void)
259 } while (0) 259 } while (0)
260 260
261static int 261static int
262cuda_init_via(void) 262__init cuda_init_via(void)
263{ 263{
264 out_8(&via[DIRB], (in_8(&via[DIRB]) | TACK | TIP) & ~TREQ); /* TACK & TIP out */ 264 out_8(&via[DIRB], (in_8(&via[DIRB]) | TACK | TIP) & ~TREQ); /* TACK & TIP out */
265 out_8(&via[B], in_8(&via[B]) | TACK | TIP); /* negate them */ 265 out_8(&via[B], in_8(&via[B]) | TACK | TIP); /* negate them */
diff --git a/drivers/macintosh/windfarm_pm121.c b/drivers/macintosh/windfarm_pm121.c
index af605e915d41..7fe58b0ae8b4 100644
--- a/drivers/macintosh/windfarm_pm121.c
+++ b/drivers/macintosh/windfarm_pm121.c
@@ -276,6 +276,7 @@ static const char *loop_names[N_LOOPS] = {
276 276
277static unsigned int pm121_failure_state; 277static unsigned int pm121_failure_state;
278static int pm121_readjust, pm121_skipping; 278static int pm121_readjust, pm121_skipping;
279static bool pm121_overtemp;
279static s32 average_power; 280static s32 average_power;
280 281
281struct pm121_correction { 282struct pm121_correction {
@@ -847,6 +848,7 @@ static void pm121_tick(void)
847 if (new_failure & FAILURE_OVERTEMP) { 848 if (new_failure & FAILURE_OVERTEMP) {
848 wf_set_overtemp(); 849 wf_set_overtemp();
849 pm121_skipping = 2; 850 pm121_skipping = 2;
851 pm121_overtemp = true;
850 } 852 }
851 853
852 /* We only clear the overtemp condition if overtemp is cleared 854 /* We only clear the overtemp condition if overtemp is cleared
@@ -855,8 +857,10 @@ static void pm121_tick(void)
855 * the control loop levels, but we don't want to keep it clear 857 * the control loop levels, but we don't want to keep it clear
856 * here in this case 858 * here in this case
857 */ 859 */
858 if (new_failure == 0 && last_failure & FAILURE_OVERTEMP) 860 if (!pm121_failure_state && pm121_overtemp) {
859 wf_clear_overtemp(); 861 wf_clear_overtemp();
862 pm121_overtemp = false;
863 }
860} 864}
861 865
862 866
diff --git a/drivers/macintosh/windfarm_pm81.c b/drivers/macintosh/windfarm_pm81.c
index f84933ff3298..2a5e1b15b1d2 100644
--- a/drivers/macintosh/windfarm_pm81.c
+++ b/drivers/macintosh/windfarm_pm81.c
@@ -149,6 +149,7 @@ static int wf_smu_all_controls_ok, wf_smu_all_sensors_ok, wf_smu_started;
149 149
150static unsigned int wf_smu_failure_state; 150static unsigned int wf_smu_failure_state;
151static int wf_smu_readjust, wf_smu_skipping; 151static int wf_smu_readjust, wf_smu_skipping;
152static bool wf_smu_overtemp;
152 153
153/* 154/*
154 * ****** System Fans Control Loop ****** 155 * ****** System Fans Control Loop ******
@@ -593,6 +594,7 @@ static void wf_smu_tick(void)
593 if (new_failure & FAILURE_OVERTEMP) { 594 if (new_failure & FAILURE_OVERTEMP) {
594 wf_set_overtemp(); 595 wf_set_overtemp();
595 wf_smu_skipping = 2; 596 wf_smu_skipping = 2;
597 wf_smu_overtemp = true;
596 } 598 }
597 599
598 /* We only clear the overtemp condition if overtemp is cleared 600 /* We only clear the overtemp condition if overtemp is cleared
@@ -601,8 +603,10 @@ static void wf_smu_tick(void)
601 * the control loop levels, but we don't want to keep it clear 603 * the control loop levels, but we don't want to keep it clear
602 * here in this case 604 * here in this case
603 */ 605 */
604 if (new_failure == 0 && last_failure & FAILURE_OVERTEMP) 606 if (!wf_smu_failure_state && wf_smu_overtemp) {
605 wf_clear_overtemp(); 607 wf_clear_overtemp();
608 wf_smu_overtemp = false;
609 }
606} 610}
607 611
608static void wf_smu_new_control(struct wf_control *ct) 612static void wf_smu_new_control(struct wf_control *ct)
diff --git a/drivers/macintosh/windfarm_pm91.c b/drivers/macintosh/windfarm_pm91.c
index 2eb484f213c8..a8ac66cd3b13 100644
--- a/drivers/macintosh/windfarm_pm91.c
+++ b/drivers/macintosh/windfarm_pm91.c
@@ -76,6 +76,7 @@ static struct wf_control *cpufreq_clamp;
76 76
77/* Set to kick the control loop into life */ 77/* Set to kick the control loop into life */
78static int wf_smu_all_controls_ok, wf_smu_all_sensors_ok, wf_smu_started; 78static int wf_smu_all_controls_ok, wf_smu_all_sensors_ok, wf_smu_started;
79static bool wf_smu_overtemp;
79 80
80/* Failure handling.. could be nicer */ 81/* Failure handling.. could be nicer */
81#define FAILURE_FAN 0x01 82#define FAILURE_FAN 0x01
@@ -517,6 +518,7 @@ static void wf_smu_tick(void)
517 if (new_failure & FAILURE_OVERTEMP) { 518 if (new_failure & FAILURE_OVERTEMP) {
518 wf_set_overtemp(); 519 wf_set_overtemp();
519 wf_smu_skipping = 2; 520 wf_smu_skipping = 2;
521 wf_smu_overtemp = true;
520 } 522 }
521 523
522 /* We only clear the overtemp condition if overtemp is cleared 524 /* We only clear the overtemp condition if overtemp is cleared
@@ -525,8 +527,10 @@ static void wf_smu_tick(void)
525 * the control loop levels, but we don't want to keep it clear 527 * the control loop levels, but we don't want to keep it clear
526 * here in this case 528 * here in this case
527 */ 529 */
528 if (new_failure == 0 && last_failure & FAILURE_OVERTEMP) 530 if (!wf_smu_failure_state && wf_smu_overtemp) {
529 wf_clear_overtemp(); 531 wf_clear_overtemp();
532 wf_smu_overtemp = false;
533 }
530} 534}
531 535
532 536
diff --git a/drivers/macintosh/windfarm_smu_sat.c b/drivers/macintosh/windfarm_smu_sat.c
index d87f5ee04ca9..ad6223e88340 100644
--- a/drivers/macintosh/windfarm_smu_sat.c
+++ b/drivers/macintosh/windfarm_smu_sat.c
@@ -343,7 +343,6 @@ static int wf_sat_remove(struct i2c_client *client)
343 wf_unregister_sensor(&sens->sens); 343 wf_unregister_sensor(&sens->sens);
344 } 344 }
345 sat->i2c = NULL; 345 sat->i2c = NULL;
346 i2c_set_clientdata(client, NULL);
347 kref_put(&sat->ref, wf_sat_release); 346 kref_put(&sat->ref, wf_sat_release);
348 347
349 return 0; 348 return 0;
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec0abd1..26b3d9d1409f 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
3 depends on VFIO 3 depends on VFIO
4 default n 4 default n
5 5
6config VFIO_IOMMU_SPAPR_TCE
7 tristate
8 depends on VFIO && SPAPR_TCE_IOMMU
9 default n
10
6menuconfig VFIO 11menuconfig VFIO
7 tristate "VFIO Non-Privileged userspace driver framework" 12 tristate "VFIO Non-Privileged userspace driver framework"
8 depends on IOMMU_API 13 depends on IOMMU_API
9 select VFIO_IOMMU_TYPE1 if X86 14 select VFIO_IOMMU_TYPE1 if X86
15 select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
10 help 16 help
11 VFIO provides a framework for secure userspace device drivers. 17 VFIO provides a framework for secure userspace device drivers.
12 See Documentation/vfio.txt for more details. 18 See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a0e38b..72bfabc8629e 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
1obj-$(CONFIG_VFIO) += vfio.o 1obj-$(CONFIG_VFIO) += vfio.o
2obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o 2obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
3obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
3obj-$(CONFIG_VFIO_PCI) += pci/ 4obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 6d78736563de..259ad282ae5d 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1415,6 +1415,7 @@ static int __init vfio_init(void)
1415 * drivers. 1415 * drivers.
1416 */ 1416 */
1417 request_module_nowait("vfio_iommu_type1"); 1417 request_module_nowait("vfio_iommu_type1");
1418 request_module_nowait("vfio_iommu_spapr_tce");
1418 1419
1419 return 0; 1420 return 0;
1420 1421
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 000000000000..bdae7a04af75
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,377 @@
1/*
2 * VFIO: IOMMU DMA mapping support for TCE on POWER
3 *
4 * Copyright (C) 2013 IBM Corp. All rights reserved.
5 * Author: Alexey Kardashevskiy <aik@ozlabs.ru>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio_iommu_type1.c:
12 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
13 * Author: Alex Williamson <alex.williamson@redhat.com>
14 */
15
16#include <linux/module.h>
17#include <linux/pci.h>
18#include <linux/slab.h>
19#include <linux/uaccess.h>
20#include <linux/err.h>
21#include <linux/vfio.h>
22#include <asm/iommu.h>
23#include <asm/tce.h>
24
25#define DRIVER_VERSION "0.1"
26#define DRIVER_AUTHOR "aik@ozlabs.ru"
27#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
28
29static void tce_iommu_detach_group(void *iommu_data,
30 struct iommu_group *iommu_group);
31
32/*
33 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
34 *
35 * This code handles mapping and unmapping of user data buffers
36 * into DMA'ble space using the IOMMU
37 */
38
39/*
40 * The container descriptor supports only a single group per container.
 41 * This is required by the API, as the container is not supplied with the
 42 * IOMMU group at the moment of initialization.
43 */
44struct tce_container {
45 struct mutex lock;
46 struct iommu_table *tbl;
47 bool enabled;
48};
49
50static int tce_iommu_enable(struct tce_container *container)
51{
52 int ret = 0;
53 unsigned long locked, lock_limit, npages;
54 struct iommu_table *tbl = container->tbl;
55
56 if (!container->tbl)
57 return -ENXIO;
58
59 if (!current->mm)
60 return -ESRCH; /* process exited */
61
62 if (container->enabled)
63 return -EBUSY;
64
65 /*
66 * When userspace pages are mapped into the IOMMU, they are effectively
67 * locked memory, so, theoretically, we need to update the accounting
 68 * of locked pages on each map and unmap. For powerpc, the map/unmap
 69 * paths can be very hot, though, and the accounting would kill
 70 * performance, especially since it would be difficult, if not
 71 * impossible, to handle the accounting in real mode only.
72 *
73 * To address that, rather than precisely accounting every page, we
74 * instead account for a worst case on locked memory when the iommu is
75 * enabled and disabled. The worst case upper bound on locked memory
76 * is the size of the whole iommu window, which is usually relatively
77 * small (compared to total memory sizes) on POWER hardware.
78 *
 79 * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
 80 * that would effectively kill the guest at random points, so it is much
 81 * better to enforce the limit based on the maximum the guest can map.
82 */
83 down_write(&current->mm->mmap_sem);
84 npages = (tbl->it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
85 locked = current->mm->locked_vm + npages;
86 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
87 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
88 pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
89 rlimit(RLIMIT_MEMLOCK));
90 ret = -ENOMEM;
91 } else {
92
93 current->mm->locked_vm += npages;
94 container->enabled = true;
95 }
96 up_write(&current->mm->mmap_sem);
97
98 return ret;
99}
100
101static void tce_iommu_disable(struct tce_container *container)
102{
103 if (!container->enabled)
104 return;
105
106 container->enabled = false;
107
108 if (!container->tbl || !current->mm)
109 return;
110
111 down_write(&current->mm->mmap_sem);
112 current->mm->locked_vm -= (container->tbl->it_size <<
113 IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
114 up_write(&current->mm->mmap_sem);
115}
116
117static void *tce_iommu_open(unsigned long arg)
118{
119 struct tce_container *container;
120
121 if (arg != VFIO_SPAPR_TCE_IOMMU) {
122 pr_err("tce_vfio: Wrong IOMMU type\n");
123 return ERR_PTR(-EINVAL);
124 }
125
126 container = kzalloc(sizeof(*container), GFP_KERNEL);
127 if (!container)
128 return ERR_PTR(-ENOMEM);
129
130 mutex_init(&container->lock);
131
132 return container;
133}
134
135static void tce_iommu_release(void *iommu_data)
136{
137 struct tce_container *container = iommu_data;
138
139 WARN_ON(container->tbl && !container->tbl->it_group);
140 tce_iommu_disable(container);
141
142 if (container->tbl && container->tbl->it_group)
143 tce_iommu_detach_group(iommu_data, container->tbl->it_group);
144
145 mutex_destroy(&container->lock);
146
147 kfree(container);
148}
149
150static long tce_iommu_ioctl(void *iommu_data,
151 unsigned int cmd, unsigned long arg)
152{
153 struct tce_container *container = iommu_data;
154 unsigned long minsz;
155 long ret;
156
157 switch (cmd) {
158 case VFIO_CHECK_EXTENSION:
159 return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
160
161 case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
162 struct vfio_iommu_spapr_tce_info info;
163 struct iommu_table *tbl = container->tbl;
164
165 if (WARN_ON(!tbl))
166 return -ENXIO;
167
168 minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
169 dma32_window_size);
170
171 if (copy_from_user(&info, (void __user *)arg, minsz))
172 return -EFAULT;
173
174 if (info.argsz < minsz)
175 return -EINVAL;
176
177 info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
178 info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
179 info.flags = 0;
180
181 if (copy_to_user((void __user *)arg, &info, minsz))
182 return -EFAULT;
183
184 return 0;
185 }
186 case VFIO_IOMMU_MAP_DMA: {
187 struct vfio_iommu_type1_dma_map param;
188 struct iommu_table *tbl = container->tbl;
189 unsigned long tce, i;
190
191 if (!tbl)
192 return -ENXIO;
193
194 BUG_ON(!tbl->it_group);
195
196 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
197
198 if (copy_from_user(&param, (void __user *)arg, minsz))
199 return -EFAULT;
200
201 if (param.argsz < minsz)
202 return -EINVAL;
203
204 if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
205 VFIO_DMA_MAP_FLAG_WRITE))
206 return -EINVAL;
207
208 if ((param.size & ~IOMMU_PAGE_MASK) ||
209 (param.vaddr & ~IOMMU_PAGE_MASK))
210 return -EINVAL;
211
212 /* iova is checked by the IOMMU API */
213 tce = param.vaddr;
214 if (param.flags & VFIO_DMA_MAP_FLAG_READ)
215 tce |= TCE_PCI_READ;
216 if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
217 tce |= TCE_PCI_WRITE;
218
219 ret = iommu_tce_put_param_check(tbl, param.iova, tce);
220 if (ret)
221 return ret;
222
223 for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT); ++i) {
224 ret = iommu_put_tce_user_mode(tbl,
225 (param.iova >> IOMMU_PAGE_SHIFT) + i,
226 tce);
227 if (ret)
228 break;
229 tce += IOMMU_PAGE_SIZE;
230 }
231 if (ret)
232 iommu_clear_tces_and_put_pages(tbl,
233 param.iova >> IOMMU_PAGE_SHIFT, i);
234
235 iommu_flush_tce(tbl);
236
237 return ret;
238 }
239 case VFIO_IOMMU_UNMAP_DMA: {
240 struct vfio_iommu_type1_dma_unmap param;
241 struct iommu_table *tbl = container->tbl;
242
243 if (WARN_ON(!tbl))
244 return -ENXIO;
245
246 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
247 size);
248
249 if (copy_from_user(&param, (void __user *)arg, minsz))
250 return -EFAULT;
251
252 if (param.argsz < minsz)
253 return -EINVAL;
254
255 /* No flag is supported now */
256 if (param.flags)
257 return -EINVAL;
258
259 if (param.size & ~IOMMU_PAGE_MASK)
260 return -EINVAL;
261
262 ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
263 param.size >> IOMMU_PAGE_SHIFT);
264 if (ret)
265 return ret;
266
267 ret = iommu_clear_tces_and_put_pages(tbl,
268 param.iova >> IOMMU_PAGE_SHIFT,
269 param.size >> IOMMU_PAGE_SHIFT);
270 iommu_flush_tce(tbl);
271
272 return ret;
273 }
274 case VFIO_IOMMU_ENABLE:
275 mutex_lock(&container->lock);
276 ret = tce_iommu_enable(container);
277 mutex_unlock(&container->lock);
278 return ret;
279
280
281 case VFIO_IOMMU_DISABLE:
282 mutex_lock(&container->lock);
283 tce_iommu_disable(container);
284 mutex_unlock(&container->lock);
285 return 0;
286 }
287
288 return -ENOTTY;
289}
290
291static int tce_iommu_attach_group(void *iommu_data,
292 struct iommu_group *iommu_group)
293{
294 int ret;
295 struct tce_container *container = iommu_data;
296 struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
297
298 BUG_ON(!tbl);
299 mutex_lock(&container->lock);
300
301 /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
302 iommu_group_id(iommu_group), iommu_group); */
303 if (container->tbl) {
304 pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
305 iommu_group_id(container->tbl->it_group),
306 iommu_group_id(iommu_group));
307 ret = -EBUSY;
308 } else if (container->enabled) {
309 pr_err("tce_vfio: attaching group #%u to enabled container\n",
310 iommu_group_id(iommu_group));
311 ret = -EBUSY;
312 } else {
313 ret = iommu_take_ownership(tbl);
314 if (!ret)
315 container->tbl = tbl;
316 }
317
318 mutex_unlock(&container->lock);
319
320 return ret;
321}
322
323static void tce_iommu_detach_group(void *iommu_data,
324 struct iommu_group *iommu_group)
325{
326 struct tce_container *container = iommu_data;
327 struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
328
329 BUG_ON(!tbl);
330 mutex_lock(&container->lock);
331 if (tbl != container->tbl) {
332 pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
333 iommu_group_id(iommu_group),
334 iommu_group_id(tbl->it_group));
335 } else {
336 if (container->enabled) {
337 pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
338 iommu_group_id(tbl->it_group));
339 tce_iommu_disable(container);
340 }
341
342 /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
343 iommu_group_id(iommu_group), iommu_group); */
344 container->tbl = NULL;
345 iommu_release_ownership(tbl);
346 }
347 mutex_unlock(&container->lock);
348}
349
350const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
351 .name = "iommu-vfio-powerpc",
352 .owner = THIS_MODULE,
353 .open = tce_iommu_open,
354 .release = tce_iommu_release,
355 .ioctl = tce_iommu_ioctl,
356 .attach_group = tce_iommu_attach_group,
357 .detach_group = tce_iommu_detach_group,
358};
359
360static int __init tce_iommu_init(void)
361{
362 return vfio_register_iommu_driver(&tce_iommu_driver_ops);
363}
364
365static void __exit tce_iommu_cleanup(void)
366{
367 vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
368}
369
370module_init(tce_iommu_init);
371module_exit(tce_iommu_cleanup);
372
373MODULE_VERSION(DRIVER_VERSION);
374MODULE_LICENSE("GPL v2");
375MODULE_AUTHOR(DRIVER_AUTHOR);
376MODULE_DESCRIPTION(DRIVER_DESC);
377
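
The expected userspace flow for this driver is: attach the group to a
container, select the SPAPR TCE backend, enable the container (which
preaccounts the whole DMA window against RLIMIT_MEMLOCK), then map. A
condensed sketch, with the group number assumed and error handling omitted:

	#include <fcntl.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/vfio.h>

	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/26", O_RDWR);	/* group id assumed */
	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };

	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
	ioctl(container, VFIO_IOMMU_ENABLE);
	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);

	map.vaddr = (__u64)(uintptr_t)mmap(NULL, info.dma32_window_size,
			PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	map.size = info.dma32_window_size;
	map.iova = info.dma32_window_start;
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);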
diff --git a/drivers/watchdog/booke_wdt.c b/drivers/watchdog/booke_wdt.c
index a8dbceb32914..f1b8d555080e 100644
--- a/drivers/watchdog/booke_wdt.c
+++ b/drivers/watchdog/booke_wdt.c
@@ -138,6 +138,14 @@ static void __booke_wdt_enable(void *data)
138 val &= ~WDTP_MASK; 138 val &= ~WDTP_MASK;
139 val |= (TCR_WIE|TCR_WRC(WRC_CHIP)|WDTP(booke_wdt_period)); 139 val |= (TCR_WIE|TCR_WRC(WRC_CHIP)|WDTP(booke_wdt_period));
140 140
141#ifdef CONFIG_PPC_BOOK3E_64
142 /*
143 * Crit ints are currently broken on PPC64 Book-E, so
144 * just disable them for now.
145 */
146 val &= ~TCR_WIE;
147#endif
148
141 mtspr(SPRN_TCR, val); 149 mtspr(SPRN_TCR, val);
142} 150}
143 151
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 43b12807a51d..76a4eeb92982 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -44,7 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
44 rec.parent_ip = parent_ip; 44 rec.parent_ip = parent_ip;
45 pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); 45 pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
46 psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, 46 psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
47 sizeof(rec), psinfo); 47 0, sizeof(rec), psinfo);
48 48
49 local_irq_restore(flags); 49 local_irq_restore(flags);
50} 50}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index bfd95bf38005..71bf5f4ae84c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -326,6 +326,15 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
326 case PSTORE_TYPE_MCE: 326 case PSTORE_TYPE_MCE:
327 sprintf(name, "mce-%s-%lld", psname, id); 327 sprintf(name, "mce-%s-%lld", psname, id);
328 break; 328 break;
329 case PSTORE_TYPE_PPC_RTAS:
330 sprintf(name, "rtas-%s-%lld", psname, id);
331 break;
332 case PSTORE_TYPE_PPC_OF:
333 sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
334 break;
335 case PSTORE_TYPE_PPC_COMMON:
336 sprintf(name, "powerpc-common-%s-%lld", psname, id);
337 break;
329 case PSTORE_TYPE_UNKNOWN: 338 case PSTORE_TYPE_UNKNOWN:
330 sprintf(name, "unknown-%s-%lld", psname, id); 339 sprintf(name, "unknown-%s-%lld", psname, id);
331 break; 340 break;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b7ffe2bcd9c4..422962ae9fc2 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -159,7 +159,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
159 break; 159 break;
160 160
161 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, 161 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
162 oopscount, hsize + len, psinfo); 162 oopscount, hsize, hsize + len, psinfo);
163 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) 163 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
164 pstore_new_entry = 1; 164 pstore_new_entry = 1;
165 165
@@ -196,7 +196,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
196 spin_lock_irqsave(&psinfo->buf_lock, flags); 196 spin_lock_irqsave(&psinfo->buf_lock, flags);
197 } 197 }
198 memcpy(psinfo->buf, s, c); 198 memcpy(psinfo->buf, s, c);
199 psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo); 199 psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);
200 spin_unlock_irqrestore(&psinfo->buf_lock, flags); 200 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
201 s += c; 201 s += c;
202 c = e - s; 202 c = e - s;
@@ -221,9 +221,11 @@ static void pstore_register_console(void) {}
 static int pstore_write_compat(enum pstore_type_id type,
 			       enum kmsg_dump_reason reason,
 			       u64 *id, unsigned int part, int count,
-			       size_t size, struct pstore_info *psi)
+			       size_t hsize, size_t size,
+			       struct pstore_info *psi)
 {
-	return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi);
+	return psi->write_buf(type, reason, id, part, psinfo->buf, hsize,
+			      size, psi);
 }
 
 /*
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 43abee2c6cb9..a6119f9469e2 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -195,7 +195,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
 static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 					    enum kmsg_dump_reason reason,
 					    u64 *id, unsigned int part,
-					    const char *buf, size_t size,
+					    const char *buf,
+					    size_t hsize, size_t size,
 					    struct pstore_info *psi)
 {
 	struct ramoops_context *cxt = psi->data;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a7126d28f4cf..2f47ade1b567 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -173,11 +173,12 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
 #endif
 
 #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				       pgtable_t pgtable);
 #endif
 
 #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 #endif
 
 #ifndef __HAVE_ARCH_PMDP_INVALIDATE
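
Both generic hooks now take the pmd pointer, so an architecture (ppc64 in this series) can locate the deposited page table from the pmd alone rather than from a single per-mm list. A sketch of the calling protocol as the mm/ hunks below use it; every name here comes from this diff:

	/* Caller holds mm->page_table_lock throughout. */
	pgtable_trans_huge_deposit(mm, pmd, pgtable);	/* stash preallocated pte
							   page, keyed by pmd */
	set_pmd_at(mm, haddr, pmd, entry);		/* then install huge pmd */

	/* ... on teardown or split: pmdp operations first, withdraw last ... */
	orig_pmd = pmdp_get_and_clear(mm, addr, pmd);
	pgtable = pgtable_trans_huge_withdraw(mm, pmd);	/* retrieve the stash */
	pmd_populate(mm, &_pmd, pgtable);		/* reuse it for regular ptes */
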
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 26ee56c80dc7..b60de92e2edc 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -60,9 +60,9 @@ extern pmd_t *page_check_address_pmd(struct page *page,
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define HPAGE_PMD_SHIFT HPAGE_SHIFT
-#define HPAGE_PMD_MASK HPAGE_MASK
-#define HPAGE_PMD_SIZE HPAGE_SIZE
+#define HPAGE_PMD_SHIFT PMD_SHIFT
+#define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
+#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
 
 extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 
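
The size and mask are now derived from PMD_SHIFT instead of borrowing the hugetlb constants, which keeps THP correct on architectures where the default huge page size is not a PMD mapping (ppc64 being the motivating case). As a worked example, on x86-64 with 4K pages PMD_SHIFT is 21, so:

	HPAGE_PMD_SIZE = 1UL << 21          = 0x200000 (2 MiB)
	HPAGE_PMD_MASK = ~(0x200000UL - 1)  = 0xffffffffffe00000
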
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index 75d01760c911..4aa80ba830a2 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -35,6 +35,10 @@ enum pstore_type_id {
 	PSTORE_TYPE_MCE		= 1,
 	PSTORE_TYPE_CONSOLE	= 2,
 	PSTORE_TYPE_FTRACE	= 3,
+	/* PPC64 partition types */
+	PSTORE_TYPE_PPC_RTAS	= 4,
+	PSTORE_TYPE_PPC_OF	= 5,
+	PSTORE_TYPE_PPC_COMMON	= 6,
 	PSTORE_TYPE_UNKNOWN	= 255
 };
 
@@ -54,12 +58,12 @@ struct pstore_info {
 			struct pstore_info *psi);
 	int		(*write)(enum pstore_type_id type,
 			enum kmsg_dump_reason reason, u64 *id,
-			unsigned int part, int count, size_t size,
-			struct pstore_info *psi);
+			unsigned int part, int count, size_t hsize,
+			size_t size, struct pstore_info *psi);
 	int		(*write_buf)(enum pstore_type_id type,
 			enum kmsg_dump_reason reason, u64 *id,
-			unsigned int part, const char *buf, size_t size,
-			struct pstore_info *psi);
+			unsigned int part, const char *buf, size_t hsize,
+			size_t size, struct pstore_info *psi);
 	int		(*erase)(enum pstore_type_id type, u64 id,
 			int count, struct timespec time,
 			struct pstore_info *psi);
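
The new hsize argument tells the backend how many bytes at the start of the buffer are the header pstore generated itself: the dmesg dump path passes its computed hsize, while the console and ftrace paths pass 0. A hedged kernel-side sketch of a backend callback honoring it; example_write_buf() and example_store() are hypothetical names, not part of this patch:

	static int example_write_buf(enum pstore_type_id type,
				     enum kmsg_dump_reason reason, u64 *id,
				     unsigned int part, const char *buf,
				     size_t hsize, size_t size,
				     struct pstore_info *psi)
	{
		/* buf[0..hsize) is pstore's own header, buf[hsize..size)
		 * the payload; a backend that writes its own record header
		 * (as ramoops does for kmsg records) may skip the generic
		 * one rather than store it twice. */
		return example_store(type, buf + hsize, size - hsize);
	}
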
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 284ff2436829..87ee4f4cff25 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -22,6 +22,7 @@
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -375,4 +376,37 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/*
+ * IOCTLs to enable/disable IOMMU container usage.
+ * No parameters are supported.
+ */
+#define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
+#define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
+
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides the information about the PCI bus
+ * address ranges available for DMA, these values are programmed into
+ * the hardware so the guest has to know that information.
+ *
+ * The DMA 32 bit window start is an absolute PCI bus address.
+ * The IOVA address passed via map/unmap ioctls are absolute PCI bus
+ * addresses too so the window works as a filter rather than an offset
+ * for IOVA addresses.
+ *
+ * A flag will need to be added if other page sizes are supported,
+ * so as defined here, it is always 4k.
+ */
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;			/* reserved for future use */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* ***************************************************************** */
+
 #endif /* _UAPIVFIO_H */
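
A hedged userspace sketch of the new SPAPR ioctls: enable the container's IOMMU, then read back the 32-bit DMA window the TCE hardware exposes. The fd plumbing (opening /dev/vfio/vfio and attaching a group) follows the usual VFIO flow and is assumed here:

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* 'container' is an open /dev/vfio/vfio fd with a group attached. */
	static int spapr_query_dma_window(int container)
	{
		struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };

		if (ioctl(container, VFIO_IOMMU_ENABLE))	/* enable usage first */
			return -1;
		if (ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info))
			return -1;
		printf("DMA32 window: start 0x%x, size 0x%x (4K pages)\n",
		       info.dma32_window_start, info.dma32_window_size);
		return 0;	/* VFIO_IOMMU_DISABLE undoes the enable when done */
	}
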
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d8b3b850150c..243e710c6039 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		pmd_t entry;
 		entry = mk_huge_pmd(page, vma);
 		page_add_new_anon_rmap(page, vma, haddr);
+		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		set_pmd_at(mm, haddr, pmd, entry);
-		pgtable_trans_huge_deposit(mm, pgtable);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm->nr_ptes++;
 		spin_unlock(&mm->page_table_lock);
@@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	entry = mk_pmd(zero_page, vma->vm_page_prot);
 	entry = pmd_wrprotect(entry);
 	entry = pmd_mkhuge(entry);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
-	pgtable_trans_huge_deposit(mm, pgtable);
 	mm->nr_ptes++;
 	return true;
 }
@@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
+	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-	pgtable_trans_huge_deposit(dst_mm, pgtable);
 	dst_mm->nr_ptes++;
 
 	ret = 0;
@@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 		 * young bit, instead of the current set_pmd_at.
 		 */
 		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
-		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+					  pmd, _pmd,  1))
+			update_mmu_cache_pmd(vma, addr, pmd);
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		if (page->mapping && trylock_page(page)) {
@@ -1358,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		struct page *page;
 		pgtable_t pgtable;
 		pmd_t orig_pmd;
-		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+		/*
+		 * For architectures like ppc64 we look at deposited pgtable
+		 * when calling pmdp_get_and_clear. So do the
+		 * pgtable_trans_huge_withdraw after finishing pmdp related
+		 * operations.
+		 */
 		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		if (is_huge_zero_pmd(orig_pmd)) {
 			tlb->mm->nr_ptes--;
 			spin_unlock(&tlb->mm->page_table_lock);
@@ -1691,7 +1699,7 @@ static int __split_huge_page_map(struct page *page,
 	pmd = page_check_address_pmd(page, mm, address,
 				     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
 	if (pmd) {
-		pgtable = pgtable_trans_huge_withdraw(mm);
+		pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 		pmd_populate(mm, &_pmd, pgtable);
 
 		haddr = address;
@@ -2359,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(&mm->page_table_lock);
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
-	pgtable_trans_huge_deposit(mm, pgtable);
 	spin_unlock(&mm->page_table_lock);
 
 	*hpage = NULL;
@@ -2667,7 +2675,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0c8323fe6c8f..e1a6e4fab016 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
 
 #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				pgtable_t pgtable)
 {
 	assert_spin_locked(&mm->page_table_lock);
 
@@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
 #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /* no "address" argument so destroys page coloring of some arch */
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 {
 	pgtable_t pgtable;
 