about | summary | refs | log | tree | commit | diff | stats
path: root/arch
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2008-07-25 14:08:17 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-07-25 14:08:17 -0400
commit: 5047887caf1806f31652210df27fb62a7c43f27d (patch)
tree: 4098ead40c1aa7b904167f67cff87a247cfa0b6c /arch
parent: 996abf053eec4d67136be8b911bbaaf989cfb99c (diff)
parent: 973b7d83ebeb1e34b8bee69208916e5f0e2353c3 (diff)
Merge branch 'merge' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc
* 'merge' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc: (34 commits)
  powerpc: Wireup new syscalls
  Move update_mmu_cache() declaration from tlbflush.h to pgtable.h
  powerpc/pseries: Remove kmalloc call in handling writes to lparcfg
  powerpc/pseries: Update arch vector to indicate support for CMO
  ibmvfc: Add support for collaborative memory overcommit
  ibmvscsi: driver enablement for CMO
  ibmveth: enable driver for CMO
  ibmveth: Automatically enable larger rx buffer pools for larger mtu
  powerpc/pseries: Verify CMO memory entitlement updates with virtual I/O
  powerpc/pseries: vio bus support for CMO
  powerpc/pseries: iommu enablement for CMO
  powerpc/pseries: Add CMO paging statistics
  powerpc/pseries: Add collaborative memory manager
  powerpc/pseries: Utilities to set firmware page state
  powerpc/pseries: Enable CMO feature during platform setup
  powerpc/pseries: Split retrieval of processor entitlement data into a helper routine
  powerpc/pseries: Add memory entitlement capabilities to /proc/ppc64/lparcfg
  powerpc/pseries: Split processor entitlement retrieval and gathering to helper routines
  powerpc/pseries: Remove extraneous error reporting for hcall failures in lparcfg
  powerpc: Fix compile error with binutils 2.15
  ...

Fixed up conflict in arch/powerpc/platforms/52xx/Kconfig manually.
Diffstat (limited to 'arch')
-rw-r--r-- arch/powerpc/kernel/cputable.c 11
-rw-r--r-- arch/powerpc/kernel/entry_32.S 6
-rw-r--r-- arch/powerpc/kernel/iommu.c 28
-rw-r--r-- arch/powerpc/kernel/lparcfg.c 386
-rw-r--r-- arch/powerpc/kernel/process.c 46
-rw-r--r-- arch/powerpc/kernel/prom_init.c 9
-rw-r--r-- arch/powerpc/kernel/ptrace.c 72
-rw-r--r-- arch/powerpc/kernel/signal.c 6
-rw-r--r-- arch/powerpc/kernel/sysfs.c 3
-rw-r--r-- arch/powerpc/kernel/traps.c 16
-rw-r--r-- arch/powerpc/kernel/vio.c 1033
-rw-r--r-- arch/powerpc/kernel/vmlinux.lds.S 2
-rw-r--r-- arch/powerpc/mm/fault.c 25
-rw-r--r-- arch/powerpc/platforms/52xx/Kconfig 2
-rw-r--r-- arch/powerpc/platforms/cell/iommu.c 16
-rw-r--r-- arch/powerpc/platforms/cell/spufs/sched.c 35
-rw-r--r-- arch/powerpc/platforms/cell/spufs/sputrace.c 3
-rw-r--r-- arch/powerpc/platforms/iseries/iommu.c 3
-rw-r--r-- arch/powerpc/platforms/pasemi/iommu.c 3
-rw-r--r-- arch/powerpc/platforms/pseries/Kconfig 23
-rw-r--r-- arch/powerpc/platforms/pseries/Makefile 1
-rw-r--r-- arch/powerpc/platforms/pseries/cmm.c 468
-rw-r--r-- arch/powerpc/platforms/pseries/iommu.c 42
-rw-r--r-- arch/powerpc/platforms/pseries/plpar_wrappers.h 10
-rw-r--r-- arch/powerpc/platforms/pseries/setup.c 71
-rw-r--r-- arch/powerpc/sysdev/dart_iommu.c 3
26 files changed, 2118 insertions, 205 deletions
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index b936a1dd0a50..25a052c16754 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -23,6 +23,9 @@
23struct cpu_spec* cur_cpu_spec = NULL; 23struct cpu_spec* cur_cpu_spec = NULL;
24EXPORT_SYMBOL(cur_cpu_spec); 24EXPORT_SYMBOL(cur_cpu_spec);
25 25
26/* The platform string corresponding to the real PVR */
27const char *powerpc_base_platform;
28
26/* NOTE: 29/* NOTE:
27 * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's 30 * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's
28 * the responsibility of the appropriate CPU save/restore functions to 31 * the responsibility of the appropriate CPU save/restore functions to
@@ -1652,6 +1655,14 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
1652 } else 1655 } else
1653 *t = *s; 1656 *t = *s;
1654 *PTRRELOC(&cur_cpu_spec) = &the_cpu_spec; 1657 *PTRRELOC(&cur_cpu_spec) = &the_cpu_spec;
1658
1659 /*
1660 * Set the base platform string once; assumes
1661 * we're called with real pvr first.
1662 */
1663 if (powerpc_base_platform == NULL)
1664 powerpc_base_platform = t->platform;
1665
1655#if defined(CONFIG_PPC64) || defined(CONFIG_BOOKE) 1666#if defined(CONFIG_PPC64) || defined(CONFIG_BOOKE)
1656 /* ppc64 and booke expect identify_cpu to also call 1667 /* ppc64 and booke expect identify_cpu to also call
1657 * setup_cpu for that processor. I will consolidate 1668 * setup_cpu for that processor. I will consolidate
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index da52269aec1e..81c8324a4a3c 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -148,7 +148,7 @@ transfer_to_handler:
148 /* Check to see if the dbcr0 register is set up to debug. Use the 148 /* Check to see if the dbcr0 register is set up to debug. Use the
149 internal debug mode bit to do this. */ 149 internal debug mode bit to do this. */
150 lwz r12,THREAD_DBCR0(r12) 150 lwz r12,THREAD_DBCR0(r12)
151 andis. r12,r12,DBCR0_IDM@h 151 andis. r12,r12,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h
152 beq+ 3f 152 beq+ 3f
153 /* From user and task is ptraced - load up global dbcr0 */ 153 /* From user and task is ptraced - load up global dbcr0 */
154 li r12,-1 /* clear all pending debug events */ 154 li r12,-1 /* clear all pending debug events */
@@ -292,7 +292,7 @@ syscall_exit_cont:
292 /* If the process has its own DBCR0 value, load it up. The internal 292 /* If the process has its own DBCR0 value, load it up. The internal
293 debug mode bit tells us that dbcr0 should be loaded. */ 293 debug mode bit tells us that dbcr0 should be loaded. */
294 lwz r0,THREAD+THREAD_DBCR0(r2) 294 lwz r0,THREAD+THREAD_DBCR0(r2)
295 andis. r10,r0,DBCR0_IDM@h 295 andis. r10,r0,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h
296 bnel- load_dbcr0 296 bnel- load_dbcr0
297#endif 297#endif
298#ifdef CONFIG_44x 298#ifdef CONFIG_44x
@@ -720,7 +720,7 @@ restore_user:
720 /* Check whether this process has its own DBCR0 value. The internal 720 /* Check whether this process has its own DBCR0 value. The internal
721 debug mode bit tells us that dbcr0 should be loaded. */ 721 debug mode bit tells us that dbcr0 should be loaded. */
722 lwz r0,THREAD+THREAD_DBCR0(r2) 722 lwz r0,THREAD+THREAD_DBCR0(r2)
723 andis. r10,r0,DBCR0_IDM@h 723 andis. r10,r0,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h
724 bnel- load_dbcr0 724 bnel- load_dbcr0
725#endif 725#endif
726 726
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2385f68c1751..550a19399bfa 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -49,6 +49,8 @@ static int novmerge = 1;
49 49
50static int protect4gb = 1; 50static int protect4gb = 1;
51 51
52static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
53
52static inline unsigned long iommu_num_pages(unsigned long vaddr, 54static inline unsigned long iommu_num_pages(unsigned long vaddr,
53 unsigned long slen) 55 unsigned long slen)
54{ 56{
@@ -191,6 +193,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
191{ 193{
192 unsigned long entry, flags; 194 unsigned long entry, flags;
193 dma_addr_t ret = DMA_ERROR_CODE; 195 dma_addr_t ret = DMA_ERROR_CODE;
196 int build_fail;
194 197
195 spin_lock_irqsave(&(tbl->it_lock), flags); 198 spin_lock_irqsave(&(tbl->it_lock), flags);
196 199
@@ -205,9 +208,21 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
205 ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */ 208 ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */
206 209
207 /* Put the TCEs in the HW table */ 210 /* Put the TCEs in the HW table */
208 ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK, 211 build_fail = ppc_md.tce_build(tbl, entry, npages,
209 direction, attrs); 212 (unsigned long)page & IOMMU_PAGE_MASK,
213 direction, attrs);
214
215 /* ppc_md.tce_build() only returns non-zero for transient errors.
216 * Clean up the table bitmap in this case and return
217 * DMA_ERROR_CODE. For all other errors the functionality is
218 * not altered.
219 */
220 if (unlikely(build_fail)) {
221 __iommu_free(tbl, ret, npages);
210 222
223 spin_unlock_irqrestore(&(tbl->it_lock), flags);
224 return DMA_ERROR_CODE;
225 }
211 226
212 /* Flush/invalidate TLB caches if necessary */ 227 /* Flush/invalidate TLB caches if necessary */
213 if (ppc_md.tce_flush) 228 if (ppc_md.tce_flush)
@@ -276,7 +291,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
276 dma_addr_t dma_next = 0, dma_addr; 291 dma_addr_t dma_next = 0, dma_addr;
277 unsigned long flags; 292 unsigned long flags;
278 struct scatterlist *s, *outs, *segstart; 293 struct scatterlist *s, *outs, *segstart;
279 int outcount, incount, i; 294 int outcount, incount, i, build_fail = 0;
280 unsigned int align; 295 unsigned int align;
281 unsigned long handle; 296 unsigned long handle;
282 unsigned int max_seg_size; 297 unsigned int max_seg_size;
@@ -337,8 +352,11 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
337 npages, entry, dma_addr); 352 npages, entry, dma_addr);
338 353
339 /* Insert into HW table */ 354 /* Insert into HW table */
340 ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK, 355 build_fail = ppc_md.tce_build(tbl, entry, npages,
341 direction, attrs); 356 vaddr & IOMMU_PAGE_MASK,
357 direction, attrs);
358 if(unlikely(build_fail))
359 goto failure;
342 360
343 /* If we are in an open segment, try merging */ 361 /* If we are in an open segment, try merging */
344 if (segstart != s) { 362 if (segstart != s) {
diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index 827a5726a035..9f856a0c3e38 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -34,8 +34,9 @@
34#include <asm/time.h> 34#include <asm/time.h>
35#include <asm/prom.h> 35#include <asm/prom.h>
36#include <asm/vdso_datapage.h> 36#include <asm/vdso_datapage.h>
37#include <asm/vio.h>
37 38
38#define MODULE_VERS "1.7" 39#define MODULE_VERS "1.8"
39#define MODULE_NAME "lparcfg" 40#define MODULE_NAME "lparcfg"
40 41
41/* #define LPARCFG_DEBUG */ 42/* #define LPARCFG_DEBUG */
@@ -129,32 +130,46 @@ static int iseries_lparcfg_data(struct seq_file *m, void *v)
129/* 130/*
130 * Methods used to fetch LPAR data when running on a pSeries platform. 131 * Methods used to fetch LPAR data when running on a pSeries platform.
131 */ 132 */
132static void log_plpar_hcall_return(unsigned long rc, char *tag) 133/**
134 * h_get_mpp
135 * H_GET_MPP hcall returns info in 7 parms
136 */
137int h_get_mpp(struct hvcall_mpp_data *mpp_data)
133{ 138{
134 switch(rc) { 139 int rc;
135 case 0: 140 unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
136 return; 141
137 case H_HARDWARE: 142 rc = plpar_hcall9(H_GET_MPP, retbuf);
138 printk(KERN_INFO "plpar-hcall (%s) " 143
139 "Hardware fault\n", tag); 144 mpp_data->entitled_mem = retbuf[0];
140 return; 145 mpp_data->mapped_mem = retbuf[1];
141 case H_FUNCTION: 146
142 printk(KERN_INFO "plpar-hcall (%s) " 147 mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
143 "Function not allowed\n", tag); 148 mpp_data->pool_num = retbuf[2] & 0xffff;
144 return; 149
145 case H_AUTHORITY: 150 mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
146 printk(KERN_INFO "plpar-hcall (%s) " 151 mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
147 "Not authorized to this function\n", tag); 152 mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff;
148 return; 153
149 case H_PARAMETER: 154 mpp_data->pool_size = retbuf[4];
150 printk(KERN_INFO "plpar-hcall (%s) " 155 mpp_data->loan_request = retbuf[5];
151 "Bad parameter(s)\n",tag); 156 mpp_data->backing_mem = retbuf[6];
152 return; 157
153 default: 158 return rc;
154 printk(KERN_INFO "plpar-hcall (%s) "
155 "Unexpected rc(0x%lx)\n", tag, rc);
156 }
157} 159}
160EXPORT_SYMBOL(h_get_mpp);
161
162struct hvcall_ppp_data {
163 u64 entitlement;
164 u64 unallocated_entitlement;
165 u16 group_num;
166 u16 pool_num;
167 u8 capped;
168 u8 weight;
169 u8 unallocated_weight;
170 u16 active_procs_in_pool;
171 u16 active_system_procs;
172};
158 173
159/* 174/*
160 * H_GET_PPP hcall returns info in 4 parms. 175 * H_GET_PPP hcall returns info in 4 parms.
@@ -176,27 +191,30 @@ static void log_plpar_hcall_return(unsigned long rc, char *tag)
176 * XXXX - Active processors in Physical Processor Pool. 191 * XXXX - Active processors in Physical Processor Pool.
177 * XXXX - Processors active on platform. 192 * XXXX - Processors active on platform.
178 */ 193 */
179static unsigned int h_get_ppp(unsigned long *entitled, 194static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
180 unsigned long *unallocated,
181 unsigned long *aggregation,
182 unsigned long *resource)
183{ 195{
184 unsigned long rc; 196 unsigned long rc;
185 unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; 197 unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
186 198
187 rc = plpar_hcall(H_GET_PPP, retbuf); 199 rc = plpar_hcall(H_GET_PPP, retbuf);
188 200
189 *entitled = retbuf[0]; 201 ppp_data->entitlement = retbuf[0];
190 *unallocated = retbuf[1]; 202 ppp_data->unallocated_entitlement = retbuf[1];
191 *aggregation = retbuf[2]; 203
192 *resource = retbuf[3]; 204 ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
205 ppp_data->pool_num = retbuf[2] & 0xffff;
193 206
194 log_plpar_hcall_return(rc, "H_GET_PPP"); 207 ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01;
208 ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff;
209 ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff;
210 ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff;
211 ppp_data->active_system_procs = retbuf[3] & 0xffff;
195 212
196 return rc; 213 return rc;
197} 214}
198 215
199static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs) 216static unsigned h_pic(unsigned long *pool_idle_time,
217 unsigned long *num_procs)
200{ 218{
201 unsigned long rc; 219 unsigned long rc;
202 unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; 220 unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
@@ -206,8 +224,87 @@ static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs)
206 *pool_idle_time = retbuf[0]; 224 *pool_idle_time = retbuf[0];
207 *num_procs = retbuf[1]; 225 *num_procs = retbuf[1];
208 226
209 if (rc != H_AUTHORITY) 227 return rc;
210 log_plpar_hcall_return(rc, "H_PIC"); 228}
229
230/*
231 * parse_ppp_data
232 * Parse out the data returned from h_get_ppp and h_pic
233 */
234static void parse_ppp_data(struct seq_file *m)
235{
236 struct hvcall_ppp_data ppp_data;
237 int rc;
238
239 rc = h_get_ppp(&ppp_data);
240 if (rc)
241 return;
242
243 seq_printf(m, "partition_entitled_capacity=%ld\n",
244 ppp_data.entitlement);
245 seq_printf(m, "group=%d\n", ppp_data.group_num);
246 seq_printf(m, "system_active_processors=%d\n",
247 ppp_data.active_system_procs);
248
249 /* pool related entries are appropriate for shared configs */
250 if (lppaca[0].shared_proc) {
251 unsigned long pool_idle_time, pool_procs;
252
253 seq_printf(m, "pool=%d\n", ppp_data.pool_num);
254
255 /* report pool_capacity in percentage */
256 seq_printf(m, "pool_capacity=%d\n",
257 ppp_data.active_procs_in_pool * 100);
258
259 h_pic(&pool_idle_time, &pool_procs);
260 seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
261 seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
262 }
263
264 seq_printf(m, "unallocated_capacity_weight=%d\n",
265 ppp_data.unallocated_weight);
266 seq_printf(m, "capacity_weight=%d\n", ppp_data.weight);
267 seq_printf(m, "capped=%d\n", ppp_data.capped);
268 seq_printf(m, "unallocated_capacity=%ld\n",
269 ppp_data.unallocated_entitlement);
270}
271
272/**
273 * parse_mpp_data
274 * Parse out data returned from h_get_mpp
275 */
276static void parse_mpp_data(struct seq_file *m)
277{
278 struct hvcall_mpp_data mpp_data;
279 int rc;
280
281 rc = h_get_mpp(&mpp_data);
282 if (rc)
283 return;
284
285 seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem);
286
287 if (mpp_data.mapped_mem != -1)
288 seq_printf(m, "mapped_entitled_memory=%ld\n",
289 mpp_data.mapped_mem);
290
291 seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num);
292 seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num);
293
294 seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight);
295 seq_printf(m, "unallocated_entitled_memory_weight=%d\n",
296 mpp_data.unallocated_mem_weight);
297 seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n",
298 mpp_data.unallocated_entitlement);
299
300 if (mpp_data.pool_size != -1)
301 seq_printf(m, "entitled_memory_pool_size=%ld bytes\n",
302 mpp_data.pool_size);
303
304 seq_printf(m, "entitled_memory_loan_request=%ld\n",
305 mpp_data.loan_request);
306
307 seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem);
211} 308}
212 309
213#define SPLPAR_CHARACTERISTICS_TOKEN 20 310#define SPLPAR_CHARACTERISTICS_TOKEN 20
@@ -313,6 +410,25 @@ static int lparcfg_count_active_processors(void)
313 return count; 410 return count;
314} 411}
315 412
413static void pseries_cmo_data(struct seq_file *m)
414{
415 int cpu;
416 unsigned long cmo_faults = 0;
417 unsigned long cmo_fault_time = 0;
418
419 if (!firmware_has_feature(FW_FEATURE_CMO))
420 return;
421
422 for_each_possible_cpu(cpu) {
423 cmo_faults += lppaca[cpu].cmo_faults;
424 cmo_fault_time += lppaca[cpu].cmo_fault_time;
425 }
426
427 seq_printf(m, "cmo_faults=%lu\n", cmo_faults);
428 seq_printf(m, "cmo_fault_time_usec=%lu\n",
429 cmo_fault_time / tb_ticks_per_usec);
430}
431
316static int pseries_lparcfg_data(struct seq_file *m, void *v) 432static int pseries_lparcfg_data(struct seq_file *m, void *v)
317{ 433{
318 int partition_potential_processors; 434 int partition_potential_processors;
@@ -334,60 +450,13 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
334 partition_active_processors = lparcfg_count_active_processors(); 450 partition_active_processors = lparcfg_count_active_processors();
335 451
336 if (firmware_has_feature(FW_FEATURE_SPLPAR)) { 452 if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
337 unsigned long h_entitled, h_unallocated;
338 unsigned long h_aggregation, h_resource;
339 unsigned long pool_idle_time, pool_procs;
340 unsigned long purr;
341
342 h_get_ppp(&h_entitled, &h_unallocated, &h_aggregation,
343 &h_resource);
344
345 seq_printf(m, "R4=0x%lx\n", h_entitled);
346 seq_printf(m, "R5=0x%lx\n", h_unallocated);
347 seq_printf(m, "R6=0x%lx\n", h_aggregation);
348 seq_printf(m, "R7=0x%lx\n", h_resource);
349
350 purr = get_purr();
351
352 /* this call handles the ibm,get-system-parameter contents */ 453 /* this call handles the ibm,get-system-parameter contents */
353 parse_system_parameter_string(m); 454 parse_system_parameter_string(m);
455 parse_ppp_data(m);
456 parse_mpp_data(m);
457 pseries_cmo_data(m);
354 458
355 seq_printf(m, "partition_entitled_capacity=%ld\n", h_entitled); 459 seq_printf(m, "purr=%ld\n", get_purr());
356
357 seq_printf(m, "group=%ld\n", (h_aggregation >> 2 * 8) & 0xffff);
358
359 seq_printf(m, "system_active_processors=%ld\n",
360 (h_resource >> 0 * 8) & 0xffff);
361
362 /* pool related entries are appropriate for shared configs */
363 if (lppaca[0].shared_proc) {
364
365 h_pic(&pool_idle_time, &pool_procs);
366
367 seq_printf(m, "pool=%ld\n",
368 (h_aggregation >> 0 * 8) & 0xffff);
369
370 /* report pool_capacity in percentage */
371 seq_printf(m, "pool_capacity=%ld\n",
372 ((h_resource >> 2 * 8) & 0xffff) * 100);
373
374 seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
375
376 seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
377 }
378
379 seq_printf(m, "unallocated_capacity_weight=%ld\n",
380 (h_resource >> 4 * 8) & 0xFF);
381
382 seq_printf(m, "capacity_weight=%ld\n",
383 (h_resource >> 5 * 8) & 0xFF);
384
385 seq_printf(m, "capped=%ld\n", (h_resource >> 6 * 8) & 0x01);
386
387 seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated);
388
389 seq_printf(m, "purr=%ld\n", purr);
390
391 } else { /* non SPLPAR case */ 460 } else { /* non SPLPAR case */
392 461
393 seq_printf(m, "system_active_processors=%d\n", 462 seq_printf(m, "system_active_processors=%d\n",
@@ -414,6 +483,83 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
414 return 0; 483 return 0;
415} 484}
416 485
486static ssize_t update_ppp(u64 *entitlement, u8 *weight)
487{
488 struct hvcall_ppp_data ppp_data;
489 u8 new_weight;
490 u64 new_entitled;
491 ssize_t retval;
492
493 /* Get our current parameters */
494 retval = h_get_ppp(&ppp_data);
495 if (retval)
496 return retval;
497
498 if (entitlement) {
499 new_weight = ppp_data.weight;
500 new_entitled = *entitlement;
501 } else if (weight) {
502 new_weight = *weight;
503 new_entitled = ppp_data.entitlement;
504 } else
505 return -EINVAL;
506
507 pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
508 __FUNCTION__, ppp_data.entitlement, ppp_data.weight);
509
510 pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
511 __FUNCTION__, new_entitled, new_weight);
512
513 retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight);
514 return retval;
515}
516
517/**
518 * update_mpp
519 *
520 * Update the memory entitlement and weight for the partition. Caller must
521 * specify either a new entitlement or weight, not both, to be updated
522 * since the h_set_mpp call takes both entitlement and weight as parameters.
523 */
524static ssize_t update_mpp(u64 *entitlement, u8 *weight)
525{
526 struct hvcall_mpp_data mpp_data;
527 u64 new_entitled;
528 u8 new_weight;
529 ssize_t rc;
530
531 if (entitlement) {
532 /* Check with vio to ensure the new memory entitlement
533 * can be handled.
534 */
535 rc = vio_cmo_entitlement_update(*entitlement);
536 if (rc)
537 return rc;
538 }
539
540 rc = h_get_mpp(&mpp_data);
541 if (rc)
542 return rc;
543
544 if (entitlement) {
545 new_weight = mpp_data.mem_weight;
546 new_entitled = *entitlement;
547 } else if (weight) {
548 new_weight = *weight;
549 new_entitled = mpp_data.entitled_mem;
550 } else
551 return -EINVAL;
552
553 pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
554 __FUNCTION__, mpp_data.entitled_mem, mpp_data.mem_weight);
555
556 pr_debug("%s: new_entitled = %lu, new_weight = %u\n",
557 __FUNCTION__, new_entitled, new_weight);
558
559 rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight);
560 return rc;
561}
562
417/* 563/*
418 * Interface for changing system parameters (variable capacity weight 564 * Interface for changing system parameters (variable capacity weight
419 * and entitled capacity). Format of input is "param_name=value"; 565 * and entitled capacity). Format of input is "param_name=value";
@@ -427,35 +573,27 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
427static ssize_t lparcfg_write(struct file *file, const char __user * buf, 573static ssize_t lparcfg_write(struct file *file, const char __user * buf,
428 size_t count, loff_t * off) 574 size_t count, loff_t * off)
429{ 575{
430 char *kbuf; 576 int kbuf_sz = 64;
577 char kbuf[kbuf_sz];
431 char *tmp; 578 char *tmp;
432 u64 new_entitled, *new_entitled_ptr = &new_entitled; 579 u64 new_entitled, *new_entitled_ptr = &new_entitled;
433 u8 new_weight, *new_weight_ptr = &new_weight; 580 u8 new_weight, *new_weight_ptr = &new_weight;
434 581 ssize_t retval;
435 unsigned long current_entitled; /* parameters for h_get_ppp */
436 unsigned long dummy;
437 unsigned long resource;
438 u8 current_weight;
439
440 ssize_t retval = -ENOMEM;
441 582
442 if (!firmware_has_feature(FW_FEATURE_SPLPAR) || 583 if (!firmware_has_feature(FW_FEATURE_SPLPAR) ||
443 firmware_has_feature(FW_FEATURE_ISERIES)) 584 firmware_has_feature(FW_FEATURE_ISERIES))
444 return -EINVAL; 585 return -EINVAL;
445 586
446 kbuf = kmalloc(count, GFP_KERNEL); 587 if (count > kbuf_sz)
447 if (!kbuf) 588 return -EINVAL;
448 goto out;
449 589
450 retval = -EFAULT;
451 if (copy_from_user(kbuf, buf, count)) 590 if (copy_from_user(kbuf, buf, count))
452 goto out; 591 return -EFAULT;
453 592
454 retval = -EINVAL;
455 kbuf[count - 1] = '\0'; 593 kbuf[count - 1] = '\0';
456 tmp = strchr(kbuf, '='); 594 tmp = strchr(kbuf, '=');
457 if (!tmp) 595 if (!tmp)
458 goto out; 596 return -EINVAL;
459 597
460 *tmp++ = '\0'; 598 *tmp++ = '\0';
461 599
@@ -463,34 +601,32 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
463 char *endp; 601 char *endp;
464 *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10); 602 *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
465 if (endp == tmp) 603 if (endp == tmp)
466 goto out; 604 return -EINVAL;
467 new_weight_ptr = &current_weight; 605
606 retval = update_ppp(new_entitled_ptr, NULL);
468 } else if (!strcmp(kbuf, "capacity_weight")) { 607 } else if (!strcmp(kbuf, "capacity_weight")) {
469 char *endp; 608 char *endp;
470 *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); 609 *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
471 if (endp == tmp) 610 if (endp == tmp)
472 goto out; 611 return -EINVAL;
473 new_entitled_ptr = &current_entitled;
474 } else
475 goto out;
476
477 /* Get our current parameters */
478 retval = h_get_ppp(&current_entitled, &dummy, &dummy, &resource);
479 if (retval) {
480 retval = -EIO;
481 goto out;
482 }
483
484 current_weight = (resource >> 5 * 8) & 0xFF;
485 612
486 pr_debug("%s: current_entitled = %lu, current_weight = %u\n", 613 retval = update_ppp(NULL, new_weight_ptr);
487 __func__, current_entitled, current_weight); 614 } else if (!strcmp(kbuf, "entitled_memory")) {
615 char *endp;
616 *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
617 if (endp == tmp)
618 return -EINVAL;
488 619
489 pr_debug("%s: new_entitled = %lu, new_weight = %u\n", 620 retval = update_mpp(new_entitled_ptr, NULL);
490 __func__, *new_entitled_ptr, *new_weight_ptr); 621 } else if (!strcmp(kbuf, "entitled_memory_weight")) {
622 char *endp;
623 *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
624 if (endp == tmp)
625 return -EINVAL;
491 626
492 retval = plpar_hcall_norets(H_SET_PPP, *new_entitled_ptr, 627 retval = update_mpp(NULL, new_weight_ptr);
493 *new_weight_ptr); 628 } else
629 return -EINVAL;
494 630
495 if (retval == H_SUCCESS || retval == H_CONSTRAINED) { 631 if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
496 retval = count; 632 retval = count;
@@ -506,8 +642,6 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf,
506 retval = -EIO; 642 retval = -EIO;
507 } 643 }
508 644
509out:
510 kfree(kbuf);
511 return retval; 645 return retval;
512} 646}
513 647
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 219f3634115e..db2497ccc111 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -47,6 +47,8 @@
47#ifdef CONFIG_PPC64 47#ifdef CONFIG_PPC64
48#include <asm/firmware.h> 48#include <asm/firmware.h>
49#endif 49#endif
50#include <linux/kprobes.h>
51#include <linux/kdebug.h>
50 52
51extern unsigned long _get_SP(void); 53extern unsigned long _get_SP(void);
52 54
@@ -239,6 +241,35 @@ void discard_lazy_cpu_state(void)
239} 241}
240#endif /* CONFIG_SMP */ 242#endif /* CONFIG_SMP */
241 243
244void do_dabr(struct pt_regs *regs, unsigned long address,
245 unsigned long error_code)
246{
247 siginfo_t info;
248
249 if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
250 11, SIGSEGV) == NOTIFY_STOP)
251 return;
252
253 if (debugger_dabr_match(regs))
254 return;
255
256 /* Clear the DAC and struct entries. One shot trigger */
257#if (defined(CONFIG_44x) || defined(CONFIG_BOOKE))
258 mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R | DBSR_DAC1W
259 | DBCR0_IDM));
260#endif
261
262 /* Clear the DABR */
263 set_dabr(0);
264
265 /* Deliver the signal to userspace */
266 info.si_signo = SIGTRAP;
267 info.si_errno = 0;
268 info.si_code = TRAP_HWBKPT;
269 info.si_addr = (void __user *)address;
270 force_sig_info(SIGTRAP, &info, current);
271}
272
242static DEFINE_PER_CPU(unsigned long, current_dabr); 273static DEFINE_PER_CPU(unsigned long, current_dabr);
243 274
244int set_dabr(unsigned long dabr) 275int set_dabr(unsigned long dabr)
@@ -254,6 +285,11 @@ int set_dabr(unsigned long dabr)
254#if defined(CONFIG_PPC64) || defined(CONFIG_6xx) 285#if defined(CONFIG_PPC64) || defined(CONFIG_6xx)
255 mtspr(SPRN_DABR, dabr); 286 mtspr(SPRN_DABR, dabr);
256#endif 287#endif
288
289#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
290 mtspr(SPRN_DAC1, dabr);
291#endif
292
257 return 0; 293 return 0;
258} 294}
259 295
@@ -337,6 +373,12 @@ struct task_struct *__switch_to(struct task_struct *prev,
337 if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr)) 373 if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr))
338 set_dabr(new->thread.dabr); 374 set_dabr(new->thread.dabr);
339 375
376#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
377 /* If new thread DAC (HW breakpoint) is the same then leave it */
378 if (new->thread.dabr)
379 set_dabr(new->thread.dabr);
380#endif
381
340 new_thread = &new->thread; 382 new_thread = &new->thread;
341 old_thread = &current->thread; 383 old_thread = &current->thread;
342 384
@@ -525,6 +567,10 @@ void flush_thread(void)
525 if (current->thread.dabr) { 567 if (current->thread.dabr) {
526 current->thread.dabr = 0; 568 current->thread.dabr = 0;
527 set_dabr(0); 569 set_dabr(0);
570
571#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
572 current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W);
573#endif
528 } 574 }
529} 575}
530 576
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 1ea8c8d3ce89..c4ab2195b9cb 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -643,6 +643,11 @@ static void __init early_cmdline_parse(void)
643#else 643#else
644#define OV5_MSI 0x00 644#define OV5_MSI 0x00
645#endif /* CONFIG_PCI_MSI */ 645#endif /* CONFIG_PCI_MSI */
646#ifdef CONFIG_PPC_SMLPAR
647#define OV5_CMO 0x80 /* Cooperative Memory Overcommitment */
648#else
649#define OV5_CMO 0x00
650#endif
646 651
647/* 652/*
648 * The architecture vector has an array of PVR mask/value pairs, 653 * The architecture vector has an array of PVR mask/value pairs,
@@ -687,10 +692,12 @@ static unsigned char ibm_architecture_vec[] = {
687 0, /* don't halt */ 692 0, /* don't halt */
688 693
689 /* option vector 5: PAPR/OF options */ 694 /* option vector 5: PAPR/OF options */
690 3 - 2, /* length */ 695 5 - 2, /* length */
691 0, /* don't ignore, don't halt */ 696 0, /* don't ignore, don't halt */
692 OV5_LPAR | OV5_SPLPAR | OV5_LARGE_PAGES | OV5_DRCONF_MEMORY | 697 OV5_LPAR | OV5_SPLPAR | OV5_LARGE_PAGES | OV5_DRCONF_MEMORY |
693 OV5_DONATE_DEDICATE_CPU | OV5_MSI, 698 OV5_DONATE_DEDICATE_CPU | OV5_MSI,
699 0,
700 OV5_CMO,
694}; 701};
695 702
696/* Old method - ELF header with PT_NOTE sections */ 703/* Old method - ELF header with PT_NOTE sections */
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 8feb93e7890c..a5d0e78779c8 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -703,7 +703,7 @@ void user_enable_single_step(struct task_struct *task)
703 703
704 if (regs != NULL) { 704 if (regs != NULL) {
705#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) 705#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
706 task->thread.dbcr0 = DBCR0_IDM | DBCR0_IC; 706 task->thread.dbcr0 |= DBCR0_IDM | DBCR0_IC;
707 regs->msr |= MSR_DE; 707 regs->msr |= MSR_DE;
708#else 708#else
709 regs->msr |= MSR_SE; 709 regs->msr |= MSR_SE;
@@ -716,9 +716,16 @@ void user_disable_single_step(struct task_struct *task)
716{ 716{
717 struct pt_regs *regs = task->thread.regs; 717 struct pt_regs *regs = task->thread.regs;
718 718
719
720#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
721 /* If DAC then do not single step, skip */
722 if (task->thread.dabr)
723 return;
724#endif
725
719 if (regs != NULL) { 726 if (regs != NULL) {
720#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) 727#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
721 task->thread.dbcr0 = 0; 728 task->thread.dbcr0 &= ~(DBCR0_IC | DBCR0_IDM);
722 regs->msr &= ~MSR_DE; 729 regs->msr &= ~MSR_DE;
723#else 730#else
724 regs->msr &= ~MSR_SE; 731 regs->msr &= ~MSR_SE;
@@ -727,22 +734,75 @@ void user_disable_single_step(struct task_struct *task)
727 clear_tsk_thread_flag(task, TIF_SINGLESTEP); 734 clear_tsk_thread_flag(task, TIF_SINGLESTEP);
728} 735}
729 736
730static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, 737int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
731 unsigned long data) 738 unsigned long data)
732{ 739{
733 /* We only support one DABR and no IABRS at the moment */ 740 /* For ppc64 we support one DABR and no IABR's at the moment (ppc64).
741 * For embedded processors we support one DAC and no IAC's at the
742 * moment.
743 */
734 if (addr > 0) 744 if (addr > 0)
735 return -EINVAL; 745 return -EINVAL;
736 746
737 /* The bottom 3 bits are flags */
738 if ((data & ~0x7UL) >= TASK_SIZE) 747 if ((data & ~0x7UL) >= TASK_SIZE)
739 return -EIO; 748 return -EIO;
740 749
741 /* Ensure translation is on */ 750#ifdef CONFIG_PPC64
751
752 /* For processors using DABR (i.e. 970), the bottom 3 bits are flags.
753 * It was assumed, on previous implementations, that 3 bits were
754 * passed together with the data address, fitting the design of the
755 * DABR register, as follows:
756 *
757 * bit 0: Read flag
758 * bit 1: Write flag
759 * bit 2: Breakpoint translation
760 *
761 * Thus, we use them here as so.
762 */
763
764 /* Ensure breakpoint translation bit is set */
742 if (data && !(data & DABR_TRANSLATION)) 765 if (data && !(data & DABR_TRANSLATION))
743 return -EIO; 766 return -EIO;
744 767
768 /* Move contents to the DABR register */
745 task->thread.dabr = data; 769 task->thread.dabr = data;
770
771#endif
772#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
773
774 /* As described above, it was assumed 3 bits were passed with the data
775 * address, but we will assume only the mode bits will be passed
776 * as to not cause alignment restrictions for DAC-based processors.
777 */
778
779 /* DAC's hold the whole address without any mode flags */
780 task->thread.dabr = data & ~0x3UL;
781
782 if (task->thread.dabr == 0) {
783 task->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W | DBCR0_IDM);
784 task->thread.regs->msr &= ~MSR_DE;
785 return 0;
786 }
787
788 /* Read or Write bits must be set */
789
790 if (!(data & 0x3UL))
791 return -EINVAL;
792
793 /* Set the Internal Debugging flag (IDM bit 1) for the DBCR0
794 register */
795 task->thread.dbcr0 = DBCR0_IDM;
796
797 /* Check for write and read flags and set DBCR0
798 accordingly */
799 if (data & 0x1UL)
800 task->thread.dbcr0 |= DBSR_DAC1R;
801 if (data & 0x2UL)
802 task->thread.dbcr0 |= DBSR_DAC1W;
803
804 task->thread.regs->msr |= MSR_DE;
805#endif
746 return 0; 806 return 0;
747} 807}
748 808
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index ad55488939c3..7aada783ec6a 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -145,8 +145,12 @@ int do_signal(sigset_t *oldset, struct pt_regs *regs)
145 * user space. The DABR will have been cleared if it 145 * user space. The DABR will have been cleared if it
146 * triggered inside the kernel. 146 * triggered inside the kernel.
147 */ 147 */
148 if (current->thread.dabr) 148 if (current->thread.dabr) {
149 set_dabr(current->thread.dabr); 149 set_dabr(current->thread.dabr);
150#if defined(CONFIG_44x) || defined(CONFIG_BOOKE)
151 mtspr(SPRN_DBCR0, current->thread.dbcr0);
152#endif
153 }
150 154
151 if (is32) { 155 if (is32) {
152 if (ka.sa.sa_flags & SA_SIGINFO) 156 if (ka.sa.sa_flags & SA_SIGINFO)
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index aba0ba95f062..800e5e9a087b 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -529,7 +529,8 @@ static void register_nodes(void)
529#endif 529#endif
530 530
531/* Only valid if CPU is present. */ 531/* Only valid if CPU is present. */
532static ssize_t show_physical_id(struct sys_device *dev, char *buf) 532static ssize_t show_physical_id(struct sys_device *dev,
533 struct sysdev_attribute *attr, char *buf)
533{ 534{
534 struct cpu *cpu = container_of(dev, struct cpu, sysdev); 535 struct cpu *cpu = container_of(dev, struct cpu, sysdev);
535 536
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 878fbddb6ae1..81ccb8dd1a54 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1067,6 +1067,22 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status)
1067 } 1067 }
1068 1068
1069 _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip); 1069 _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
1070 } else if (debug_status & (DBSR_DAC1R | DBSR_DAC1W)) {
1071 regs->msr &= ~MSR_DE;
1072
1073 if (user_mode(regs)) {
1074 current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W |
1075 DBCR0_IDM);
1076 } else {
1077 /* Disable DAC interrupts */
1078 mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R |
1079 DBSR_DAC1W | DBCR0_IDM));
1080
1081 /* Clear the DAC event */
1082 mtspr(SPRN_DBSR, (DBSR_DAC1R | DBSR_DAC1W));
1083 }
1084 /* Setup and send the trap to the handler */
1085 do_dabr(regs, mfspr(SPRN_DAC1), debug_status);
1070 } 1086 }
1071} 1087}
1072#endif /* CONFIG_4xx || CONFIG_BOOKE */ 1088#endif /* CONFIG_4xx || CONFIG_BOOKE */
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index b77f8af7ddde..ade8aeaa2e70 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -1,11 +1,12 @@
1/* 1/*
2 * IBM PowerPC Virtual I/O Infrastructure Support. 2 * IBM PowerPC Virtual I/O Infrastructure Support.
3 * 3 *
4 * Copyright (c) 2003-2005 IBM Corp. 4 * Copyright (c) 2003,2008 IBM Corp.
5 * Dave Engebretsen engebret@us.ibm.com 5 * Dave Engebretsen engebret@us.ibm.com
6 * Santiago Leon santil@us.ibm.com 6 * Santiago Leon santil@us.ibm.com
7 * Hollis Blanchard <hollisb@us.ibm.com> 7 * Hollis Blanchard <hollisb@us.ibm.com>
8 * Stephen Rothwell 8 * Stephen Rothwell
9 * Robert Jennings <rcjenn@us.ibm.com>
9 * 10 *
10 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License 12 * modify it under the terms of the GNU General Public License
@@ -46,6 +47,996 @@ static struct vio_dev vio_bus_device = { /* fake "parent" device */
46 .dev.bus = &vio_bus_type, 47 .dev.bus = &vio_bus_type,
47}; 48};
48 49
50#ifdef CONFIG_PPC_SMLPAR
51/**
52 * struct vio_cmo_pool - A pool of IO memory for CMO use
53 *
54 * @size: The size of the pool in bytes
55 * @free: The number of bytes in the pool that are not currently in use
56 */
57struct vio_cmo_pool {
58 size_t size;
59 size_t free;
60};
61
62/*
 * Delay applied to balance work queued from vio_cmo_dealloc().
 * NOTE(review): described as milliseconds, but the value is handed to
 * schedule_delayed_work() without conversion; that API is documented to
 * take jiffies, so the effective delay would depend on HZ -- confirm.
 */
63#define VIO_CMO_BALANCE_DELAY 100
64
65/* Portion out IO memory to CMO devices in chunks of this many bytes (128 KiB) */
66#define VIO_CMO_BALANCE_CHUNK 131072
67
68/**
69 * struct vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement
70 *
71 * @viodev: struct vio_dev pointer for the tracked device
72 * @list: linkage into vio_cmo.device_list, the list of tracked devices
73 */
74struct vio_cmo_dev_entry {
75 struct vio_dev *viodev;
76 struct list_head list;
77};
78
79/**
80 * struct vio_cmo - VIO bus accounting structure for CMO entitlement
81 *
82 * @lock: spinlock for entire structure
83 * @balance_q: delayed work item that runs vio_cmo_balance()
84 * @device_list: list of CMO-enabled devices requiring entitlement
85 * @entitled: total system entitlement in bytes
86 * @reserve: pool of memory from which devices reserve entitlement, incl. spare
87 * @excess: pool of excess entitlement not needed for device reserves or spare
88 * @spare: IO memory for device hotplug functionality
89 * @min: minimum necessary for system operation
90 * @desired: desired memory for system operation
91 * @curr: bytes currently allocated
92 * @high: high water mark for IO data usage
93 *
 * A single instance, vio_cmo, is defined together with the type.
 */
94struct vio_cmo {
95 spinlock_t lock;
96 struct delayed_work balance_q;
97 struct list_head device_list;
98 size_t entitled;
99 struct vio_cmo_pool reserve;
100 struct vio_cmo_pool excess;
101 size_t spare;
102 size_t min;
103 size_t desired;
104 size_t curr;
105 size_t high;
106} vio_cmo;
107
108/**
109 * vio_cmo_OF_devices - Count the number of OF devices that have DMA windows
110 */
111static int vio_cmo_num_OF_devs(void)
112{
113 struct device_node *node_vroot;
114 int count = 0;
115
116 /*
117 * Count the number of vdevice entries with an
118 * ibm,my-dma-window OF property
119 */
120 node_vroot = of_find_node_by_name(NULL, "vdevice");
121 if (node_vroot) {
122 struct device_node *of_node;
123 struct property *prop;
124
125 for_each_child_of_node(node_vroot, of_node) {
126 prop = of_find_property(of_node, "ibm,my-dma-window",
127 NULL);
128 if (prop)
129 count++;
130 }
131 }
132 of_node_put(node_vroot);
133 return count;
134}
135
136/**
137 * vio_cmo_alloc - allocate IO memory for CMO-enable devices
138 *
139 * @viodev: VIO device requesting IO memory
140 * @size: size of allocation requested
141 *
142 * Allocations come from memory reserved for the devices and any excess
143 * IO memory available to all devices. The spare pool used to service
144 * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be
145 * made available.
146 *
147 * Return codes:
148 * 0 for successful allocation and -ENOMEM for a failure
149 */
150static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size)
151{
152 unsigned long flags;
153 size_t reserve_free = 0;
154 size_t excess_free = 0;
155 int ret = -ENOMEM;
156
157 spin_lock_irqsave(&vio_cmo.lock, flags);
158
159 /* Determine the amount of free entitlement available in reserve */
160 if (viodev->cmo.entitled > viodev->cmo.allocated)
161 reserve_free = viodev->cmo.entitled - viodev->cmo.allocated;
162
163 /* If spare is not fulfilled, the excess pool can not be used. */
164 if (vio_cmo.spare >= VIO_CMO_MIN_ENT)
165 excess_free = vio_cmo.excess.free;
166
167 /* The request can be satisfied */
168 if ((reserve_free + excess_free) >= size) {
169 vio_cmo.curr += size;
170 if (vio_cmo.curr > vio_cmo.high)
171 vio_cmo.high = vio_cmo.curr;
172 viodev->cmo.allocated += size;
173 size -= min(reserve_free, size);
174 vio_cmo.excess.free -= size;
175 ret = 0;
176 }
177
178 spin_unlock_irqrestore(&vio_cmo.lock, flags);
179 return ret;
180}
181
182/**
183 * vio_cmo_dealloc - deallocate IO memory from CMO-enabled devices
184 * @viodev: VIO device freeing IO memory
185 * @size: size of deallocation
186 *
187 * IO memory is freed by the device back to the correct memory pools.
188 * The spare pool is replenished first from either memory pool, then
189 * the reserve pool is used to reduce device entitlement, the excess
190 * pool is used to increase the reserve pool toward the desired entitlement
191 * target, and then the remaining memory is returned to the pools.
192 *
 * The whole update runs under vio_cmo.lock.  Whenever entitlement is
 * moved between pools a delayed balance pass is queued.
193 */
194static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size)
195{
196 unsigned long flags;
197 size_t spare_needed = 0;
198 size_t excess_freed = 0;
199 size_t reserve_freed = size;
200 size_t tmp;
201 int balance = 0;
202
203 spin_lock_irqsave(&vio_cmo.lock, flags);
204 vio_cmo.curr -= size;
205
206 /*
 * Amount of memory freed from the excess pool: whatever the device
 * holds above its entitlement is treated as excess, and is freed
 * before anything is counted against the reserve.
 */
207 if (viodev->cmo.allocated > viodev->cmo.entitled) {
208 excess_freed = min(reserve_freed, (viodev->cmo.allocated -
209 viodev->cmo.entitled));
210 reserve_freed -= excess_freed;
211 }
212
213 /* Remove allocation from device (the two parts sum to @size) */
214 viodev->cmo.allocated -= (reserve_freed + excess_freed);
215
216 /*
 * Spare is a subset of the reserve pool, replenish it first.
 * NOTE(review): unsigned subtraction; presumably spare never
 * exceeds VIO_CMO_MIN_ENT -- confirm.
 */
217 spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare;
218
219 /*
220 * Replenish the spare in the reserve pool from the excess pool.
221 * This moves entitlement into the reserve pool.
222 */
223 if (spare_needed && excess_freed) {
224 tmp = min(excess_freed, spare_needed);
225 vio_cmo.excess.size -= tmp;
226 vio_cmo.reserve.size += tmp;
227 vio_cmo.spare += tmp;
228 excess_freed -= tmp;
229 spare_needed -= tmp;
230 balance = 1;
231 }
232
233 /*
234 * Replenish the spare in the reserve pool from the reserve pool.
235 * This removes entitlement from the device down to VIO_CMO_MIN_ENT,
236 * if needed, and gives it to the spare pool. The amount of used
237 * memory in this pool does not change.
 * NOTE(review): the inner subtraction assumes the device's
 * entitlement is at least VIO_CMO_MIN_ENT -- confirm.
238 */
239 if (spare_needed && reserve_freed) {
240 tmp = min(spare_needed, min(reserve_freed,
241 (viodev->cmo.entitled -
242 VIO_CMO_MIN_ENT)));
243
244 vio_cmo.spare += tmp;
245 viodev->cmo.entitled -= tmp;
246 reserve_freed -= tmp;
247 spare_needed -= tmp;
248 balance = 1;
249 }
250
251 /*
252 * Increase the reserve pool until the desired allocation is met.
253 * Move an allocation freed from the excess pool into the reserve
254 * pool and schedule a balance operation.
255 */
256 if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) {
257 tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size));
258
259 vio_cmo.excess.size -= tmp;
260 vio_cmo.reserve.size += tmp;
261 excess_freed -= tmp;
262 balance = 1;
263 }
264
265 /* Return memory from the excess pool to that pool */
266 if (excess_freed)
267 vio_cmo.excess.free += excess_freed;
268
 /*
  * NOTE(review): VIO_CMO_BALANCE_DELAY is passed unconverted;
  * schedule_delayed_work() is documented to take jiffies -- confirm
  * the intended unit.
  */
269 if (balance)
270 schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY);
271 spin_unlock_irqrestore(&vio_cmo.lock, flags);
272}
273
274/**
275 * vio_cmo_entitlement_update - Manage system entitlement changes
276 *
277 * @new_entitlement: new system entitlement to attempt to accommodate
278 *
279 * Increases in entitlement will be used to fulfill the spare entitlement
280 * and the rest is given to the excess pool. Decreases, if they are
281 * possible, come from the excess pool and from unused device entitlement.
282 *
 * On success a balance pass is queued with no delay.  The whole update
 * runs under vio_cmo.lock.
 *
283 * Returns: 0 on success, -ENOMEM when change can not be made
284 */
285int vio_cmo_entitlement_update(size_t new_entitlement)
286{
287 struct vio_dev *viodev;
288 struct vio_cmo_dev_entry *dev_ent;
289 unsigned long flags;
290 size_t avail, delta, tmp;
291
292 spin_lock_irqsave(&vio_cmo.lock, flags);
293
294 /* Entitlement increases */
295 if (new_entitlement > vio_cmo.entitled) {
296 delta = new_entitlement - vio_cmo.entitled;
297
298 /* Fulfill spare allocation */
299 if (vio_cmo.spare < VIO_CMO_MIN_ENT) {
300 tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare));
301 vio_cmo.spare += tmp;
302 vio_cmo.reserve.size += tmp;
303 delta -= tmp;
304 }
305
306 /*
 * Remaining new allocation goes to the excess pool.
 * NOTE(review): delta has already been reduced by the
 * amount moved into spare, so vio_cmo.entitled grows by
 * less than the requested increase when spare was short
 * -- confirm this is intended.
 */
307 vio_cmo.entitled += delta;
308 vio_cmo.excess.size += delta;
309 vio_cmo.excess.free += delta;
310
311 goto out;
312 }
313
314 /* Entitlement decreases */
315 delta = vio_cmo.entitled - new_entitlement;
316 avail = vio_cmo.excess.free;
317
318 /*
319 * Need to check how much unused entitlement each device can
320 * sacrifice to fulfill entitlement change.  A device can give up
 * what lies above both its current allocation and VIO_CMO_MIN_ENT.
321 */
322 list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
323 if (avail >= delta)
324 break;
325
326 viodev = dev_ent->viodev;
327 if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
328 (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
329 avail += viodev->cmo.entitled -
330 max_t(size_t, viodev->cmo.allocated,
331 VIO_CMO_MIN_ENT);
332 }
333
334 if (delta <= avail) {
335 vio_cmo.entitled -= delta;
336
337 /* Take entitlement from the excess pool first */
338 tmp = min(vio_cmo.excess.free, delta);
339 vio_cmo.excess.size -= tmp;
340 vio_cmo.excess.free -= tmp;
341 delta -= tmp;
342
343 /*
344 * Remove all but VIO_CMO_MIN_ENT bytes from devices
345 * until entitlement change is served
346 */
347 list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
348 if (!delta)
349 break;
350
351 viodev = dev_ent->viodev;
352 tmp = 0;
353 if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
354 (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
355 tmp = viodev->cmo.entitled -
356 max_t(size_t, viodev->cmo.allocated,
357 VIO_CMO_MIN_ENT);
358 viodev->cmo.entitled -= min(tmp, delta);
359 delta -= min(tmp, delta);
360 }
361 } else {
 /* Not enough reclaimable entitlement: nothing has been changed */
362 spin_unlock_irqrestore(&vio_cmo.lock, flags);
363 return -ENOMEM;
364 }
365
366out:
367 schedule_delayed_work(&vio_cmo.balance_q, 0);
368 spin_unlock_irqrestore(&vio_cmo.lock, flags);
369 return 0;
370}
371
372/**
373 * vio_cmo_balance - Balance entitlement among devices
374 *
375 * @work: work queue structure for this operation
376 *
377 * Any system entitlement above the minimum needed for devices, or
378 * already allocated to devices, can be distributed to the devices.
379 * The list of devices is iterated through to recalculate the desired
380 * entitlement level and to determine how much entitlement above the
381 * minimum entitlement is allocated to devices.
382 *
383 * Small chunks of the available entitlement are given to devices until
384 * their requirements are fulfilled or there is no entitlement left to give.
385 * Upon completion sizes of the reserve and excess pools are calculated.
386 *
387 * The system minimum entitlement level is also recalculated here.
388 * Entitlement will be reserved for devices even after vio_bus_remove to
389 * accommodate reloading the driver. The OF tree is walked to count the
390 * number of devices present and this will remove entitlement for devices
391 * that have actually left the system after having vio_bus_remove called.
 *
 * The whole pass runs under vio_cmo.lock.
392 */
393static void vio_cmo_balance(struct work_struct *work)
394{
395 struct vio_cmo *cmo;
396 struct vio_dev *viodev;
397 struct vio_cmo_dev_entry *dev_ent;
398 unsigned long flags;
399 size_t avail = 0, level, chunk, need;
400 int devcount = 0, fulfilled;
401
 /*
  * Recover the accounting structure embedding this work item.  The lock
  * and device list below are reached through the global vio_cmo instead.
  */
402 cmo = container_of(work, struct vio_cmo, balance_q.work);
403
404 spin_lock_irqsave(&vio_cmo.lock, flags);
405
406 /* Calculate minimum entitlement and fulfill spare */
407 cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT;
408 BUG_ON(cmo->min > cmo->entitled);
409 cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min));
410 cmo->min += cmo->spare;
411 cmo->desired = cmo->min;
412
413 /*
414 * Determine how much entitlement is available and reset device
415 * entitlements
 * NOTE(review): avail is unsigned; this presumably relies on the
 * per-device amounts never summing past entitled - spare -- confirm.
416 */
417 avail = cmo->entitled - cmo->spare;
418 list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
419 viodev = dev_ent->viodev;
420 devcount++;
421 viodev->cmo.entitled = VIO_CMO_MIN_ENT;
422 cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT);
423 avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT);
424 }
425
426 /*
427 * Having provided each device with the minimum entitlement, loop
428 * over the devices portioning out the remaining entitlement
429 * until there is nothing left.
430 */
431 level = VIO_CMO_MIN_ENT;
432 while (avail) {
433 fulfilled = 0;
434 list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
435 viodev = dev_ent->viodev;
436
 /* Device already has all it asked for at this level */
437 if (viodev->cmo.desired <= level) {
438 fulfilled++;
439 continue;
440 }
441
442 /*
443 * Give the device up to VIO_CMO_BALANCE_CHUNK
444 * bytes of entitlement, but do not exceed the
445 * desired level of entitlement for the device.
446 */
447 chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK);
448 chunk = min(chunk, (viodev->cmo.desired -
449 viodev->cmo.entitled));
450 viodev->cmo.entitled += chunk;
451
452 /*
453 * If the memory for this entitlement increase was
454 * already allocated to the device it does not come
455 * from the available pool being portioned out.
456 */
457 need = max(viodev->cmo.allocated, viodev->cmo.entitled)-
458 max(viodev->cmo.allocated, level);
459 avail -= need;
460
461 }
 /* Stop once every device has reached its desired level */
462 if (fulfilled == devcount)
463 break;
464 level += VIO_CMO_BALANCE_CHUNK;
465 }
466
467 /* Calculate new reserve and excess pool sizes */
468 cmo->reserve.size = cmo->min;
469 cmo->excess.free = 0;
470 cmo->excess.size = 0;
471 need = 0;
472 list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
473 viodev = dev_ent->viodev;
474 /* Calculated reserve size above the minimum entitlement */
475 if (viodev->cmo.entitled)
476 cmo->reserve.size += (viodev->cmo.entitled -
477 VIO_CMO_MIN_ENT);
478 /* Calculated used excess entitlement */
479 if (viodev->cmo.allocated > viodev->cmo.entitled)
480 need += viodev->cmo.allocated - viodev->cmo.entitled;
481 }
482 cmo->excess.size = cmo->entitled - cmo->reserve.size;
483 cmo->excess.free = cmo->excess.size - need;
484
 /* Cancel any further pending run of this same work item */
485 cancel_delayed_work(container_of(work, struct delayed_work, work));
486 spin_unlock_irqrestore(&vio_cmo.lock, flags);
487}
488
489static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
490 dma_addr_t *dma_handle, gfp_t flag)
491{
492 struct vio_dev *viodev = to_vio_dev(dev);
493 void *ret;
494
495 if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
496 atomic_inc(&viodev->cmo.allocs_failed);
497 return NULL;
498 }
499
500 ret = dma_iommu_ops.alloc_coherent(dev, size, dma_handle, flag);
501 if (unlikely(ret == NULL)) {
502 vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
503 atomic_inc(&viodev->cmo.allocs_failed);
504 }
505
506 return ret;
507}
508
509static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
510 void *vaddr, dma_addr_t dma_handle)
511{
512 struct vio_dev *viodev = to_vio_dev(dev);
513
514 dma_iommu_ops.free_coherent(dev, size, vaddr, dma_handle);
515
516 vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
517}
518
519static dma_addr_t vio_dma_iommu_map_single(struct device *dev, void *vaddr,
520 size_t size,
521 enum dma_data_direction direction,
522 struct dma_attrs *attrs)
523{
524 struct vio_dev *viodev = to_vio_dev(dev);
525 dma_addr_t ret = DMA_ERROR_CODE;
526
527 if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
528 atomic_inc(&viodev->cmo.allocs_failed);
529 return ret;
530 }
531
532 ret = dma_iommu_ops.map_single(dev, vaddr, size, direction, attrs);
533 if (unlikely(dma_mapping_error(ret))) {
534 vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
535 atomic_inc(&viodev->cmo.allocs_failed);
536 }
537
538 return ret;
539}
540
541static void vio_dma_iommu_unmap_single(struct device *dev,
542 dma_addr_t dma_handle, size_t size,
543 enum dma_data_direction direction,
544 struct dma_attrs *attrs)
545{
546 struct vio_dev *viodev = to_vio_dev(dev);
547
548 dma_iommu_ops.unmap_single(dev, dma_handle, size, direction, attrs);
549
550 vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
551}
552
553static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
554 int nelems, enum dma_data_direction direction,
555 struct dma_attrs *attrs)
556{
557 struct vio_dev *viodev = to_vio_dev(dev);
558 struct scatterlist *sgl;
559 int ret, count = 0;
560 size_t alloc_size = 0;
561
562 for (sgl = sglist; count < nelems; count++, sgl++)
563 alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE);
564
565 if (vio_cmo_alloc(viodev, alloc_size)) {
566 atomic_inc(&viodev->cmo.allocs_failed);
567 return 0;
568 }
569
570 ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs);
571
572 if (unlikely(!ret)) {
573 vio_cmo_dealloc(viodev, alloc_size);
574 atomic_inc(&viodev->cmo.allocs_failed);
575 }
576
577 for (sgl = sglist, count = 0; count < ret; count++, sgl++)
578 alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
579 if (alloc_size)
580 vio_cmo_dealloc(viodev, alloc_size);
581
582 return ret;
583}
584
585static void vio_dma_iommu_unmap_sg(struct device *dev,
586 struct scatterlist *sglist, int nelems,
587 enum dma_data_direction direction,
588 struct dma_attrs *attrs)
589{
590 struct vio_dev *viodev = to_vio_dev(dev);
591 struct scatterlist *sgl;
592 size_t alloc_size = 0;
593 int count = 0;
594
595 for (sgl = sglist; count < nelems; count++, sgl++)
596 alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
597
598 dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
599
600 vio_cmo_dealloc(viodev, alloc_size);
601}
602
/*
 * DMA operations for CMO-enabled VIO devices: each entry is one of the
 * vio_dma_iommu_* wrappers defined in this file.  The dma_supported hook
 * is not set here; vio_cmo_set_dma_ops() copies it from dma_iommu_ops.
 */
603struct dma_mapping_ops vio_dma_mapping_ops = {
604 .alloc_coherent = vio_dma_iommu_alloc_coherent,
605 .free_coherent = vio_dma_iommu_free_coherent,
606 .map_single = vio_dma_iommu_map_single,
607 .unmap_single = vio_dma_iommu_unmap_single,
608 .map_sg = vio_dma_iommu_map_sg,
609 .unmap_sg = vio_dma_iommu_unmap_sg,
610};
611
612/**
613 * vio_cmo_set_dev_desired - Set desired entitlement for a device
614 *
615 * @viodev: struct vio_dev for device to alter
616 * @new_desired: new desired entitlement level in bytes
617 *
618 * For use by devices to request a change to their entitlement at runtime or
619 * through sysfs. The desired entitlement level is changed and a balancing
620 * of system resources is scheduled to run in the future.
621 */
622void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired)
623{
624 unsigned long flags;
625 struct vio_cmo_dev_entry *dev_ent;
626 int found = 0;
627
628 if (!firmware_has_feature(FW_FEATURE_CMO))
629 return;
630
631 spin_lock_irqsave(&vio_cmo.lock, flags);
632 if (desired < VIO_CMO_MIN_ENT)
633 desired = VIO_CMO_MIN_ENT;
634
635 /*
636 * Changes will not be made for devices not in the device list.
637 * If it is not in the device list, then no driver is loaded
638 * for the device and it can not receive entitlement.
639 */
640 list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
641 if (viodev == dev_ent->viodev) {
642 found = 1;
643 break;
644 }
645 if (!found)
646 return;
647
648 /* Increase/decrease in desired device entitlement */
649 if (desired >= viodev->cmo.desired) {
650 /* Just bump the bus and device values prior to a balance*/
651 vio_cmo.desired += desired - viodev->cmo.desired;
652 viodev->cmo.desired = desired;
653 } else {
654 /* Decrease bus and device values for desired entitlement */
655 vio_cmo.desired -= viodev->cmo.desired - desired;
656 viodev->cmo.desired = desired;
657 /*
658 * If less entitlement is desired than current entitlement, move
659 * any reserve memory in the change region to the excess pool.
660 */
661 if (viodev->cmo.entitled > desired) {
662 vio_cmo.reserve.size -= viodev->cmo.entitled - desired;
663 vio_cmo.excess.size += viodev->cmo.entitled - desired;
664 /*
665 * If entitlement moving from the reserve pool to the
666 * excess pool is currently unused, add to the excess
667 * free counter.
668 */
669 if (viodev->cmo.allocated < viodev->cmo.entitled)
670 vio_cmo.excess.free += viodev->cmo.entitled -
671 max(viodev->cmo.allocated, desired);
672 viodev->cmo.entitled = desired;
673 }
674 }
675 schedule_delayed_work(&vio_cmo.balance_q, 0);
676 spin_unlock_irqrestore(&vio_cmo.lock, flags);
677}
678
679/**
680 * vio_cmo_bus_probe - Handle CMO specific bus probe activities
681 *
682 * @viodev - Pointer to struct vio_dev for device
683 *
684 * Determine the devices IO memory entitlement needs, attempting
685 * to satisfy the system minimum entitlement at first and scheduling
686 * a balance operation to take care of the rest at a later time.
687 *
688 * Returns: 0 on success, -EINVAL when device doesn't support CMO, and
689 * -ENOMEM when entitlement is not available for device or
690 * device entry.
691 *
692 */
693static int vio_cmo_bus_probe(struct vio_dev *viodev)
694{
695 struct vio_cmo_dev_entry *dev_ent;
696 struct device *dev = &viodev->dev;
697 struct vio_driver *viodrv = to_vio_driver(dev->driver);
698 unsigned long flags;
699 size_t size;
700
701 /*
702 * Check to see that device has a DMA window and configure
703 * entitlement for the device.
704 */
705 if (of_get_property(viodev->dev.archdata.of_node,
706 "ibm,my-dma-window", NULL)) {
707 /* Check that the driver is CMO enabled and get desired DMA */
708 if (!viodrv->get_desired_dma) {
709 dev_err(dev, "%s: device driver does not support CMO\n",
710 __func__);
711 return -EINVAL;
712 }
713
714 viodev->cmo.desired = IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev));
715 if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
716 viodev->cmo.desired = VIO_CMO_MIN_ENT;
717 size = VIO_CMO_MIN_ENT;
718
719 dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry),
720 GFP_KERNEL);
721 if (!dev_ent)
722 return -ENOMEM;
723
724 dev_ent->viodev = viodev;
725 spin_lock_irqsave(&vio_cmo.lock, flags);
726 list_add(&dev_ent->list, &vio_cmo.device_list);
727 } else {
728 viodev->cmo.desired = 0;
729 size = 0;
730 spin_lock_irqsave(&vio_cmo.lock, flags);
731 }
732
733 /*
734 * If the needs for vio_cmo.min have not changed since they
735 * were last set, the number of devices in the OF tree has
736 * been constant and the IO memory for this is already in
737 * the reserve pool.
738 */
739 if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) *
740 VIO_CMO_MIN_ENT)) {
741 /* Updated desired entitlement if device requires it */
742 if (size)
743 vio_cmo.desired += (viodev->cmo.desired -
744 VIO_CMO_MIN_ENT);
745 } else {
746 size_t tmp;
747
748 tmp = vio_cmo.spare + vio_cmo.excess.free;
749 if (tmp < size) {
750 dev_err(dev, "%s: insufficient free "
751 "entitlement to add device. "
752 "Need %lu, have %lu\n", __func__,
753 size, (vio_cmo.spare + tmp));
754 spin_unlock_irqrestore(&vio_cmo.lock, flags);
755 return -ENOMEM;
756 }
757
758 /* Use excess pool first to fulfill request */
759 tmp = min(size, vio_cmo.excess.free);
760 vio_cmo.excess.free -= tmp;
761 vio_cmo.excess.size -= tmp;
762 vio_cmo.reserve.size += tmp;
763
764 /* Use spare if excess pool was insufficient */
765 vio_cmo.spare -= size - tmp;
766
767 /* Update bus accounting */
768 vio_cmo.min += size;
769 vio_cmo.desired += viodev->cmo.desired;
770 }
771 spin_unlock_irqrestore(&vio_cmo.lock, flags);
772 return 0;
773}
774
775/**
776 * vio_cmo_bus_remove - Handle CMO specific bus removal activities
777 *
778 * @viodev: Pointer to struct vio_dev for device
779 *
780 * Remove the device from the cmo device list. The minimum entitlement
781 * will be reserved for the device as long as it is in the system. The
782 * rest of the entitlement the device had been allocated will be returned
783 * to the system.
 *
 * Calls BUG() if the device still has IO memory allocated.  The whole
 * update runs under vio_cmo.lock.
784 */
785static void vio_cmo_bus_remove(struct vio_dev *viodev)
786{
787 struct vio_cmo_dev_entry *dev_ent;
788 unsigned long flags;
789 size_t tmp;
790
791 spin_lock_irqsave(&vio_cmo.lock, flags);
792 if (viodev->cmo.allocated) {
793 dev_err(&viodev->dev, "%s: device had %lu bytes of IO "
794 "allocated after remove operation.\n",
795 __func__, viodev->cmo.allocated);
796 BUG();
797 }
798
799 /*
800 * Remove the device from the device list being maintained for
801 * CMO enabled devices.  The loop stops right after the entry is
 * freed, so the freed entry is never used to continue the walk.
802 */
803 list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
804 if (viodev == dev_ent->viodev) {
805 list_del(&dev_ent->list);
806 kfree(dev_ent);
807 break;
808 }
809
810 /*
811 * Devices may not require any entitlement and they do not need
812 * to be processed. Otherwise, return the device's entitlement
813 * back to the pools.
814 */
815 if (viodev->cmo.entitled) {
816 /*
817 * This device has not yet left the OF tree, its
818 * minimum entitlement remains in vio_cmo.min and
819 * vio_cmo.desired
820 */
821 vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT);
822
823 /*
824 * Save min allocation for device in reserve as long
825 * as it exists in OF tree as determined by later
826 * balance operation.  From here on cmo.entitled holds only
 * the amount above the minimum that is being given back.
827 */
828 viodev->cmo.entitled -= VIO_CMO_MIN_ENT;
829
830 /* Replenish spare from freed reserve pool */
831 if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) {
832 tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT -
833 vio_cmo.spare));
834 vio_cmo.spare += tmp;
835 viodev->cmo.entitled -= tmp;
836 }
837
838 /* Remaining reserve goes to excess pool */
839 vio_cmo.excess.size += viodev->cmo.entitled;
840 vio_cmo.excess.free += viodev->cmo.entitled;
841 vio_cmo.reserve.size -= viodev->cmo.entitled;
842
843 /*
844 * Until the device is removed it will keep a
845 * minimum entitlement; this will guarantee that
846 * a module unload/load will result in a success.
847 */
848 viodev->cmo.entitled = VIO_CMO_MIN_ENT;
849 viodev->cmo.desired = VIO_CMO_MIN_ENT;
850 atomic_set(&viodev->cmo.allocs_failed, 0);
851 }
852
853 spin_unlock_irqrestore(&vio_cmo.lock, flags);
854}
855
856static void vio_cmo_set_dma_ops(struct vio_dev *viodev)
857{
858 vio_dma_mapping_ops.dma_supported = dma_iommu_ops.dma_supported;
859 viodev->dev.archdata.dma_ops = &vio_dma_mapping_ops;
860}
861
862/**
863 * vio_cmo_bus_init - CMO entitlement initialization at bus init time
864 *
865 * Set up the reserve and excess entitlement pools based on available
866 * system entitlement and the number of devices in the OF tree that
867 * require entitlement in the reserve pool.
 *
 * Panics when the system entitlement cannot cover the spare reservation
 * plus %VIO_CMO_MIN_ENT for every counted device.
868 */
869static void vio_cmo_bus_init(void)
870{
871 struct hvcall_mpp_data mpp_data;
872 int err;
873
 /* Start from a zeroed accounting structure, then set up its members */
874 memset(&vio_cmo, 0, sizeof(struct vio_cmo));
875 spin_lock_init(&vio_cmo.lock);
876 INIT_LIST_HEAD(&vio_cmo.device_list);
877 INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance);
878
879 /* Get current system entitlement */
880 err = h_get_mpp(&mpp_data);
881
882 /*
883 * On failure, continue with entitlement set to 0, will panic()
884 * later when spare is reserved.
885 */
886 if (err != H_SUCCESS) {
887 printk(KERN_ERR "%s: unable to determine system IO "\
888 "entitlement. (%d)\n", __func__, err);
889 vio_cmo.entitled = 0;
890 } else {
891 vio_cmo.entitled = mpp_data.entitled_mem;
892 }
893
894 /*
 * Set reservation and check against entitlement: the reserve pool
 * starts as the spare plus one minimum entitlement per counted device.
 */
895 vio_cmo.spare = VIO_CMO_MIN_ENT;
896 vio_cmo.reserve.size = vio_cmo.spare;
897 vio_cmo.reserve.size += (vio_cmo_num_OF_devs() *
898 VIO_CMO_MIN_ENT);
899 if (vio_cmo.reserve.size > vio_cmo.entitled) {
900 printk(KERN_ERR "%s: insufficient system entitlement\n",
901 __func__);
902 panic("%s: Insufficient system entitlement", __func__);
903 }
904
905 /*
 * Set the remaining accounting variables: everything not reserved is
 * excess and entirely free; min and desired start at the reserve size.
 */
906 vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size;
907 vio_cmo.excess.free = vio_cmo.excess.size;
908 vio_cmo.min = vio_cmo.reserve.size;
909 vio_cmo.desired = vio_cmo.reserve.size;
910}
911
912/* sysfs device functions and data structures for CMO */
913
/*
 * Generate viodev_cmo_<name>_show(): a sysfs show routine that prints the
 * device's cmo.<name> field with "%lu" followed by a newline.
 */
#define viodev_cmo_rd_attr(name)					\
static ssize_t viodev_cmo_##name##_show(struct device *dev,		\
					struct device_attribute *attr,	\
					char *buf)			\
{									\
	struct vio_dev *viodev = to_vio_dev(dev);			\
									\
	return sprintf(buf, "%lu\n", viodev->cmo.name);			\
}
921
922static ssize_t viodev_cmo_allocs_failed_show(struct device *dev,
923 struct device_attribute *attr, char *buf)
924{
925 struct vio_dev *viodev = to_vio_dev(dev);
926 return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed));
927}
928
929static ssize_t viodev_cmo_allocs_failed_reset(struct device *dev,
930 struct device_attribute *attr, const char *buf, size_t count)
931{
932 struct vio_dev *viodev = to_vio_dev(dev);
933 atomic_set(&viodev->cmo.allocs_failed, 0);
934 return count;
935}
936
937static ssize_t viodev_cmo_desired_set(struct device *dev,
938 struct device_attribute *attr, const char *buf, size_t count)
939{
940 struct vio_dev *viodev = to_vio_dev(dev);
941 size_t new_desired;
942 int ret;
943
944 ret = strict_strtoul(buf, 10, &new_desired);
945 if (ret)
946 return ret;
947
948 vio_cmo_set_dev_desired(viodev, new_desired);
949 return count;
950}
951
952viodev_cmo_rd_attr(desired);
953viodev_cmo_rd_attr(entitled);
954viodev_cmo_rd_attr(allocated);
955
956static ssize_t name_show(struct device *, struct device_attribute *, char *);
957static ssize_t devspec_show(struct device *, struct device_attribute *, char *);
958static struct device_attribute vio_cmo_dev_attrs[] = {
959 __ATTR_RO(name),
960 __ATTR_RO(devspec),
961 __ATTR(cmo_desired, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
962 viodev_cmo_desired_show, viodev_cmo_desired_set),
963 __ATTR(cmo_entitled, S_IRUGO, viodev_cmo_entitled_show, NULL),
964 __ATTR(cmo_allocated, S_IRUGO, viodev_cmo_allocated_show, NULL),
965 __ATTR(cmo_allocs_failed, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
966 viodev_cmo_allocs_failed_show, viodev_cmo_allocs_failed_reset),
967 __ATTR_NULL
968};
969
970/* sysfs bus functions and data structures for CMO */
971
/*
 * Generate viobus_cmo_<name>_show(): prints the bus-wide vio_cmo.<name>
 * value with "%lu" followed by a newline.
 */
#define viobus_cmo_rd_attr(name)					\
static ssize_t viobus_cmo_##name##_show(struct bus_type *bt,		\
					char *buf)			\
{									\
	return sprintf(buf, "%lu\n", vio_cmo.name);			\
}

/*
 * Generate viobus_cmo_<name>_pool_show_<var>(): prints the <var> member of
 * the vio_cmo.<name> pool with "%lu" followed by a newline.
 */
#define viobus_cmo_pool_rd_attr(name, var)				\
static ssize_t viobus_cmo_##name##_pool_show_##var(struct bus_type *bt,	\
						   char *buf)		\
{									\
	return sprintf(buf, "%lu\n", vio_cmo.name.var);			\
}
985
986static ssize_t viobus_cmo_high_reset(struct bus_type *bt, const char *buf,
987 size_t count)
988{
989 unsigned long flags;
990
991 spin_lock_irqsave(&vio_cmo.lock, flags);
992 vio_cmo.high = vio_cmo.curr;
993 spin_unlock_irqrestore(&vio_cmo.lock, flags);
994
995 return count;
996}
997
998viobus_cmo_rd_attr(entitled);
999viobus_cmo_pool_rd_attr(reserve, size);
1000viobus_cmo_pool_rd_attr(excess, size);
1001viobus_cmo_pool_rd_attr(excess, free);
1002viobus_cmo_rd_attr(spare);
1003viobus_cmo_rd_attr(min);
1004viobus_cmo_rd_attr(desired);
1005viobus_cmo_rd_attr(curr);
1006viobus_cmo_rd_attr(high);
1007
1008static struct bus_attribute vio_cmo_bus_attrs[] = {
1009 __ATTR(cmo_entitled, S_IRUGO, viobus_cmo_entitled_show, NULL),
1010 __ATTR(cmo_reserve_size, S_IRUGO, viobus_cmo_reserve_pool_show_size, NULL),
1011 __ATTR(cmo_excess_size, S_IRUGO, viobus_cmo_excess_pool_show_size, NULL),
1012 __ATTR(cmo_excess_free, S_IRUGO, viobus_cmo_excess_pool_show_free, NULL),
1013 __ATTR(cmo_spare, S_IRUGO, viobus_cmo_spare_show, NULL),
1014 __ATTR(cmo_min, S_IRUGO, viobus_cmo_min_show, NULL),
1015 __ATTR(cmo_desired, S_IRUGO, viobus_cmo_desired_show, NULL),
1016 __ATTR(cmo_curr, S_IRUGO, viobus_cmo_curr_show, NULL),
1017 __ATTR(cmo_high, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
1018 viobus_cmo_high_show, viobus_cmo_high_reset),
1019 __ATTR_NULL
1020};
1021
1022static void vio_cmo_sysfs_init(void)
1023{
1024 vio_bus_type.dev_attrs = vio_cmo_dev_attrs;
1025 vio_bus_type.bus_attrs = vio_cmo_bus_attrs;
1026}
1027#else /* CONFIG_PPC_SMLPAR */
/*
 * Dummy functions for iSeries platform.
 *
 * No-op stand-ins used when CONFIG_PPC_SMLPAR is not set. The two
 * parameterless stubs are declared with (void) so they are real prototypes,
 * matching the signatures of the CONFIG_PPC_SMLPAR implementations.
 */
int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; }
static void vio_cmo_bus_remove(struct vio_dev *viodev) {}
static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {}
static void vio_cmo_bus_init(void) {}
static void vio_cmo_sysfs_init(void) {}
1036#endif /* CONFIG_PPC_SMLPAR */
1037EXPORT_SYMBOL(vio_cmo_entitlement_update);
1038EXPORT_SYMBOL(vio_cmo_set_dev_desired);
1039
49static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) 1040static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
50{ 1041{
51 const unsigned char *dma_window; 1042 const unsigned char *dma_window;
@@ -114,8 +1105,17 @@ static int vio_bus_probe(struct device *dev)
114 return error; 1105 return error;
115 1106
116 id = vio_match_device(viodrv->id_table, viodev); 1107 id = vio_match_device(viodrv->id_table, viodev);
117 if (id) 1108 if (id) {
1109 memset(&viodev->cmo, 0, sizeof(viodev->cmo));
1110 if (firmware_has_feature(FW_FEATURE_CMO)) {
1111 error = vio_cmo_bus_probe(viodev);
1112 if (error)
1113 return error;
1114 }
118 error = viodrv->probe(viodev, id); 1115 error = viodrv->probe(viodev, id);
1116 if (error)
1117 vio_cmo_bus_remove(viodev);
1118 }
119 1119
120 return error; 1120 return error;
121} 1121}
@@ -125,12 +1125,23 @@ static int vio_bus_remove(struct device *dev)
125{ 1125{
126 struct vio_dev *viodev = to_vio_dev(dev); 1126 struct vio_dev *viodev = to_vio_dev(dev);
127 struct vio_driver *viodrv = to_vio_driver(dev->driver); 1127 struct vio_driver *viodrv = to_vio_driver(dev->driver);
1128 struct device *devptr;
1129 int ret = 1;
1130
1131 /*
1132 * Hold a reference to the device after the remove function is called
1133 * to allow for CMO accounting cleanup for the device.
1134 */
1135 devptr = get_device(dev);
128 1136
129 if (viodrv->remove) 1137 if (viodrv->remove)
130 return viodrv->remove(viodev); 1138 ret = viodrv->remove(viodev);
1139
1140 if (!ret && firmware_has_feature(FW_FEATURE_CMO))
1141 vio_cmo_bus_remove(viodev);
131 1142
132 /* driver can't remove */ 1143 put_device(devptr);
133 return 1; 1144 return ret;
134} 1145}
135 1146
136/** 1147/**
@@ -215,7 +1226,11 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
215 viodev->unit_address = *unit_address; 1226 viodev->unit_address = *unit_address;
216 } 1227 }
217 viodev->dev.archdata.of_node = of_node_get(of_node); 1228 viodev->dev.archdata.of_node = of_node_get(of_node);
218 viodev->dev.archdata.dma_ops = &dma_iommu_ops; 1229
1230 if (firmware_has_feature(FW_FEATURE_CMO))
1231 vio_cmo_set_dma_ops(viodev);
1232 else
1233 viodev->dev.archdata.dma_ops = &dma_iommu_ops;
219 viodev->dev.archdata.dma_data = vio_build_iommu_table(viodev); 1234 viodev->dev.archdata.dma_data = vio_build_iommu_table(viodev);
220 viodev->dev.archdata.numa_node = of_node_to_nid(of_node); 1235 viodev->dev.archdata.numa_node = of_node_to_nid(of_node);
221 1236
@@ -245,6 +1260,9 @@ static int __init vio_bus_init(void)
245 int err; 1260 int err;
246 struct device_node *node_vroot; 1261 struct device_node *node_vroot;
247 1262
1263 if (firmware_has_feature(FW_FEATURE_CMO))
1264 vio_cmo_sysfs_init();
1265
248 err = bus_register(&vio_bus_type); 1266 err = bus_register(&vio_bus_type);
249 if (err) { 1267 if (err) {
250 printk(KERN_ERR "failed to register VIO bus\n"); 1268 printk(KERN_ERR "failed to register VIO bus\n");
@@ -262,6 +1280,9 @@ static int __init vio_bus_init(void)
262 return err; 1280 return err;
263 } 1281 }
264 1282
1283 if (firmware_has_feature(FW_FEATURE_CMO))
1284 vio_cmo_bus_init();
1285
265 node_vroot = of_find_node_by_name(NULL, "vdevice"); 1286 node_vroot = of_find_node_by_name(NULL, "vdevice");
266 if (node_vroot) { 1287 if (node_vroot) {
267 struct device_node *of_node; 1288 struct device_node *of_node;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index a914411bced5..4a8ce62fe112 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -85,7 +85,7 @@ SECTIONS
85 85
86 /* The dummy segment contents for the bug workaround mentioned above 86 /* The dummy segment contents for the bug workaround mentioned above
87 near PHDRS. */ 87 near PHDRS. */
88 .dummy : { 88 .dummy : AT(ADDR(.dummy) - LOAD_OFFSET) {
89 LONG(0xf177) 89 LONG(0xf177)
90 } :kernel :dummy 90 } :kernel :dummy
91 91
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 1707d00331fc..565b7a237c84 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -100,31 +100,6 @@ static int store_updates_sp(struct pt_regs *regs)
100 return 0; 100 return 0;
101} 101}
102 102
103#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
104static void do_dabr(struct pt_regs *regs, unsigned long address,
105 unsigned long error_code)
106{
107 siginfo_t info;
108
109 if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
110 11, SIGSEGV) == NOTIFY_STOP)
111 return;
112
113 if (debugger_dabr_match(regs))
114 return;
115
116 /* Clear the DABR */
117 set_dabr(0);
118
119 /* Deliver the signal to userspace */
120 info.si_signo = SIGTRAP;
121 info.si_errno = 0;
122 info.si_code = TRAP_HWBKPT;
123 info.si_addr = (void __user *)address;
124 force_sig_info(SIGTRAP, &info, current);
125}
126#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
127
128/* 103/*
129 * For 600- and 800-family processors, the error_code parameter is DSISR 104 * For 600- and 800-family processors, the error_code parameter is DSISR
130 * for a data fault, SRR1 for an instruction fault. For 400-family processors 105 * for a data fault, SRR1 for an instruction fault. For 400-family processors
diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index ccbd4958412e..696a5ee4962d 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -1,7 +1,6 @@
1config PPC_MPC52xx 1config PPC_MPC52xx
2 bool "52xx-based boards" 2 bool "52xx-based boards"
3 depends on PPC_MULTIPLATFORM && PPC32 3 depends on PPC_MULTIPLATFORM && PPC32
4 select FSL_SOC
5 select PPC_CLOCK 4 select PPC_CLOCK
6 select PPC_PCI_CHOICE 5 select PPC_PCI_CHOICE
7 6
@@ -49,5 +48,6 @@ config PPC_MPC5200_GPIO
49 bool "MPC5200 GPIO support" 48 bool "MPC5200 GPIO support"
50 depends on PPC_MPC52xx 49 depends on PPC_MPC52xx
51 select ARCH_REQUIRE_GPIOLIB 50 select ARCH_REQUIRE_GPIOLIB
51 select GENERIC_GPIO
52 help 52 help
53 Enable gpiolib support for mpc5200 based boards 53 Enable gpiolib support for mpc5200 based boards
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 208005ca262c..e06420af5fe9 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -172,7 +172,7 @@ static void invalidate_tce_cache(struct cbe_iommu *iommu, unsigned long *pte,
172 } 172 }
173} 173}
174 174
175static void tce_build_cell(struct iommu_table *tbl, long index, long npages, 175static int tce_build_cell(struct iommu_table *tbl, long index, long npages,
176 unsigned long uaddr, enum dma_data_direction direction, 176 unsigned long uaddr, enum dma_data_direction direction,
177 struct dma_attrs *attrs) 177 struct dma_attrs *attrs)
178{ 178{
@@ -213,6 +213,7 @@ static void tce_build_cell(struct iommu_table *tbl, long index, long npages,
213 213
214 pr_debug("tce_build_cell(index=%lx,n=%lx,dir=%d,base_pte=%lx)\n", 214 pr_debug("tce_build_cell(index=%lx,n=%lx,dir=%d,base_pte=%lx)\n",
215 index, npages, direction, base_pte); 215 index, npages, direction, base_pte);
216 return 0;
216} 217}
217 218
218static void tce_free_cell(struct iommu_table *tbl, long index, long npages) 219static void tce_free_cell(struct iommu_table *tbl, long index, long npages)
@@ -1150,12 +1151,23 @@ static int iommu_fixed_disabled;
1150 1151
1151static int __init setup_iommu_fixed(char *str) 1152static int __init setup_iommu_fixed(char *str)
1152{ 1153{
1154 struct device_node *pciep;
1155
1153 if (strcmp(str, "off") == 0) 1156 if (strcmp(str, "off") == 0)
1154 iommu_fixed_disabled = 1; 1157 iommu_fixed_disabled = 1;
1155 1158
1156 else if (strcmp(str, "weak") == 0) 1159 /* If we can find a pcie-endpoint in the device tree assume that
1160 * we're on a triblade or a CAB so by default the fixed mapping
1161 * should be set to be weakly ordered; but only if the boot
1162 * option WASN'T set for strong ordering
1163 */
1164 pciep = of_find_node_by_type(NULL, "pcie-endpoint");
1165
1166 if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0))
1157 iommu_fixed_is_weak = 1; 1167 iommu_fixed_is_weak = 1;
1158 1168
1169 of_node_put(pciep);
1170
1159 return 1; 1171 return 1;
1160} 1172}
1161__setup("iommu_fixed=", setup_iommu_fixed); 1173__setup("iommu_fixed=", setup_iommu_fixed);
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 34654743363d..2deeeba7eccf 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -312,11 +312,28 @@ static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
312 */ 312 */
313 node = cpu_to_node(raw_smp_processor_id()); 313 node = cpu_to_node(raw_smp_processor_id());
314 for (n = 0; n < MAX_NUMNODES; n++, node++) { 314 for (n = 0; n < MAX_NUMNODES; n++, node++) {
315 int available_spus;
316
315 node = (node < MAX_NUMNODES) ? node : 0; 317 node = (node < MAX_NUMNODES) ? node : 0;
316 if (!node_allowed(ctx, node)) 318 if (!node_allowed(ctx, node))
317 continue; 319 continue;
320
321 available_spus = 0;
318 mutex_lock(&cbe_spu_info[node].list_mutex); 322 mutex_lock(&cbe_spu_info[node].list_mutex);
319 list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) { 323 list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
324 if (spu->ctx && spu->ctx->gang
325 && spu->ctx->aff_offset == 0)
326 available_spus -=
327 (spu->ctx->gang->contexts - 1);
328 else
329 available_spus++;
330 }
331 if (available_spus < ctx->gang->contexts) {
332 mutex_unlock(&cbe_spu_info[node].list_mutex);
333 continue;
334 }
335
336 list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
320 if ((!mem_aff || spu->has_mem_affinity) && 337 if ((!mem_aff || spu->has_mem_affinity) &&
321 sched_spu(spu)) { 338 sched_spu(spu)) {
322 mutex_unlock(&cbe_spu_info[node].list_mutex); 339 mutex_unlock(&cbe_spu_info[node].list_mutex);
@@ -389,6 +406,9 @@ static int has_affinity(struct spu_context *ctx)
389 if (list_empty(&ctx->aff_list)) 406 if (list_empty(&ctx->aff_list))
390 return 0; 407 return 0;
391 408
409 if (atomic_read(&ctx->gang->aff_sched_count) == 0)
410 ctx->gang->aff_ref_spu = NULL;
411
392 if (!gang->aff_ref_spu) { 412 if (!gang->aff_ref_spu) {
393 if (!(gang->aff_flags & AFF_MERGED)) 413 if (!(gang->aff_flags & AFF_MERGED))
394 aff_merge_remaining_ctxs(gang); 414 aff_merge_remaining_ctxs(gang);
@@ -416,14 +436,8 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
416 if (spu->ctx->flags & SPU_CREATE_NOSCHED) 436 if (spu->ctx->flags & SPU_CREATE_NOSCHED)
417 atomic_dec(&cbe_spu_info[spu->node].reserved_spus); 437 atomic_dec(&cbe_spu_info[spu->node].reserved_spus);
418 438
419 if (ctx->gang){ 439 if (ctx->gang)
420 mutex_lock(&ctx->gang->aff_mutex); 440 atomic_dec_if_positive(&ctx->gang->aff_sched_count);
421 if (has_affinity(ctx)) {
422 if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
423 ctx->gang->aff_ref_spu = NULL;
424 }
425 mutex_unlock(&ctx->gang->aff_mutex);
426 }
427 441
428 spu_switch_notify(spu, NULL); 442 spu_switch_notify(spu, NULL);
429 spu_unmap_mappings(ctx); 443 spu_unmap_mappings(ctx);
@@ -562,10 +576,7 @@ static struct spu *spu_get_idle(struct spu_context *ctx)
562 goto found; 576 goto found;
563 mutex_unlock(&cbe_spu_info[node].list_mutex); 577 mutex_unlock(&cbe_spu_info[node].list_mutex);
564 578
565 mutex_lock(&ctx->gang->aff_mutex); 579 atomic_dec(&ctx->gang->aff_sched_count);
566 if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
567 ctx->gang->aff_ref_spu = NULL;
568 mutex_unlock(&ctx->gang->aff_mutex);
569 goto not_found; 580 goto not_found;
570 } 581 }
571 mutex_unlock(&ctx->gang->aff_mutex); 582 mutex_unlock(&ctx->gang->aff_mutex);
diff --git a/arch/powerpc/platforms/cell/spufs/sputrace.c b/arch/powerpc/platforms/cell/spufs/sputrace.c
index 8c0e95766a62..92d20e993ede 100644
--- a/arch/powerpc/platforms/cell/spufs/sputrace.c
+++ b/arch/powerpc/platforms/cell/spufs/sputrace.c
@@ -196,8 +196,7 @@ static int __init sputrace_init(void)
196 struct proc_dir_entry *entry; 196 struct proc_dir_entry *entry;
197 int i, error = -ENOMEM; 197 int i, error = -ENOMEM;
198 198
199 sputrace_log = kcalloc(sizeof(struct sputrace), 199 sputrace_log = kcalloc(bufsize, sizeof(struct sputrace), GFP_KERNEL);
200 bufsize, GFP_KERNEL);
201 if (!sputrace_log) 200 if (!sputrace_log)
202 goto out; 201 goto out;
203 202
diff --git a/arch/powerpc/platforms/iseries/iommu.c b/arch/powerpc/platforms/iseries/iommu.c
index bc818e4e2033..bb464d1211b2 100644
--- a/arch/powerpc/platforms/iseries/iommu.c
+++ b/arch/powerpc/platforms/iseries/iommu.c
@@ -41,7 +41,7 @@
41#include <asm/iseries/hv_call_event.h> 41#include <asm/iseries/hv_call_event.h>
42#include <asm/iseries/iommu.h> 42#include <asm/iseries/iommu.h>
43 43
44static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages, 44static int tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
45 unsigned long uaddr, enum dma_data_direction direction, 45 unsigned long uaddr, enum dma_data_direction direction,
46 struct dma_attrs *attrs) 46 struct dma_attrs *attrs)
47{ 47{
@@ -71,6 +71,7 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
71 index++; 71 index++;
72 uaddr += TCE_PAGE_SIZE; 72 uaddr += TCE_PAGE_SIZE;
73 } 73 }
74 return 0;
74} 75}
75 76
76static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages) 77static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages)
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index 70541b7a5013..a0ff03a3d8da 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -83,7 +83,7 @@ static u32 *iob_l2_base;
83static struct iommu_table iommu_table_iobmap; 83static struct iommu_table iommu_table_iobmap;
84static int iommu_table_iobmap_inited; 84static int iommu_table_iobmap_inited;
85 85
86static void iobmap_build(struct iommu_table *tbl, long index, 86static int iobmap_build(struct iommu_table *tbl, long index,
87 long npages, unsigned long uaddr, 87 long npages, unsigned long uaddr,
88 enum dma_data_direction direction, 88 enum dma_data_direction direction,
89 struct dma_attrs *attrs) 89 struct dma_attrs *attrs)
@@ -108,6 +108,7 @@ static void iobmap_build(struct iommu_table *tbl, long index,
108 uaddr += IOBMAP_PAGE_SIZE; 108 uaddr += IOBMAP_PAGE_SIZE;
109 bus_addr += IOBMAP_PAGE_SIZE; 109 bus_addr += IOBMAP_PAGE_SIZE;
110 } 110 }
111 return 0;
111} 112}
112 113
113 114
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 757c0296e0b8..97619fd51e39 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -40,3 +40,26 @@ config PPC_PSERIES_DEBUG
40 depends on PPC_PSERIES && PPC_EARLY_DEBUG 40 depends on PPC_PSERIES && PPC_EARLY_DEBUG
41 bool "Enable extra debug logging in platforms/pseries" 41 bool "Enable extra debug logging in platforms/pseries"
42 default y 42 default y
43
44config PPC_SMLPAR
45 bool "Support for shared-memory logical partitions"
46 depends on PPC_PSERIES
47 select LPARCFG
48 default n
49 help
50 Select this option to enable shared memory partition support.
51 With this option a system running in an LPAR can be given more
52 memory than physically available and will allow firmware to
53 balance memory across many LPARs.
54
55config CMM
56 tristate "Collaborative memory management"
57 depends on PPC_SMLPAR
58 default y
59 help
60 Select this option, if you want to enable the kernel interface
61 to reduce the memory size of the system. This is accomplished
62 by allocating pages of memory and put them "on hold". This only
63 makes sense for a system running in an LPAR where the unused pages
64 will be reused for other LPARs. The interface allows firmware to
65 balance memory across many LPARs.
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 554c6e42ef2a..dfe574af2dc0 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o
24obj-$(CONFIG_HVCS) += hvcserver.o 24obj-$(CONFIG_HVCS) += hvcserver.o
25obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o 25obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o
26obj-$(CONFIG_PHYP_DUMP) += phyp_dump.o 26obj-$(CONFIG_PHYP_DUMP) += phyp_dump.o
27obj-$(CONFIG_CMM) += cmm.o
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
new file mode 100644
index 000000000000..c6b3be03168b
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -0,0 +1,468 @@
1/*
2 * Collaborative memory management interface.
3 *
4 * Copyright (C) 2008 IBM Corporation
5 * Author(s): Brian King (brking@linux.vnet.ibm.com),
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 */
22
23#include <linux/ctype.h>
24#include <linux/delay.h>
25#include <linux/errno.h>
26#include <linux/fs.h>
27#include <linux/init.h>
28#include <linux/kthread.h>
29#include <linux/module.h>
30#include <linux/oom.h>
31#include <linux/sched.h>
32#include <linux/stringify.h>
33#include <linux/swap.h>
34#include <linux/sysdev.h>
35#include <asm/firmware.h>
36#include <asm/hvcall.h>
37#include <asm/mmu.h>
38#include <asm/pgalloc.h>
39#include <asm/uaccess.h>
40
41#include "plpar_wrappers.h"
42
43#define CMM_DRIVER_VERSION "1.0.0"
44#define CMM_DEFAULT_DELAY 1
45#define CMM_DEBUG 0
46#define CMM_DISABLE 0
47#define CMM_OOM_KB 1024
48#define CMM_MIN_MEM_MB 256
49#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10))
50#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
51
52static unsigned int delay = CMM_DEFAULT_DELAY;
53static unsigned int oom_kb = CMM_OOM_KB;
54static unsigned int cmm_debug = CMM_DEBUG;
55static unsigned int cmm_disabled = CMM_DISABLE;
56static unsigned long min_mem_mb = CMM_MIN_MEM_MB;
57static struct sys_device cmm_sysdev;
58
59MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>");
60MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager");
61MODULE_LICENSE("GPL");
62MODULE_VERSION(CMM_DRIVER_VERSION);
63
64module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
65MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
66 "[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
67module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
68MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
69 "[Default=" __stringify(CMM_OOM_KB) "]");
70module_param_named(min_mem_mb, min_mem_mb, ulong, S_IRUGO | S_IWUSR);
71MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. "
72 "[Default=" __stringify(CMM_MIN_MEM_MB) "]");
73module_param_named(debug, cmm_debug, uint, S_IRUGO | S_IWUSR);
74MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. "
75 "[Default=" __stringify(CMM_DEBUG) "]");
76
/*
 * Number of page addresses that fit in one struct cmm_page_array so that
 * the whole structure (next pointer + index + page[]) occupies one page.
 */
#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long))

/*
 * Debug logging, emitted only when the cmm_debug module parameter is set.
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * and cannot capture a following "else".
 */
#define cmm_dbg(...)						\
	do {							\
		if (cmm_debug)					\
			printk(KERN_INFO "cmm: "__VA_ARGS__);	\
	} while (0)
80
81struct cmm_page_array {
82 struct cmm_page_array *next;
83 unsigned long index;
84 unsigned long page[CMM_NR_PAGES];
85};
86
87static unsigned long loaned_pages;
88static unsigned long loaned_pages_target;
89static unsigned long oom_freed_pages;
90
91static struct cmm_page_array *cmm_page_list;
92static DEFINE_SPINLOCK(cmm_lock);
93
94static struct task_struct *cmm_thread_ptr;
95
96/**
97 * cmm_alloc_pages - Allocate pages and mark them as loaned
98 * @nr: number of pages to allocate
99 *
100 * Return value:
101 * number of pages requested to be allocated which were not
102 **/
103static long cmm_alloc_pages(long nr)
104{
105 struct cmm_page_array *pa, *npa;
106 unsigned long addr;
107 long rc;
108
109 cmm_dbg("Begin request for %ld pages\n", nr);
110
111 while (nr) {
112 addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
113 __GFP_NORETRY | __GFP_NOMEMALLOC);
114 if (!addr)
115 break;
116 spin_lock(&cmm_lock);
117 pa = cmm_page_list;
118 if (!pa || pa->index >= CMM_NR_PAGES) {
119 /* Need a new page for the page list. */
120 spin_unlock(&cmm_lock);
121 npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
122 __GFP_NORETRY | __GFP_NOMEMALLOC);
123 if (!npa) {
124 pr_info("%s: Can not allocate new page list\n", __FUNCTION__);
125 free_page(addr);
126 break;
127 }
128 spin_lock(&cmm_lock);
129 pa = cmm_page_list;
130
131 if (!pa || pa->index >= CMM_NR_PAGES) {
132 npa->next = pa;
133 npa->index = 0;
134 pa = npa;
135 cmm_page_list = pa;
136 } else
137 free_page((unsigned long) npa);
138 }
139
140 if ((rc = plpar_page_set_loaned(__pa(addr)))) {
141 pr_err("%s: Can not set page to loaned. rc=%ld\n", __FUNCTION__, rc);
142 spin_unlock(&cmm_lock);
143 free_page(addr);
144 break;
145 }
146
147 pa->page[pa->index++] = addr;
148 loaned_pages++;
149 totalram_pages--;
150 spin_unlock(&cmm_lock);
151 nr--;
152 }
153
154 cmm_dbg("End request with %ld pages unfulfilled\n", nr);
155 return nr;
156}
157
158/**
159 * cmm_free_pages - Free pages and mark them as active
160 * @nr: number of pages to free
161 *
162 * Return value:
163 * number of pages requested to be freed which were not
164 **/
165static long cmm_free_pages(long nr)
166{
167 struct cmm_page_array *pa;
168 unsigned long addr;
169
170 cmm_dbg("Begin free of %ld pages.\n", nr);
171 spin_lock(&cmm_lock);
172 pa = cmm_page_list;
173 while (nr) {
174 if (!pa || pa->index <= 0)
175 break;
176 addr = pa->page[--pa->index];
177
178 if (pa->index == 0) {
179 pa = pa->next;
180 free_page((unsigned long) cmm_page_list);
181 cmm_page_list = pa;
182 }
183
184 plpar_page_set_active(__pa(addr));
185 free_page(addr);
186 loaned_pages--;
187 nr--;
188 totalram_pages++;
189 }
190 spin_unlock(&cmm_lock);
191 cmm_dbg("End request with %ld pages unfulfilled\n", nr);
192 return nr;
193}
194
195/**
196 * cmm_oom_notify - OOM notifier
197 * @self: notifier block struct
198 * @dummy: not used
199 * @parm: returned - number of pages freed
200 *
201 * Return value:
202 * NOTIFY_OK
203 **/
204static int cmm_oom_notify(struct notifier_block *self,
205 unsigned long dummy, void *parm)
206{
207 unsigned long *freed = parm;
208 long nr = KB2PAGES(oom_kb);
209
210 cmm_dbg("OOM processing started\n");
211 nr = cmm_free_pages(nr);
212 loaned_pages_target = loaned_pages;
213 *freed += KB2PAGES(oom_kb) - nr;
214 oom_freed_pages += KB2PAGES(oom_kb) - nr;
215 cmm_dbg("OOM processing complete\n");
216 return NOTIFY_OK;
217}
218
219/**
220 * cmm_get_mpp - Read memory performance parameters
221 *
222 * Makes hcall to query the current page loan request from the hypervisor.
223 *
224 * Return value:
225 * nothing
226 **/
227static void cmm_get_mpp(void)
228{
229 int rc;
230 struct hvcall_mpp_data mpp_data;
231 unsigned long active_pages_target;
232 signed long page_loan_request;
233
234 rc = h_get_mpp(&mpp_data);
235
236 if (rc != H_SUCCESS)
237 return;
238
239 page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
240 loaned_pages_target = page_loan_request + loaned_pages;
241 if (loaned_pages_target > oom_freed_pages)
242 loaned_pages_target -= oom_freed_pages;
243 else
244 loaned_pages_target = 0;
245
246 active_pages_target = totalram_pages + loaned_pages - loaned_pages_target;
247
248 if ((min_mem_mb * 1024 * 1024) > (active_pages_target * PAGE_SIZE))
249 loaned_pages_target = totalram_pages + loaned_pages -
250 ((min_mem_mb * 1024 * 1024) / PAGE_SIZE);
251
252 cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
253 page_loan_request, loaned_pages, loaned_pages_target,
254 oom_freed_pages, totalram_pages);
255}
256
257static struct notifier_block cmm_oom_nb = {
258 .notifier_call = cmm_oom_notify
259};
260
261/**
262 * cmm_thread - CMM task thread
263 * @dummy: not used
264 *
265 * Return value:
266 * 0
267 **/
268static int cmm_thread(void *dummy)
269{
270 unsigned long timeleft;
271
272 while (1) {
273 timeleft = msleep_interruptible(delay * 1000);
274
275 if (kthread_should_stop() || timeleft) {
276 loaned_pages_target = loaned_pages;
277 break;
278 }
279
280 cmm_get_mpp();
281
282 if (loaned_pages_target > loaned_pages) {
283 if (cmm_alloc_pages(loaned_pages_target - loaned_pages))
284 loaned_pages_target = loaned_pages;
285 } else if (loaned_pages_target < loaned_pages)
286 cmm_free_pages(loaned_pages - loaned_pages_target);
287 }
288 return 0;
289}
290
291#define CMM_SHOW(name, format, args...) \
292 static ssize_t show_##name(struct sys_device *dev, char *buf) \
293 { \
294 return sprintf(buf, format, ##args); \
295 } \
296 static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
297
298CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages));
299CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target));
300
301static ssize_t show_oom_pages(struct sys_device *dev, char *buf)
302{
303 return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages));
304}
305
306static ssize_t store_oom_pages(struct sys_device *dev,
307 const char *buf, size_t count)
308{
309 unsigned long val = simple_strtoul (buf, NULL, 10);
310
311 if (!capable(CAP_SYS_ADMIN))
312 return -EPERM;
313 if (val != 0)
314 return -EBADMSG;
315
316 oom_freed_pages = 0;
317 return count;
318}
319
320static SYSDEV_ATTR(oom_freed_kb, S_IWUSR| S_IRUGO,
321 show_oom_pages, store_oom_pages);
322
323static struct sysdev_attribute *cmm_attrs[] = {
324 &attr_loaned_kb,
325 &attr_loaned_target_kb,
326 &attr_oom_freed_kb,
327};
328
329static struct sysdev_class cmm_sysdev_class = {
330 .name = "cmm",
331};
332
333/**
334 * cmm_sysfs_register - Register with sysfs
335 *
336 * Return value:
337 * 0 on success / other on failure
338 **/
339static int cmm_sysfs_register(struct sys_device *sysdev)
340{
341 int i, rc;
342
343 if ((rc = sysdev_class_register(&cmm_sysdev_class)))
344 return rc;
345
346 sysdev->id = 0;
347 sysdev->cls = &cmm_sysdev_class;
348
349 if ((rc = sysdev_register(sysdev)))
350 goto class_unregister;
351
352 for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) {
353 if ((rc = sysdev_create_file(sysdev, cmm_attrs[i])))
354 goto fail;
355 }
356
357 return 0;
358
359fail:
360 while (--i >= 0)
361 sysdev_remove_file(sysdev, cmm_attrs[i]);
362 sysdev_unregister(sysdev);
363class_unregister:
364 sysdev_class_unregister(&cmm_sysdev_class);
365 return rc;
366}
367
368/**
369 * cmm_unregister_sysfs - Unregister from sysfs
370 *
371 **/
372static void cmm_unregister_sysfs(struct sys_device *sysdev)
373{
374 int i;
375
376 for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++)
377 sysdev_remove_file(sysdev, cmm_attrs[i]);
378 sysdev_unregister(sysdev);
379 sysdev_class_unregister(&cmm_sysdev_class);
380}
381
382/**
383 * cmm_init - Module initialization
384 *
385 * Return value:
386 * 0 on success / other on failure
387 **/
388static int cmm_init(void)
389{
390 int rc = -ENOMEM;
391
392 if (!firmware_has_feature(FW_FEATURE_CMO))
393 return -EOPNOTSUPP;
394
395 if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0)
396 return rc;
397
398 if ((rc = cmm_sysfs_register(&cmm_sysdev)))
399 goto out_oom_notifier;
400
401 if (cmm_disabled)
402 return rc;
403
404 cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
405 if (IS_ERR(cmm_thread_ptr)) {
406 rc = PTR_ERR(cmm_thread_ptr);
407 goto out_unregister_sysfs;
408 }
409
410 return rc;
411
412out_unregister_sysfs:
413 cmm_unregister_sysfs(&cmm_sysdev);
414out_oom_notifier:
415 unregister_oom_notifier(&cmm_oom_nb);
416 return rc;
417}
418
419/**
420 * cmm_exit - Module exit
421 *
422 * Return value:
423 * nothing
424 **/
425static void cmm_exit(void)
426{
427 if (cmm_thread_ptr)
428 kthread_stop(cmm_thread_ptr);
429 unregister_oom_notifier(&cmm_oom_nb);
430 cmm_free_pages(loaned_pages);
431 cmm_unregister_sysfs(&cmm_sysdev);
432}
433
434/**
435 * cmm_set_disable - Disable/Enable CMM
436 *
437 * Return value:
438 * 0 on success / other on failure
439 **/
440static int cmm_set_disable(const char *val, struct kernel_param *kp)
441{
442 int disable = simple_strtoul(val, NULL, 10);
443
444 if (disable != 0 && disable != 1)
445 return -EINVAL;
446
447 if (disable && !cmm_disabled) {
448 if (cmm_thread_ptr)
449 kthread_stop(cmm_thread_ptr);
450 cmm_thread_ptr = NULL;
451 cmm_free_pages(loaned_pages);
452 } else if (!disable && cmm_disabled) {
453 cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
454 if (IS_ERR(cmm_thread_ptr))
455 return PTR_ERR(cmm_thread_ptr);
456 }
457
458 cmm_disabled = disable;
459 return 0;
460}
461
462module_param_call(disable, cmm_set_disable, param_get_uint,
463 &cmm_disabled, S_IRUGO | S_IWUSR);
464MODULE_PARM_DESC(disable, "Disable CMM. Set to 1 to disable. "
465 "[Default=" __stringify(CMM_DISABLE) "]");
466
467module_init(cmm_init);
468module_exit(cmm_exit);
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 5377dd4b849a..a8c446697f9e 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -48,7 +48,7 @@
48#include "plpar_wrappers.h" 48#include "plpar_wrappers.h"
49 49
50 50
51static void tce_build_pSeries(struct iommu_table *tbl, long index, 51static int tce_build_pSeries(struct iommu_table *tbl, long index,
52 long npages, unsigned long uaddr, 52 long npages, unsigned long uaddr,
53 enum dma_data_direction direction, 53 enum dma_data_direction direction,
54 struct dma_attrs *attrs) 54 struct dma_attrs *attrs)
@@ -72,6 +72,7 @@ static void tce_build_pSeries(struct iommu_table *tbl, long index,
72 uaddr += TCE_PAGE_SIZE; 72 uaddr += TCE_PAGE_SIZE;
73 tcep++; 73 tcep++;
74 } 74 }
75 return 0;
75} 76}
76 77
77 78
@@ -94,14 +95,19 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
94 return *tcep; 95 return *tcep;
95} 96}
96 97
97static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, 98static void tce_free_pSeriesLP(struct iommu_table*, long, long);
99static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
100
101static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
98 long npages, unsigned long uaddr, 102 long npages, unsigned long uaddr,
99 enum dma_data_direction direction, 103 enum dma_data_direction direction,
100 struct dma_attrs *attrs) 104 struct dma_attrs *attrs)
101{ 105{
102 u64 rc; 106 u64 rc = 0;
103 u64 proto_tce, tce; 107 u64 proto_tce, tce;
104 u64 rpn; 108 u64 rpn;
109 int ret = 0;
110 long tcenum_start = tcenum, npages_start = npages;
105 111
106 rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; 112 rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
107 proto_tce = TCE_PCI_READ; 113 proto_tce = TCE_PCI_READ;
@@ -112,6 +118,13 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
112 tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; 118 tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
113 rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce); 119 rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);
114 120
121 if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
122 ret = (int)rc;
123 tce_free_pSeriesLP(tbl, tcenum_start,
124 (npages_start - (npages + 1)));
125 break;
126 }
127
115 if (rc && printk_ratelimit()) { 128 if (rc && printk_ratelimit()) {
116 printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc); 129 printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
117 printk("\tindex = 0x%lx\n", (u64)tbl->it_index); 130 printk("\tindex = 0x%lx\n", (u64)tbl->it_index);
@@ -123,25 +136,27 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
123 tcenum++; 136 tcenum++;
124 rpn++; 137 rpn++;
125 } 138 }
139 return ret;
126} 140}
127 141
128static DEFINE_PER_CPU(u64 *, tce_page) = NULL; 142static DEFINE_PER_CPU(u64 *, tce_page) = NULL;
129 143
130static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, 144static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
131 long npages, unsigned long uaddr, 145 long npages, unsigned long uaddr,
132 enum dma_data_direction direction, 146 enum dma_data_direction direction,
133 struct dma_attrs *attrs) 147 struct dma_attrs *attrs)
134{ 148{
135 u64 rc; 149 u64 rc = 0;
136 u64 proto_tce; 150 u64 proto_tce;
137 u64 *tcep; 151 u64 *tcep;
138 u64 rpn; 152 u64 rpn;
139 long l, limit; 153 long l, limit;
154 long tcenum_start = tcenum, npages_start = npages;
155 int ret = 0;
140 156
141 if (npages == 1) { 157 if (npages == 1) {
142 tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, 158 return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
143 direction, attrs); 159 direction, attrs);
144 return;
145 } 160 }
146 161
147 tcep = __get_cpu_var(tce_page); 162 tcep = __get_cpu_var(tce_page);
@@ -153,9 +168,8 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
153 tcep = (u64 *)__get_free_page(GFP_ATOMIC); 168 tcep = (u64 *)__get_free_page(GFP_ATOMIC);
154 /* If allocation fails, fall back to the loop implementation */ 169 /* If allocation fails, fall back to the loop implementation */
155 if (!tcep) { 170 if (!tcep) {
156 tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, 171 return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
157 direction, attrs); 172 direction, attrs);
158 return;
159 } 173 }
160 __get_cpu_var(tce_page) = tcep; 174 __get_cpu_var(tce_page) = tcep;
161 } 175 }
@@ -187,6 +201,13 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
187 tcenum += limit; 201 tcenum += limit;
188 } while (npages > 0 && !rc); 202 } while (npages > 0 && !rc);
189 203
204 if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
205 ret = (int)rc;
206 tce_freemulti_pSeriesLP(tbl, tcenum_start,
207 (npages_start - (npages + limit)));
208 return ret;
209 }
210
190 if (rc && printk_ratelimit()) { 211 if (rc && printk_ratelimit()) {
191 printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc); 212 printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
192 printk("\tindex = 0x%lx\n", (u64)tbl->it_index); 213 printk("\tindex = 0x%lx\n", (u64)tbl->it_index);
@@ -194,6 +215,7 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
194 printk("\ttce[0] val = 0x%lx\n", tcep[0]); 215 printk("\ttce[0] val = 0x%lx\n", tcep[0]);
195 show_stack(current, (unsigned long *)__get_SP()); 216 show_stack(current, (unsigned long *)__get_SP());
196 } 217 }
218 return ret;
197} 219}
198 220
199static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) 221static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h
index d8680b589dc9..a437267c6bf8 100644
--- a/arch/powerpc/platforms/pseries/plpar_wrappers.h
+++ b/arch/powerpc/platforms/pseries/plpar_wrappers.h
@@ -42,6 +42,16 @@ static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa)
42 return vpa_call(0x3, cpu, vpa); 42 return vpa_call(0x3, cpu, vpa);
43} 43}
44 44
45static inline long plpar_page_set_loaned(unsigned long vpa)
46{
47 return plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa, 0);
48}
49
50static inline long plpar_page_set_active(unsigned long vpa)
51{
52 return plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa, 0);
53}
54
45extern void vpa_init(int cpu); 55extern void vpa_init(int cpu);
46 56
47static inline long plpar_pte_enter(unsigned long flags, 57static inline long plpar_pte_enter(unsigned long flags,
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 90beb444e1dd..063a0d2fba30 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -314,6 +314,76 @@ static int pseries_set_xdabr(unsigned long dabr)
314 H_DABRX_KERNEL | H_DABRX_USER); 314 H_DABRX_KERNEL | H_DABRX_USER);
315} 315}
316 316
317#define CMO_CHARACTERISTICS_TOKEN 44
318#define CMO_MAXLENGTH 1026
319
320/**
321 * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions,
322 * handle that here. (Stolen from parse_system_parameter_string)
323 */
324void pSeries_cmo_feature_init(void)
325{
326 char *ptr, *key, *value, *end;
327 int call_status;
328 int PrPSP = -1;
329 int SecPSP = -1;
330
331 pr_debug(" -> fw_cmo_feature_init()\n");
332 spin_lock(&rtas_data_buf_lock);
333 memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
334 call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
335 NULL,
336 CMO_CHARACTERISTICS_TOKEN,
337 __pa(rtas_data_buf),
338 RTAS_DATA_BUF_SIZE);
339
340 if (call_status != 0) {
341 spin_unlock(&rtas_data_buf_lock);
342 pr_debug("CMO not available\n");
343 pr_debug(" <- fw_cmo_feature_init()\n");
344 return;
345 }
346
347 end = rtas_data_buf + CMO_MAXLENGTH - 2;
348 ptr = rtas_data_buf + 2; /* step over strlen value */
349 key = value = ptr;
350
351 while (*ptr && (ptr <= end)) {
352 /* Separate the key and value by replacing '=' with '\0' and
353 * point the value at the string after the '='
354 */
355 if (ptr[0] == '=') {
356 ptr[0] = '\0';
357 value = ptr + 1;
358 } else if (ptr[0] == '\0' || ptr[0] == ',') {
359 /* Terminate the string containing the key/value pair */
360 ptr[0] = '\0';
361
362 if (key == value) {
363 pr_debug("Malformed key/value pair\n");
364 /* Never found a '=', end processing */
365 break;
366 }
367
368 if (0 == strcmp(key, "PrPSP"))
369 PrPSP = simple_strtol(value, NULL, 10);
370 else if (0 == strcmp(key, "SecPSP"))
371 SecPSP = simple_strtol(value, NULL, 10);
372 value = key = ptr + 1;
373 }
374 ptr++;
375 }
376
377 if (PrPSP != -1 || SecPSP != -1) {
378 pr_info("CMO enabled\n");
379 pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", PrPSP, SecPSP);
380 powerpc_firmware_features |= FW_FEATURE_CMO;
381 } else
382 pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", PrPSP, SecPSP);
383 spin_unlock(&rtas_data_buf_lock);
384 pr_debug(" <- fw_cmo_feature_init()\n");
385}
386
317/* 387/*
318 * Early initialization. Relocation is on but do not reference unbolted pages 388 * Early initialization. Relocation is on but do not reference unbolted pages
319 */ 389 */
@@ -329,6 +399,7 @@ static void __init pSeries_init_early(void)
329 else if (firmware_has_feature(FW_FEATURE_XDABR)) 399 else if (firmware_has_feature(FW_FEATURE_XDABR))
330 ppc_md.set_dabr = pseries_set_xdabr; 400 ppc_md.set_dabr = pseries_set_xdabr;
331 401
402 pSeries_cmo_feature_init();
332 iommu_init_early_pSeries(); 403 iommu_init_early_pSeries();
333 404
334 pr_debug(" <- pSeries_init_early()\n"); 405 pr_debug(" <- pSeries_init_early()\n");
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index de8c8b542cfa..89639ecbf381 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -147,7 +147,7 @@ static void dart_flush(struct iommu_table *tbl)
147 } 147 }
148} 148}
149 149
150static void dart_build(struct iommu_table *tbl, long index, 150static int dart_build(struct iommu_table *tbl, long index,
151 long npages, unsigned long uaddr, 151 long npages, unsigned long uaddr,
152 enum dma_data_direction direction, 152 enum dma_data_direction direction,
153 struct dma_attrs *attrs) 153 struct dma_attrs *attrs)
@@ -184,6 +184,7 @@ static void dart_build(struct iommu_table *tbl, long index,
184 } else { 184 } else {
185 dart_dirty = 1; 185 dart_dirty = 1;
186 } 186 }
187 return 0;
187} 188}
188 189
189 190