author     Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/include/asm/percpu.h
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent     6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'arch/x86/include/asm/percpu.h')
-rw-r--r--  arch/x86/include/asm/percpu.h  256
1 file changed, 254 insertions, 2 deletions
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index cd28f9ad910d..a0a9779084d1 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -45,12 +45,28 @@
 #include <linux/stringify.h>
 
 #ifdef CONFIG_SMP
-#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
+#define __percpu_prefix "%%"__stringify(__percpu_seg)":"
 #define __my_cpu_offset percpu_read(this_cpu_off)
+
+/*
+ * Compared to the generic __my_cpu_offset version, the following
+ * saves one instruction and avoids clobbering a temp register.
+ */
+#define __this_cpu_ptr(ptr) \
+({ \
+    unsigned long tcp_ptr__; \
+    __verify_pcpu_ptr(ptr); \
+    asm volatile("add " __percpu_arg(1) ", %0" \
+                 : "=r" (tcp_ptr__) \
+                 : "m" (this_cpu_off), "0" (ptr)); \
+    (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
+})
 #else
-#define __percpu_arg(x) "%P" #x
+#define __percpu_prefix ""
 #endif
 
+#define __percpu_arg(x) __percpu_prefix "%P" #x
+
 /*
  * Initialized pointers to per-cpu variables needed for the boot
  * processor need to use these macros to get the proper address
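
The x86 __this_cpu_ptr() added above folds the this_cpu_off addition into a single add with a segment-relative memory operand; the generic version first has to read the offset into a temporary register. A minimal usage sketch (not part of this patch; the struct and per-cpu variable names are illustrative):

    /* Sketch: resolve a pointer to this CPU's instance of a per-cpu object. */
    #include <linux/percpu.h>

    struct my_stats {
        unsigned long events;
    };
    static DEFINE_PER_CPU(struct my_stats, my_stats);

    static void count_local_event(void)
    {
        /* One add of this_cpu_off yields the local CPU's copy. */
        struct my_stats *s = __this_cpu_ptr(&my_stats);

        s->events++; /* caller is expected to have preemption disabled */
    }
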
@@ -216,6 +232,125 @@ do { \
 })
 
 /*
+ * Add return operation
+ */
+#define percpu_add_return_op(var, val) \
+({ \
+    typeof(var) paro_ret__ = val; \
+    switch (sizeof(var)) { \
+    case 1: \
+        asm("xaddb %0, "__percpu_arg(1) \
+            : "+q" (paro_ret__), "+m" (var) \
+            : : "memory"); \
+        break; \
+    case 2: \
+        asm("xaddw %0, "__percpu_arg(1) \
+            : "+r" (paro_ret__), "+m" (var) \
+            : : "memory"); \
+        break; \
+    case 4: \
+        asm("xaddl %0, "__percpu_arg(1) \
+            : "+r" (paro_ret__), "+m" (var) \
+            : : "memory"); \
+        break; \
+    case 8: \
+        asm("xaddq %0, "__percpu_arg(1) \
+            : "+re" (paro_ret__), "+m" (var) \
+            : : "memory"); \
+        break; \
+    default: __bad_percpu_size(); \
+    } \
+    paro_ret__ += val; \
+    paro_ret__; \
+})
+
+/*
+ * xchg is implemented using cmpxchg without a lock prefix. xchg is
+ * expensive due to the implied lock prefix. The processor cannot prefetch
+ * cachelines if xchg is used.
+ */
+#define percpu_xchg_op(var, nval) \
+({ \
+    typeof(var) pxo_ret__; \
+    typeof(var) pxo_new__ = (nval); \
+    switch (sizeof(var)) { \
+    case 1: \
+        asm("\n\tmov "__percpu_arg(1)",%%al" \
+            "\n1:\tcmpxchgb %2, "__percpu_arg(1) \
+            "\n\tjnz 1b" \
+            : "=&a" (pxo_ret__), "+m" (var) \
+            : "q" (pxo_new__) \
+            : "memory"); \
+        break; \
+    case 2: \
+        asm("\n\tmov "__percpu_arg(1)",%%ax" \
+            "\n1:\tcmpxchgw %2, "__percpu_arg(1) \
+            "\n\tjnz 1b" \
+            : "=&a" (pxo_ret__), "+m" (var) \
+            : "r" (pxo_new__) \
+            : "memory"); \
+        break; \
+    case 4: \
+        asm("\n\tmov "__percpu_arg(1)",%%eax" \
+            "\n1:\tcmpxchgl %2, "__percpu_arg(1) \
+            "\n\tjnz 1b" \
+            : "=&a" (pxo_ret__), "+m" (var) \
+            : "r" (pxo_new__) \
+            : "memory"); \
+        break; \
+    case 8: \
+        asm("\n\tmov "__percpu_arg(1)",%%rax" \
+            "\n1:\tcmpxchgq %2, "__percpu_arg(1) \
+            "\n\tjnz 1b" \
+            : "=&a" (pxo_ret__), "+m" (var) \
+            : "r" (pxo_new__) \
+            : "memory"); \
+        break; \
+    default: __bad_percpu_size(); \
+    } \
+    pxo_ret__; \
+})
+
+/*
+ * cmpxchg has no such implied lock semantics as a result it is much
+ * more efficient for cpu local operations.
+ */
+#define percpu_cmpxchg_op(var, oval, nval) \
+({ \
+    typeof(var) pco_ret__; \
+    typeof(var) pco_old__ = (oval); \
+    typeof(var) pco_new__ = (nval); \
+    switch (sizeof(var)) { \
+    case 1: \
+        asm("cmpxchgb %2, "__percpu_arg(1) \
+            : "=a" (pco_ret__), "+m" (var) \
+            : "q" (pco_new__), "0" (pco_old__) \
+            : "memory"); \
+        break; \
+    case 2: \
+        asm("cmpxchgw %2, "__percpu_arg(1) \
+            : "=a" (pco_ret__), "+m" (var) \
+            : "r" (pco_new__), "0" (pco_old__) \
+            : "memory"); \
+        break; \
+    case 4: \
+        asm("cmpxchgl %2, "__percpu_arg(1) \
+            : "=a" (pco_ret__), "+m" (var) \
+            : "r" (pco_new__), "0" (pco_old__) \
+            : "memory"); \
+        break; \
+    case 8: \
+        asm("cmpxchgq %2, "__percpu_arg(1) \
+            : "=a" (pco_ret__), "+m" (var) \
+            : "r" (pco_new__), "0" (pco_old__) \
+            : "memory"); \
+        break; \
+    default: __bad_percpu_size(); \
+    } \
+    pco_ret__; \
+})
+
+/*
  * percpu_read() makes gcc load the percpu variable every time it is
  * accessed while percpu_read_stable() allows the value to be cached.
  * percpu_read_stable() is more efficient and can be used if its value
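
The three primitives added in the hunk above back the size-specific this_cpu_*_N and irqsafe_cpu_*_N definitions wired up further down, and are normally reached through the generic this_cpu_add_return(), this_cpu_xchg() and this_cpu_cmpxchg() wrappers in linux/percpu.h. A hedged usage sketch (not part of this patch; the per-cpu variables are illustrative):

    #include <linux/percpu.h>

    static DEFINE_PER_CPU(unsigned long, seq_counter);
    static DEFINE_PER_CPU(unsigned long, local_state);

    static unsigned long next_local_seq(void)
    {
        /* xadd-based: returns the counter value after the increment. */
        return this_cpu_add_return(seq_counter, 1);
    }

    static unsigned long set_local_state(unsigned long new)
    {
        /* cmpxchg loop without a lock prefix: returns the previous value. */
        return this_cpu_xchg(local_state, new);
    }

    static int claim_local_state(unsigned long old, unsigned long new)
    {
        /* Store new only if the slot still holds old; report success. */
        return this_cpu_cmpxchg(local_state, old, new) == old;
    }
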
@@ -253,6 +388,12 @@ do { \
 #define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+/*
+ * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
+ * faster than an xchg with forced lock semantics.
+ */
+#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
 
 #define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
 #define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
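
For the 1-, 2- and 4-byte __this_cpu_xchg cases the generic fallback from include/asm-generic/percpu.h is kept: __this_cpu_* callers already run with preemption disabled, so it reduces to a plain read followed by a write. Roughly (a simplified sketch, not the exact upstream macro):

    #define __this_cpu_xchg_sketch(pcp, nval) \
    ({ \
        typeof(pcp) old__ = __this_cpu_read(pcp); \
        __this_cpu_write(pcp, nval); \
        old__; \
    })
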
@@ -272,6 +413,9 @@ do { \
 #define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
 #define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
 #define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
 
 #define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
 #define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
@@ -285,6 +429,49 @@ do { \
 #define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
+
+#ifndef CONFIG_M386
+#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#endif /* !CONFIG_M386 */
+
+#ifdef CONFIG_X86_CMPXCHG64
+#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \
+({ \
+    char __ret; \
+    typeof(o1) __o1 = o1; \
+    typeof(o1) __n1 = n1; \
+    typeof(o2) __o2 = o2; \
+    typeof(o2) __n2 = n2; \
+    typeof(o2) __dummy = n2; \
+    asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \
+                 : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \
+                 : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \
+    __ret; \
+})
+
+#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#endif /* CONFIG_X86_CMPXCHG64 */
 
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
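
percpu_cmpxchg8b_double() above updates two adjacent 4-byte per-cpu words in a single step on 32-bit kernels that have cmpxchg8b. Callers typically keep the pair in one structure so both words are compared and replaced together; a hedged usage sketch (illustrative names, reached through the generic this_cpu_cmpxchg_double() wrapper):

    #include <linux/percpu.h>

    struct paired_state {
        unsigned int head;  /* first word (pcp1) */
        unsigned int gen;   /* second word (pcp2), must directly follow pcp1 */
    };
    static DEFINE_PER_CPU_ALIGNED(struct paired_state, pstate);

    static int advance_pair(unsigned int old_head, unsigned int old_gen,
                            unsigned int new_head)
    {
        /* Non-zero only if both words still held the expected values. */
        return this_cpu_cmpxchg_double(pstate.head, pstate.gen,
                                       old_head, old_gen,
                                       new_head, old_gen + 1);
    }
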
@@ -297,6 +484,7 @@ do { \
 #define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
 #define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
 #define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
+#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
 
 #define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
 #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
@@ -304,11 +492,48 @@ do { \
 #define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
 #define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
 #define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
+#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
 
 #define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
 #define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
 #define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
 #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+/*
+ * Pretty complex macro to generate cmpxchg16 instruction. The instruction
+ * is not supported on early AMD64 processors so we must be able to emulate
+ * it in software. The address used in the cmpxchg16 instruction must be
+ * aligned to a 16 byte boundary.
+ */
+#ifdef CONFIG_SMP
+#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
+#else
+#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
+#endif
+#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
+({ \
+    char __ret; \
+    typeof(o1) __o1 = o1; \
+    typeof(o1) __n1 = n1; \
+    typeof(o2) __o2 = o2; \
+    typeof(o2) __n2 = n2; \
+    typeof(o2) __dummy; \
+    alternative_io(CMPXCHG16B_EMU_CALL, \
+                   "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \
+                   X86_FEATURE_CX16, \
+                   ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
+                   "S" (&pcp1), "b"(__n1), "c"(__n2), \
+                   "a"(__o1), "d"(__o2) : "memory"); \
+    __ret; \
+})
+
+#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
 
 #endif
 
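
A note on the ASM_NOP padding in CMPXCHG16B_EMU_CALL: the alternatives mechanism patches the replacement over the default sequence, so the default must occupy at least as many bytes. The replacement, cmpxchg16b on a (%%rsi) operand followed by setz, comes to 8 bytes on SMP (segment prefix, REX.W, 0f c7, modrm, then the 3-byte setz) and 7 bytes on UP without the prefix, while the call to this_cpu_cmpxchg16b_emu is only 5 bytes; ASM_NOP3 and ASM_NOP2 pad the call to the matching 8 or 7 bytes.
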
@@ -322,6 +547,33 @@ do { \
 	old__; \
 })
 
+static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
+                        const unsigned long __percpu *addr)
+{
+    unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
+
+    return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
+}
+
+static inline int x86_this_cpu_variable_test_bit(int nr,
+                        const unsigned long __percpu *addr)
+{
+    int oldbit;
+
+    asm volatile("bt "__percpu_arg(2)",%1\n\t"
+                 "sbb %0,%0"
+                 : "=r" (oldbit)
+                 : "m" (*(unsigned long *)addr), "Ir" (nr));
+
+    return oldbit;
+}
+
+#define x86_this_cpu_test_bit(nr, addr) \
+    (__builtin_constant_p((nr)) \
+     ? x86_this_cpu_constant_test_bit((nr), (addr)) \
+     : x86_this_cpu_variable_test_bit((nr), (addr)))
+
+
 #include <asm-generic/percpu.h>
 
 /* We can use this directly for local CPU (faster). */
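
x86_this_cpu_test_bit() dispatches on __builtin_constant_p(): a compile-time-constant bit number becomes a segment-prefixed load plus mask, while anything else uses the bt/sbb sequence. A minimal usage sketch (not part of this patch; the per-cpu bitmap is illustrative):

    #include <linux/percpu.h>
    #include <linux/bitops.h>

    #define MY_NR_FLAGS 64
    static DEFINE_PER_CPU(unsigned long, my_flags[BITS_TO_LONGS(MY_NR_FLAGS)]);

    static int local_flag_set(int nr)
    {
        /* Tests the bit in this CPU's copy without forming a pointer first. */
        return x86_this_cpu_test_bit(nr, my_flags);
    }
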