-rw-r--r--  Documentation/kernel-parameters.txt          |    3
-rw-r--r--  Documentation/lguest/lguest.c                |  178
-rw-r--r--  Documentation/networking/00-INDEX            |    2
-rw-r--r--  Documentation/networking/net-modules.txt     |  315
-rw-r--r--  MAINTAINERS                                  |    7
-rw-r--r--  arch/blackfin/kernel/dma-mapping.c           |    1
-rw-r--r--  arch/m68k/kernel/dma.c                       |    2
-rw-r--r--  arch/sparc64/kernel/iommu_common.c           |    2
-rw-r--r--  arch/sparc64/kernel/ldc.c                    |    2
-rw-r--r--  arch/um/drivers/ubd_kern.c                   |    2
-rw-r--r--  arch/x86/kernel/crash.c                      |    6
-rw-r--r--  arch/x86/kernel/pci-gart_64.c                |    3
-rw-r--r--  arch/x86/lguest/boot.c                       |   54
-rw-r--r--  arch/x86/lguest/i386_head.S                  |    8
-rw-r--r--  arch/x86/mm/fault_32.c                       |    2
-rw-r--r--  block/ll_rw_blk.c                            |    4
-rw-r--r--  crypto/hmac.c                                |    3
-rw-r--r--  drivers/acpi/sleep/proc.c                    |   66
-rw-r--r--  drivers/ata/ahci.c                           |  144
-rw-r--r--  drivers/ata/libata-core.c                    |   44
-rw-r--r--  drivers/ata/libata-eh.c                      |   12
-rw-r--r--  drivers/ata/pata_icside.c                    |   42
-rw-r--r--  drivers/ata/sata_nv.c                        |    6
-rw-r--r--  drivers/block/cryptoloop.c                   |    9
-rw-r--r--  drivers/block/sunvdc.c                       |    1
-rw-r--r--  drivers/block/ub.c                           |   11
-rw-r--r--  drivers/block/virtio_blk.c                   |   10
-rw-r--r--  drivers/cdrom/viocd.c                        |    3
-rw-r--r--  drivers/ieee1394/dma.c                       |    4
-rw-r--r--  drivers/infiniband/core/umem.c               |    4
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_memfree.c  |    9
-rw-r--r--  drivers/lguest/core.c                        |    5
-rw-r--r--  drivers/lguest/hypercalls.c                  |   11
-rw-r--r--  drivers/lguest/interrupts_and_traps.c        |   37
-rw-r--r--  drivers/lguest/lg.h                          |    7
-rw-r--r--  drivers/lguest/lguest_device.c               |   11
-rw-r--r--  drivers/lguest/lguest_user.c                 |   23
-rw-r--r--  drivers/lguest/page_tables.c                 |  113
-rw-r--r--  drivers/lguest/segments.c                    |   48
-rw-r--r--  drivers/lguest/x86/core.c                    |  122
-rw-r--r--  drivers/lguest/x86/switcher_32.S             |   71
-rw-r--r--  drivers/md/dm-crypt.c                        |    8
-rw-r--r--  drivers/media/common/saa7146_core.c          |    3
-rw-r--r--  drivers/media/video/ivtv/ivtv-udma.c         |    6
-rw-r--r--  drivers/media/video/videobuf-dma-sg.c        |   10
-rw-r--r--  drivers/message/i2o/i2o_block.c              |    1
-rw-r--r--  drivers/mmc/host/au1xmmc.c                   |    2
-rw-r--r--  drivers/mmc/host/mmci.c                      |    6
-rw-r--r--  drivers/mmc/host/pxamci.c                    |    1
-rw-r--r--  drivers/mmc/host/sdhci.c                     |    2
-rw-r--r--  drivers/mmc/host/wbsd.c                      |    2
-rw-r--r--  drivers/net/bonding/bond_main.c              |    5
-rw-r--r--  drivers/net/bonding/bonding.h                |    1
-rw-r--r--  drivers/net/cpmac.c                          |  145
-rw-r--r--  drivers/net/ehea/ehea.h                      |    2
-rw-r--r--  drivers/net/ehea/ehea_main.c                 |    7
-rw-r--r--  drivers/net/forcedeth.c                      |   16
-rw-r--r--  drivers/net/ipg.c                            |   22
-rw-r--r--  drivers/net/ipg.h                            |   20
-rw-r--r--  drivers/net/mlx4/icm.c                       |    4
-rw-r--r--  drivers/net/natsemi.c                        |    1
-rw-r--r--  drivers/net/usb/rndis_host.c                 |   18
-rw-r--r--  drivers/s390/scsi/zfcp_aux.c                 |   12
-rw-r--r--  drivers/s390/scsi/zfcp_def.h                 |    3
-rw-r--r--  drivers/s390/scsi/zfcp_erp.c                 |    7
-rw-r--r--  drivers/scsi/atari_NCR5380.c                 |    5
-rw-r--r--  drivers/scsi/ipr.c                           |    2
-rw-r--r--  drivers/scsi/iscsi_tcp.c                     |    4
-rw-r--r--  drivers/scsi/osst.c                          |    6
-rw-r--r--  drivers/scsi/sg.c                            |   13
-rw-r--r--  drivers/scsi/st.c                            |   14
-rw-r--r--  drivers/scsi/sun3x_esp.c                     |    4
-rw-r--r--  fs/ecryptfs/crypto.c                         |   23
-rw-r--r--  fs/mbcache.c                                 |    2
-rw-r--r--  fs/proc/proc_sysctl.c                        |    6
-rw-r--r--  include/asm-avr32/dma-mapping.h              |    2
-rw-r--r--  include/asm-frv/scatterlist.h                |    3
-rw-r--r--  include/asm-x86/lguest_hcall.h               |   16
-rw-r--r--  include/asm-xtensa/dma-mapping.h             |    2
-rw-r--r--  include/linux/compiler.h                     |    6
-rw-r--r--  include/linux/completion.h                   |   18
-rw-r--r--  include/linux/lguest.h                       |    4
-rw-r--r--  include/linux/lguest_launcher.h              |   24
-rw-r--r--  include/linux/pci_ids.h                      |    4
-rw-r--r--  include/linux/scatterlist.h                  |   43
-rw-r--r--  include/linux/sched.h                        |   37
-rw-r--r--  init/Kconfig                                 |    1
-rw-r--r--  kernel/profile.c                             |    5
-rw-r--r--  kernel/sched.c                               |  330
-rw-r--r--  kernel/sched_fair.c                          |   48
-rw-r--r--  kernel/sched_idletask.c                      |   18
-rw-r--r--  kernel/sched_rt.c                            |   32
-rw-r--r--  kernel/user.c                                |    5
-rw-r--r--  lib/Kconfig.debug                            |    8
-rw-r--r--  net/core/skbuff.c                            |    9
-rw-r--r--  net/ieee80211/ieee80211_crypt_tkip.c         |   11
-rw-r--r--  net/ieee80211/ieee80211_crypt_wep.c          |    2
-rw-r--r--  net/sctp/auth.c                              |    4
-rw-r--r--  net/sctp/sm_make_chunk.c                     |    8
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_crypto.c        |   14
-rw-r--r--  net/sunrpc/xdr.c                             |    4
-rw-r--r--  net/xfrm/xfrm_algo.c                         |    9
102 files changed, 1262 insertions, 1195 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a13d69b2217d..8ae5fac08dfa 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1444,7 +1444,8 @@ and is between 256 and 4096 characters. It is defined in the file
 	Param: "schedule" - profile schedule points.
 	Param: <number> - step/bucket size as a power of 2 for
 		statistical time based profiling.
-	Param: "sleep" - profile D-state sleeping (millisecs)
+	Param: "sleep" - profile D-state sleeping (millisecs).
+		Requires CONFIG_SCHEDSTATS
 	Param: "kvm" - profile VM exits.
 
 	processor.max_cstate=	[HW,ACPI]
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 5bdc37f81842..f2668390e8f7 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -34,25 +34,24 @@
 #include <zlib.h>
 #include <assert.h>
 #include <sched.h>
-/*L:110 We can ignore the 30 include files we need for this program, but I do
- * want to draw attention to the use of kernel-style types.
- *
- * As Linus said, "C is a Spartan language, and so should your naming be." I
- * like these abbreviations and the header we need uses them, so we define them
- * here.
- */
-typedef unsigned long long u64;
-typedef uint32_t u32;
-typedef uint16_t u16;
-typedef uint8_t u8;
 #include "linux/lguest_launcher.h"
-#include "linux/pci_ids.h"
 #include "linux/virtio_config.h"
 #include "linux/virtio_net.h"
 #include "linux/virtio_blk.h"
 #include "linux/virtio_console.h"
 #include "linux/virtio_ring.h"
 #include "asm-x86/bootparam.h"
+/*L:110 We can ignore the 38 include files we need for this program, but I do
+ * want to draw attention to the use of kernel-style types.
+ *
+ * As Linus said, "C is a Spartan language, and so should your naming be." I
+ * like these abbreviations, so we define them here.  Note that u64 is always
+ * unsigned long long, which works on all Linux systems: this means that we can
+ * use %llu in printf for any u64. */
+typedef unsigned long long u64;
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
 /*:*/
 
 #define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
@@ -361,8 +360,8 @@ static unsigned long load_bzimage(int fd)
 }
 
 /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
- * come wrapped up in the self-decompressing "bzImage" format.  With some funky
- * coding, we can load those, too. */
+ * come wrapped up in the self-decompressing "bzImage" format.  With a little
+ * work, we can load those, too. */
 static unsigned long load_kernel(int fd)
 {
 	Elf32_Ehdr hdr;
@@ -465,6 +464,7 @@ static unsigned long setup_pagetables(unsigned long mem,
 	 * to know where it is. */
 	return to_guest_phys(pgdir);
 }
+/*:*/
 
 /* Simple routine to roll all the commandline arguments together with spaces
  * between them. */
@@ -481,9 +481,9 @@ static void concat(char *dst, char *args[])
 	dst[len] = '\0';
 }
 
-/* This is where we actually tell the kernel to initialize the Guest.  We saw
- * the arguments it expects when we looked at initialize() in lguest_user.c:
- * the base of guest "physical" memory, the top physical page to allow, the
+/*L:185 This is where we actually tell the kernel to initialize the Guest.  We
+ * saw the arguments it expects when we looked at initialize() in lguest_user.c:
+ * the base of Guest "physical" memory, the top physical page to allow, the
  * top level pagetable and the entry point for the Guest. */
 static int tell_kernel(unsigned long pgdir, unsigned long start)
 {
@@ -513,13 +513,14 @@ static void add_device_fd(int fd)
 /*L:200
  * The Waker.
  *
- * With a console and network devices, we can have lots of input which we need
- * to process.  We could try to tell the kernel what file descriptors to watch,
- * but handing a file descriptor mask through to the kernel is fairly icky.
+ * With console, block and network devices, we can have lots of input which we
+ * need to process.  We could try to tell the kernel what file descriptors to
+ * watch, but handing a file descriptor mask through to the kernel is fairly
+ * icky.
  *
  * Instead, we fork off a process which watches the file descriptors and writes
- * the LHREQ_BREAK command to the /dev/lguest filedescriptor to tell the Host
- * loop to stop running the Guest.  This causes it to return from the
+ * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
+ * stop running the Guest.  This causes the Launcher to return from the
  * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
  * the LHREQ_BREAK and wake us up again.
  *
@@ -545,7 +546,9 @@ static void wake_parent(int pipefd, int lguest_fd)
 		if (read(pipefd, &fd, sizeof(fd)) == 0)
 			exit(0);
 		/* Otherwise it's telling us to change what file
-		 * descriptors we're to listen to. */
+		 * descriptors we're to listen to.  Positive means
+		 * listen to a new one, negative means stop
+		 * listening. */
 		if (fd >= 0)
 			FD_SET(fd, &devices.infds);
 		else
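The comments above describe the whole Launcher/Waker protocol. As a reading aid, here is a hedged sketch (not part of the patch) of the loop wake_parent() implements; the real code keeps the fd set in the devices structure and does more careful error handling:

	/* Sketch: wait for input on any watched fd, then write LHREQ_BREAK to
	 * /dev/lguest so the Host stops running the Guest and the Launcher's
	 * read() returns -EAGAIN. */
	static void waker_loop(int pipefd, int lguest_fd, fd_set *infds)
	{
		for (;;) {
			fd_set rfds = *infds;
			unsigned long args[] = { LHREQ_BREAK, 1 };

			/* The parent uses the pipe to add/remove watched fds. */
			FD_SET(pipefd, &rfds);
			if (select(FD_SETSIZE, &rfds, NULL, NULL, NULL) < 0)
				exit(1);
			if (write(lguest_fd, args, sizeof(args)) < 0)
				exit(1);
		}
	}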
@@ -560,7 +563,7 @@ static int setup_waker(int lguest_fd)
 {
 	int pipefd[2], child;
 
-	/* We create a pipe to talk to the waker, and also so it knows when the
+	/* We create a pipe to talk to the Waker, and also so it knows when the
 	 * Launcher dies (and closes pipe). */
 	pipe(pipefd);
 	child = fork();
@@ -568,7 +571,8 @@ static int setup_waker(int lguest_fd)
 		err(1, "forking");
 
 	if (child == 0) {
-		/* Close the "writing" end of our copy of the pipe */
+		/* We are the Waker: close the "writing" end of our copy of the
+		 * pipe and start waiting for input. */
 		close(pipefd[1]);
 		wake_parent(pipefd[0], lguest_fd);
 	}
@@ -579,12 +583,12 @@ static int setup_waker(int lguest_fd)
 	return pipefd[1];
 }
 
-/*L:210
+/*
  * Device Handling.
  *
- * When the Guest sends DMA to us, it sends us an array of addresses and sizes.
+ * When the Guest gives us a buffer, it sends an array of addresses and sizes.
  * We need to make sure it's not trying to reach into the Launcher itself, so
- * we have a convenient routine which check it and exits with an error message
+ * we have a convenient routine which checks it and exits with an error message
  * if something funny is going on:
  */
 static void *_check_pointer(unsigned long addr, unsigned int size,
@@ -601,7 +605,9 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
 /* A macro which transparently hands the line number to the real function. */
 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
 
-/* This function returns the next descriptor in the chain, or vq->vring.num. */
+/* Each buffer in the virtqueues is actually a chain of descriptors.  This
+ * function returns the next descriptor in the chain, or vq->vring.num if we're
+ * at the end. */
 static unsigned next_desc(struct virtqueue *vq, unsigned int i)
 {
 	unsigned int next;
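Since get_vq_desc() is not shown in this hunk, here is a hedged sketch of how a descriptor chain is typically gathered into an iovec with next_desc(); the real get_vq_desc() also consumes the available ring and distinguishes readable from writable descriptors:

	/* Sketch: walk one descriptor chain, translating each guest-physical
	 * address with check_pointer() into an iovec entry. */
	static unsigned gather_chain(struct virtqueue *vq, unsigned head,
				     struct iovec iov[], unsigned *num)
	{
		unsigned i = head;

		*num = 0;
		do {
			iov[*num].iov_len = vq->vring.desc[i].len;
			iov[*num].iov_base = check_pointer(vq->vring.desc[i].addr,
							   vq->vring.desc[i].len);
			(*num)++;
			/* next_desc() returns vq->vring.num at chain's end. */
			i = next_desc(vq, i);
		} while (i != vq->vring.num);

		return head;
	}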
@@ -680,13 +686,14 @@ static unsigned get_vq_desc(struct virtqueue *vq,
 	return head;
 }
 
-/* Once we've used one of their buffers, we tell them about it.  We'll then
+/* After we've used one of their buffers, we tell them about it.  We'll then
  * want to send them an interrupt, using trigger_irq(). */
 static void add_used(struct virtqueue *vq, unsigned int head, int len)
 {
 	struct vring_used_elem *used;
 
-	/* Get a pointer to the next entry in the used ring. */
+	/* The virtqueue contains a ring of used buffers.  Get a pointer to the
+	 * next entry in that used ring. */
 	used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
 	used->id = head;
 	used->len = len;
@@ -700,6 +707,7 @@ static void trigger_irq(int fd, struct virtqueue *vq)
 {
 	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
 
+	/* If they don't want an interrupt, don't send one. */
 	if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 		return;
 
@@ -716,8 +724,11 @@ static void add_used_and_trigger(int fd, struct virtqueue *vq,
 	trigger_irq(fd, vq);
 }
 
-/* Here is the input terminal setting we save, and the routine to restore them
- * on exit so the user can see what they type next. */
+/*
+ * The Console
+ *
+ * Here is the input terminal setting we save, and the routine to restore them
+ * on exit so the user gets their terminal back. */
 static struct termios orig_term;
 static void restore_term(void)
 {
@@ -818,7 +829,10 @@ static void handle_console_output(int fd, struct virtqueue *vq)
 	}
 }
 
-/* Handling output for network is also simple: we get all the output buffers
+/*
+ * The Network
+ *
+ * Handling output for network is also simple: we get all the output buffers
  * and write them (ignoring the first element) to this device's file descriptor
  * (stdout). */
 static void handle_net_output(int fd, struct virtqueue *vq)
@@ -831,8 +845,9 @@ static void handle_net_output(int fd, struct virtqueue *vq)
 	while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
 		if (in)
 			errx(1, "Input buffers in output queue?");
-		/* Check header, but otherwise ignore it (we said we supported
-		 * no features). */
+		/* Check header, but otherwise ignore it (we told the Guest we
+		 * supported no features, so it shouldn't have anything
+		 * interesting). */
 		(void)convert(&iov[0], struct virtio_net_hdr);
 		len = writev(vq->dev->fd, iov+1, out-1);
 		add_used_and_trigger(fd, vq, head, len);
@@ -883,7 +898,8 @@ static bool handle_tun_input(int fd, struct device *dev)
 	return true;
 }
 
-/* This callback ensures we try again, in case we stopped console or net
+/*L:215 This is the callback attached to the network and console input
+ * virtqueues: it ensures we try again, in case we stopped console or net
  * delivery because Guest didn't have any buffers. */
 static void enable_fd(int fd, struct virtqueue *vq)
 {
@@ -919,7 +935,7 @@ static void handle_output(int fd, unsigned long addr)
 		       strnlen(from_guest_phys(addr), guest_limit - addr));
 }
 
-/* This is called when the waker wakes us up: check for incoming file
+/* This is called when the Waker wakes us up: check for incoming file
  * descriptors. */
 static void handle_input(int fd)
 {
@@ -986,8 +1002,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
 }
 
 /* Each device descriptor is followed by some configuration information.
- * The first byte is a "status" byte for the Guest to report what's happening.
- * After that are fields: u8 type, u8 len, [... len bytes...].
+ * Each configuration field looks like: u8 type, u8 len, [... len bytes...].
  *
  * This routine adds a new field to an existing device's descriptor.  It only
  * works for the last device, but that's OK because that's how we use it. */
@@ -1044,14 +1059,17 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 	/* Link virtqueue back to device. */
 	vq->dev = dev;
 
-	/* Set up handler. */
+	/* Set the routine to call when the Guest does something to this
+	 * virtqueue. */
 	vq->handle_output = handle_output;
+
+	/* Set the "Don't Notify Me" flag if we don't have a handler */
 	if (!handle_output)
 		vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
 }
 
 /* This routine does all the creation and setup of a new device, including
- * caling new_dev_desc() to allocate the descriptor and device memory. */
+ * calling new_dev_desc() to allocate the descriptor and device memory. */
 static struct device *new_device(const char *name, u16 type, int fd,
 				 bool (*handle_input)(int, struct device *))
 {
@@ -1060,7 +1078,7 @@ static struct device *new_device(const char *name, u16 type, int fd,
 	/* Append to device list.  Prepending to a single-linked list is
 	 * easier, but the user expects the devices to be arranged on the bus
 	 * in command-line order.  The first network device on the command line
-	 * is eth0, the first block device /dev/lgba, etc. */
+	 * is eth0, the first block device /dev/vda, etc. */
 	*devices.lastdev = dev;
 	dev->next = NULL;
 	devices.lastdev = &dev->next;
@@ -1104,7 +1122,7 @@ static void setup_console(void)
 	/* The console needs two virtqueues: the input then the output.  When
 	 * they put something the input queue, we make sure we're listening to
 	 * stdin.  When they put something in the output queue, we write it to
-	 * stdout.  */
+	 * stdout. */
 	add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
 	add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
 
@@ -1252,21 +1270,17 @@ static void setup_tun_net(const char *arg)
 	verbose("attached to bridge: %s\n", br_name);
 }
 
-
-/*
- * Block device.
+/* Our block (disk) device should be really simple: the Guest asks for a block
+ * number and we read or write that position in the file.  Unfortunately, that
+ * was amazingly slow: the Guest waits until the read is finished before
+ * running anything else, even if it could have been doing useful work.
  *
- * Serving a block device is really easy: the Guest asks for a block number and
- * we read or write that position in the file.
- *
- * Unfortunately, this is amazingly slow: the Guest waits until the read is
- * finished before running anything else, even if it could be doing useful
- * work.  We could use async I/O, except it's reputed to suck so hard that
- * characters actually go missing from your code when you try to use it.
+ * We could use async I/O, except it's reputed to suck so hard that characters
+ * actually go missing from your code when you try to use it.
  *
  * So we farm the I/O out to thread, and communicate with it via a pipe. */
 
-/* This hangs off device->priv, with the data. */
+/* This hangs off device->priv. */
 struct vblk_info
 {
 	/* The size of the file. */
@@ -1282,8 +1296,14 @@ struct vblk_info
 	 * Launcher triggers interrupt to Guest. */
 	int done_fd;
 };
+/*:*/
 
-/* This is the core of the I/O thread.  It returns true if it did something. */
+/*L:210
+ * The Disk
+ *
+ * Remember that the block device is handled by a separate I/O thread.  We head
+ * straight into the core of that thread here:
+ */
 static bool service_io(struct device *dev)
 {
 	struct vblk_info *vblk = dev->priv;
@@ -1294,10 +1314,14 @@ static bool service_io(struct device *dev)
 	struct iovec iov[dev->vq->vring.num];
 	off64_t off;
 
+	/* See if there's a request waiting.  If not, nothing to do. */
 	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
 	if (head == dev->vq->vring.num)
 		return false;
 
+	/* Every block request should contain at least one output buffer
+	 * (detailing the location on disk and the type of request) and one
+	 * input buffer (to hold the result). */
 	if (out_num == 0 || in_num == 0)
 		errx(1, "Bad virtblk cmd %u out=%u in=%u",
 		     head, out_num, in_num);
@@ -1306,10 +1330,15 @@ static bool service_io(struct device *dev)
 	in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
 	off = out->sector * 512;
 
-	/* This is how we implement barriers.  Pretty poor, no? */
+	/* The block device implements "barriers", where the Guest indicates
+	 * that it wants all previous writes to occur before this write.  We
+	 * don't have a way of asking our kernel to do a barrier, so we just
+	 * synchronize all the data in the file.  Pretty poor, no? */
 	if (out->type & VIRTIO_BLK_T_BARRIER)
 		fdatasync(vblk->fd);
 
+	/* In general the virtio block driver is allowed to try SCSI commands.
+	 * It'd be nice if we supported eject, for example, but we don't. */
 	if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
 		fprintf(stderr, "Scsi commands unsupported\n");
 		in->status = VIRTIO_BLK_S_UNSUPP;
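The comments added above describe the shape of every block request. For reference, the two headers involved, as declared in linux/virtio_blk.h of this era (shown only as a reading aid, not new API):

	struct virtio_blk_outhdr {	/* first, read-only descriptor */
		u32 type;		/* VIRTIO_BLK_T_IN/OUT, possibly ORed
					 * with _BARRIER or _SCSI_CMD */
		u32 ioprio;		/* priority hint */
		u64 sector;		/* 512-byte sector offset in the file */
	};

	struct virtio_blk_inhdr {	/* last, write-only descriptor */
		u8 status;		/* VIRTIO_BLK_S_OK, _IOERR or _UNSUPP */
	};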
@@ -1375,7 +1404,7 @@ static int io_thread(void *_dev)
 
 	/* When this read fails, it means Launcher died, so we follow. */
 	while (read(vblk->workpipe[0], &c, 1) == 1) {
-		/* We acknowledge each request immediately, to reduce latency,
+		/* We acknowledge each request immediately to reduce latency,
 		 * rather than waiting until we've done them all.  I haven't
 		 * measured to see if it makes any difference. */
 		while (service_io(dev))
@@ -1384,12 +1413,14 @@ static int io_thread(void *_dev)
 	return 0;
 }
 
-/* When the thread says some I/O is done, we interrupt the Guest. */
+/* Now we've seen the I/O thread, we return to the Launcher to see what happens
+ * when the thread tells us it's completed some I/O. */
 static bool handle_io_finish(int fd, struct device *dev)
 {
 	char c;
 
-	/* If child died, presumably it printed message. */
+	/* If the I/O thread died, presumably it printed the error, so we
+	 * simply exit. */
 	if (read(dev->fd, &c, 1) != 1)
 		exit(1);
 
@@ -1398,7 +1429,7 @@ static bool handle_io_finish(int fd, struct device *dev)
 	return true;
 }
 
-/* When the Guest submits some I/O, we wake the I/O thread. */
+/* When the Guest submits some I/O, we just need to wake the I/O thread. */
 static void handle_virtblk_output(int fd, struct virtqueue *vq)
 {
 	struct vblk_info *vblk = vq->dev->priv;
@@ -1410,7 +1441,7 @@ static void handle_virtblk_output(int fd, struct virtqueue *vq)
 		exit(1);
 }
 
-/* This creates a virtual block device. */
+/*L:198 This actually sets up a virtual block device. */
 static void setup_block_file(const char *filename)
 {
 	int p[2];
@@ -1426,7 +1457,7 @@ static void setup_block_file(const char *filename)
 	/* The device responds to return from I/O thread. */
 	dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
 
-	/* The device has a virtqueue. */
+	/* The device has one virtqueue, where the Guest places requests. */
 	add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
 
 	/* Allocate the room for our own bookkeeping */
@@ -1448,7 +1479,8 @@ static void setup_block_file(const char *filename)
 	/* The I/O thread writes to this end of the pipe when done. */
 	vblk->done_fd = p[1];
 
-	/* This is how we tell the I/O thread about more work. */
+	/* This is the second pipe, which is how we tell the I/O thread about
+	 * more work. */
 	pipe(vblk->workpipe);
 
 	/* Create stack for thread and run it */
@@ -1487,24 +1519,25 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
 			char reason[1024] = { 0 };
 			read(lguest_fd, reason, sizeof(reason)-1);
 			errx(1, "%s", reason);
-		/* EAGAIN means the waker wanted us to look at some input.
+		/* EAGAIN means the Waker wanted us to look at some input.
 		 * Anything else means a bug or incompatible change. */
 		} else if (errno != EAGAIN)
 			err(1, "Running guest failed");
 
-		/* Service input, then unset the BREAK which releases
-		 * the Waker. */
+		/* Service input, then unset the BREAK to release the Waker. */
 		handle_input(lguest_fd);
 		if (write(lguest_fd, args, sizeof(args)) < 0)
 			err(1, "Resetting break");
 	}
 }
 /*
- * This is the end of the Launcher.
+ * This is the end of the Launcher.  The good news: we are over halfway
+ * through!  The bad news: the most fiendish part of the code still lies ahead
+ * of us.
  *
- * But wait!  We've seen I/O from the Launcher, and we've seen I/O from the
- * Drivers.  If we were to see the Host kernel I/O code, our understanding
- * would be complete... :*/
+ * Are you ready?  Take a deep breath and join me in the core of the Host, in
+ * "make Host".
+ :*/
 
 static struct option opts[] = {
 	{ "verbose", 0, NULL, 'v' },
@@ -1527,7 +1560,7 @@ int main(int argc, char *argv[])
 	/* Memory, top-level pagetable, code startpoint and size of the
 	 * (optional) initrd. */
 	unsigned long mem = 0, pgdir, start, initrd_size = 0;
-	/* A temporary and the /dev/lguest file descriptor. */
+	/* Two temporaries and the /dev/lguest file descriptor. */
 	int i, c, lguest_fd;
 	/* The boot information for the Guest. */
 	struct boot_params *boot;
@@ -1622,6 +1655,7 @@ int main(int argc, char *argv[])
 	/* The boot header contains a command line pointer: we put the command
 	 * line after the boot header. */
 	boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
+	/* We use a simple helper to copy the arguments separated by spaces. */
 	concat((char *)(boot + 1), argv+optind+2);
 
 	/* Boot protocol version: 2.07 supports the fields for lguest. */
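Taken together, the block-device hunks above describe a two-pipe worker pattern: one pipe hands work to the I/O thread, the other reports completions back. A condensed sketch of that flow (names from the patch; error handling omitted):

	/* Launcher: the Guest kicked the virtqueue, so poke the I/O thread. */
	static void kick_io_thread(struct vblk_info *vblk)
	{
		char c = 0;
		write(vblk->workpipe[1], &c, 1);
	}

	/* I/O thread: service requests, signalling each completion so the
	 * Launcher's select() loop wakes and interrupts the Guest. */
	static int io_loop(struct device *dev, struct vblk_info *vblk)
	{
		char c;

		while (read(vblk->workpipe[0], &c, 1) == 1) {
			while (service_io(dev))
				write(vblk->done_fd, &c, 1);
		}
		return 0;
	}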
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index 153d84d281e6..f5a5e6d3d541 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -80,8 +80,6 @@ multicast.txt
 	- Behaviour of cards under Multicast
 ncsa-telnet
 	- notes on how NCSA telnet (DOS) breaks with MTU discovery enabled.
-net-modules.txt
-	- info and "insmod" parameters for all network driver modules.
 netdevices.txt
 	- info on network device driver functions exported to the kernel.
 olympic.txt
diff --git a/Documentation/networking/net-modules.txt b/Documentation/networking/net-modules.txt
deleted file mode 100644
index 98c4392dd0fd..000000000000
--- a/Documentation/networking/net-modules.txt
+++ /dev/null
@@ -1,315 +0,0 @@
-Wed 2-Aug-95 <matti.aarnio@utu.fi>
-
-		Linux network driver modules
-
-	Do not mistake this for "README.modules" at the top-level
-	directory!  That document tells about modules in general, while
-	this one tells only about network device driver modules.
-
-	This is a potpourri of INSMOD-time(*) configuration options
-	(if such exists) and their default values of various modules
-	in the Linux network drivers collection.
-
-	Some modules have also hidden (= non-documented) tunable values.
-	The choice of not documenting them is based on general belief, that
-	the less the user needs to know, the better.  (There are things that
-	driver developers can use, others should not confuse themselves.)
-
-	In many cases it is highly preferred that insmod:ing is done
-	ONLY with defining an explicit address for the card, AND BY
-	NOT USING AUTO-PROBING!
-
-	Now most cards have some explicitly defined base address that they
-	are compiled with (to avoid auto-probing, among other things).
-	If that compiled value does not match your actual configuration,
-	do use the "io=0xXXX" -parameter for the insmod, and give there
-	a value matching your environment.
-
-	If you are adventurous, you can ask the driver to autoprobe
-	by using the "io=0" parameter, however it is a potentially dangerous
-	thing to do in a live system.  (If you don't know where the
-	card is located, you can try autoprobing, and after possible
-	crash recovery, insmod with proper IO-address..)
-
-	--------------------------
-	(*) "INSMOD-time" means when you load module with
-	    /sbin/insmod you can feed it optional parameters.
-	    See "man insmod".
-	--------------------------
-
-
-	8390 based Network Modules	(Paul Gortmaker, Nov 12, 1995)
-	--------------------------
-
-(Includes: smc-ultra, ne, wd, 3c503, hp, hp-plus, e2100 and ac3200)
-
-The 8390 series of network drivers now support multiple card systems without
-reloading the same module multiple times (memory efficient!) This is done by
-specifying multiple comma separated values, such as:
-
-	insmod 3c503.o io=0x280,0x300,0x330,0x350 xcvr=0,1,0,1
-
-The above would have the one module controlling four 3c503 cards, with card 2
-and 4 using external transceivers. The "insmod" manual describes the usage
-of comma separated value lists.
-
-It is *STRONGLY RECOMMENDED* that you supply "io=" instead of autoprobing.
-If an "io=" argument is not supplied, then the ISA drivers will complain
-about autoprobing being not recommended, and begrudgingly autoprobe for
-a *SINGLE CARD ONLY* -- if you want to use multiple cards you *have* to
-supply an "io=0xNNN,0xQQQ,..." argument.
-
-The ne module is an exception to the above. A NE2000 is essentially an
-8390 chip, some bus glue and some RAM. Because of this, the ne probe is
-more invasive than the rest, and so at boot we make sure the ne probe is
-done last of all the 8390 cards (so that it won't trip over other 8390 based
-cards) With modules we can't ensure that all other non-ne 8390 cards have
-already been found. Because of this, the ne module REQUIRES an "io=0xNNN"
-argument passed in via insmod. It will refuse to autoprobe.
-
-It is also worth noting that auto-IRQ probably isn't as reliable during
-the flurry of interrupt activity on a running machine. Cards such as the
-ne2000 that can't get the IRQ setting from an EEPROM or configuration
-register are probably best supplied with an "irq=M" argument as well.
-
-
-----------------------------------------------------------------------
-Card/Module List - Configurable Parameters and Default Values
-----------------------------------------------------------------------
-
-3c501.c:
-	io = 0x280	IO base address
-	irq = 5		IRQ
-	(Probes ports: 0x280, 0x300)
-
-3c503.c:
-	io = 0		(It will complain if you don't supply an "io=0xNNN")
-	irq = 0		(IRQ software selected by driver using autoIRQ)
-	xcvr = 0	(Use xcvr=1 to select external transceiver.)
-	(Probes ports: 0x300, 0x310, 0x330, 0x350, 0x250, 0x280, 0x2A0, 0x2E0)
-
-3c505.c:
-	io = 0
-	irq = 0
-	dma = 6		(not autoprobed)
-	(Probes ports: 0x300, 0x280, 0x310)
-
-3c507.c:
-	io = 0x300
-	irq = 0
-	(Probes ports: 0x300, 0x320, 0x340, 0x280)
-
-3c509.c:
-	io = 0
-	irq = 0
-	( Module load-time probing Works reliably only on EISA, ISA ID-PROBE
-	  IS NOT RELIABLE!  Compile this driver statically into kernel for
-	  now, if you need it auto-probing on an ISA-bus machine. )
-
-8390.c:
-	(No public options, several other modules need this one)
-
-a2065.c:
-	Since this is a Zorro board, it supports full autoprobing, even for
-	multiple boards. (m68k/Amiga)
-
-ac3200.c:
-	io = 0		(Checks 0x1000 to 0x8fff in 0x1000 intervals)
-	irq = 0		(Read from config register)
-	(EISA probing..)
-
-apricot.c:
-	io = 0x300	(Can't be altered!)
-	irq = 10
-
-arcnet.c:
-	io = 0
-	irqnum = 0
-	shmem = 0
-	num = 0
-	DO SET THESE MANUALLY AT INSMOD!
-	(When probing, looks at the following possible addresses:
-	 Suggested ones:
-		0x300, 0x2E0, 0x2F0, 0x2D0
-	 Other ones:
-		0x200, 0x210, 0x220, 0x230, 0x240, 0x250, 0x260, 0x270,
-		0x280, 0x290, 0x2A0, 0x2B0, 0x2C0,
-		0x310, 0x320, 0x330, 0x340, 0x350, 0x360, 0x370,
-		0x380, 0x390, 0x3A0, 0x3E0, 0x3F0 )
-
-ariadne.c:
-	Since this is a Zorro board, it supports full autoprobing, even for
-	multiple boards. (m68k/Amiga)
-
-at1700.c:
-	io = 0x260
-	irq = 0
-	(Probes ports: 0x260, 0x280, 0x2A0, 0x240, 0x340, 0x320, 0x380, 0x300)
-
-atarilance.c:
-	Supports full autoprobing. (m68k/Atari)
-
-atp.c: *Not modularized*
-	(Probes ports: 0x378, 0x278, 0x3BC;
-	 fixed IRQs: 5 and 7 )
-
-cops.c:
-	io = 0x240
-	irq = 5
-	nodeid = 0	(AutoSelect = 0, NodeID 1-254 is hand selected.)
-	(Probes ports: 0x240, 0x340, 0x200, 0x210, 0x220, 0x230, 0x260,
-		       0x2A0, 0x300, 0x310, 0x320, 0x330, 0x350, 0x360)
-
-de4x5.c:
-	io = 0x000b
-	irq = 10
-	is_not_dec = 0	-- For non-DEC card using DEC 21040/21041/21140 chip, set this to 1
-	(EISA, and PCI probing)
-
-de600.c:
-	de600_debug = 0
-	(On port 0x378, irq 7 -- lpt1; compile time configurable)
-
-de620.c:
-	bnc = 0, utp = 0	<-- Force media by setting either.
-	io = 0x378	(also compile-time configurable)
-	irq = 7
-
-depca.c:
-	io = 0x200
-	irq = 7
-	(Probes ports:	ISA:  0x300, 0x200;
-			EISA: 0x0c00 )
-
-dummy.c:
-	No options
-
-e2100.c:
-	io = 0		(It will complain if you don't supply an "io=0xNNN")
-	irq = 0		(IRQ software selected by driver)
-	mem = 0		(Override default shared memory start of 0xd0000)
-	xcvr = 0	(Use xcvr=1 to select external transceiver.)
-	(Probes ports: 0x300, 0x280, 0x380, 0x220)
-
-eepro.c:
-	io = 0x200
-	irq = 0
-	(Probes ports: 0x200, 0x240, 0x280, 0x2C0, 0x300, 0x320, 0x340, 0x360)
-
-eexpress.c:
-	io = 0x300
-	irq = 0		(IRQ value read from EEPROM)
-	(Probes ports: 0x300, 0x270, 0x320, 0x340)
-
-eql.c:
-	(No parameters)
-
-ewrk3.c:
-	io = 0x300
-	irq = 5
-	(With module no autoprobing!
-	 On EISA-bus does EISA probing.
-	 Static linkage probes ports on ISA bus:
-	 0x100, 0x120, 0x140, 0x160, 0x180, 0x1A0, 0x1C0,
-	 0x200, 0x220, 0x240, 0x260, 0x280, 0x2A0, 0x2C0, 0x2E0,
-	 0x300, 0x340, 0x360, 0x380, 0x3A0, 0x3C0)
-
-hp-plus.c:
-	io = 0		(It will complain if you don't supply an "io=0xNNN")
-	irq = 0		(IRQ read from configuration register)
-	(Probes ports: 0x200, 0x240, 0x280, 0x2C0, 0x300, 0x320, 0x340)
-
-hp.c:
-	io = 0		(It will complain if you don't supply an "io=0xNNN")
-	irq = 0		(IRQ software selected by driver using autoIRQ)
-	(Probes ports: 0x300, 0x320, 0x340, 0x280, 0x2C0, 0x200, 0x240)
-
-hp100.c:
-	hp100_port = 0	(IO-base address)
-	(Does EISA-probing, if on EISA-slot;
-	 On ISA-bus probes all ports from 0x100 thru to 0x3E0
-	 in increments of 0x020)
-
-hydra.c:
-	Since this is a Zorro board, it supports full autoprobing, even for
-	multiple boards. (m68k/Amiga)
-
-ibmtr.c:
-	io = 0xa20, 0xa24	(autoprobed by default)
-	irq = 0		(driver cannot select irq - read from hardware)
-	mem = 0		(shared memory base set at 0xd0000 and not yet
-			 able to override thru mem= parameter.)
-
-lance.c: *Not modularized*
-	(PCI, and ISA probing; "CONFIG_PCI" needed for PCI support)
-	(Probes ISA ports: 0x300, 0x320, 0x340, 0x360)
-
-loopback.c: *Static kernel component*
-
-ne.c:
-	io = 0		(Explicitly *requires* an "io=0xNNN" value)
-	irq = 0		(Tries to determine configured IRQ via autoIRQ)
-	(Probes ports: 0x300, 0x280, 0x320, 0x340, 0x360)
-
-net_init.c: *Static kernel component*
-
-ni52.c: *Not modularized*
-	(Probes ports: 0x300, 0x280, 0x360, 0x320, 0x340
-	 mems:	0xD0000, 0xD2000, 0xC8000, 0xCA000,
-		0xD4000, 0xD6000, 0xD8000 )
-
-ni65.c: *Not modularized* **16MB MEMORY BARRIER BUG**
-	(Probes ports: 0x300, 0x320, 0x340, 0x360)
-
-pi2.c: *Not modularized* (well, NON-STANDARD modularization!)
-	Only one card supported at this time.
-	(Probes ports: 0x380, 0x300, 0x320, 0x340, 0x360, 0x3A0)
-
-plip.c:
-	io = 0
-	irq = 0		(by default, uses IRQ 5 for port at 0x3bc, IRQ 7
-			 for port at 0x378, and IRQ 2 for port at 0x278)
-	(Probes ports: 0x278, 0x378, 0x3bc)
-
-ppp.c:
-	No options (ppp-2.2+ has some, this is based on non-dynamic
-	version from ppp-2.1.2d)
-
-seeq8005.c: *Not modularized*
-	(Probes ports: 0x300, 0x320, 0x340, 0x360)
-
-skeleton.c: *Skeleton*
-
-slhc.c:
-	No configuration parameters
-
-slip.c:
-	slip_maxdev = 256	(default value from SL_NRUNIT on slip.h)
-
-
-smc-ultra.c:
-	io = 0		(It will complain if you don't supply an "io=0xNNN")
-	irq = 0		(IRQ val. read from EEPROM)
-	(Probes ports: 0x200, 0x220, 0x240, 0x280, 0x300, 0x340, 0x380)
-
-tulip.c: *Partial modularization*
-	(init-time memory allocation makes problems..)
-
-tunnel.c:
-	No insmod parameters
-
-wavelan.c:
-	io = 0x390	(Settable, but change not recommended)
-	irq = 0		(Not honoured, if changed..)
-
-wd.c:
-	io = 0		(It will complain if you don't supply an "io=0xNNN")
-	irq = 0		(IRQ val. read from EEPROM, ancient cards use autoIRQ)
-	mem = 0		(Force shared-memory on address 0xC8000, or whatever..)
-	mem_end = 0	(Force non-std. mem. size via supplying mem_end val.)
-			(eg. for 32k WD8003EBT, use mem=0xd0000 mem_end=0xd8000)
-	(Probes ports: 0x300, 0x280, 0x380, 0x240)
-
-znet.c: *Not modularized*
-	(Only one device on Zenith Z-Note (notebook?) systems,
-	 configuration information from (EE)PROM)
diff --git a/MAINTAINERS b/MAINTAINERS
index 188fbe4d0b9e..f985dfa5941c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2259,6 +2259,13 @@ L: legousb-devel@lists.sourceforge.net
 W:	http://legousb.sourceforge.net/
 S:	Maintained
 
+LGUEST
+P:	Rusty Russell
+M:	rusty@rustcorp.com.au
+L:	lguest@ozlabs.org
+W:	http://lguest.ozlabs.org/
+S:	Maintained
+
 LINUX FOR IBM pSERIES (RS/6000)
 P:	Paul Mackerras
 M:	paulus@au.ibm.com
diff --git a/arch/blackfin/kernel/dma-mapping.c b/arch/blackfin/kernel/dma-mapping.c
index a16cb03c5291..d6b61d56b656 100644
--- a/arch/blackfin/kernel/dma-mapping.c
+++ b/arch/blackfin/kernel/dma-mapping.c
@@ -35,6 +35,7 @@
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
 #include <linux/io.h>
+#include <linux/scatterlist.h>
 #include <asm/cacheflush.h>
 #include <asm/bfin-global.h>
 
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index ef490e1ce600..6f8c080dd9f9 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
@@ -9,10 +9,10 @@
 #include <linux/dma-mapping.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
+#include <linux/scatterlist.h>
 #include <linux/vmalloc.h>
 
 #include <asm/pgalloc.h>
-#include <asm/scatterlist.h>
 
 void *dma_alloc_coherent(struct device *dev, size_t size,
 			 dma_addr_t *handle, gfp_t flag)
diff --git a/arch/sparc64/kernel/iommu_common.c b/arch/sparc64/kernel/iommu_common.c
index b70324e0d83d..efd5dff85f60 100644
--- a/arch/sparc64/kernel/iommu_common.c
+++ b/arch/sparc64/kernel/iommu_common.c
@@ -234,7 +234,7 @@ unsigned long prepare_sg(struct scatterlist *sg, int nents)
 		dma_sg->dma_length = dent_len;
 
 		if (dma_sg != sg) {
-			dma_sg = next_sg(dma_sg);
+			dma_sg = sg_next(dma_sg);
 			dma_sg->dma_length = 0;
 		}
 
diff --git a/arch/sparc64/kernel/ldc.c b/arch/sparc64/kernel/ldc.c
index c8313cb60f0a..217478a94128 100644
--- a/arch/sparc64/kernel/ldc.c
+++ b/arch/sparc64/kernel/ldc.c
@@ -2121,7 +2121,7 @@ int ldc_map_sg(struct ldc_channel *lp,
 	state.nc = 0;
 
 	for (i = 0; i < num_sg; i++)
-		fill_cookies(&state, page_to_pfn(sg[i].page) << PAGE_SHIFT,
+		fill_cookies(&state, page_to_pfn(sg_page(&sg[i])) << PAGE_SHIFT,
 			     sg[i].offset, sg[i].length);
 
 	return state.nc;
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 3a8cd3dfb51c..e184b44b1011 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -35,6 +35,7 @@
35#include "linux/genhd.h" 35#include "linux/genhd.h"
36#include "linux/spinlock.h" 36#include "linux/spinlock.h"
37#include "linux/platform_device.h" 37#include "linux/platform_device.h"
38#include "linux/scatterlist.h"
38#include "asm/segment.h" 39#include "asm/segment.h"
39#include "asm/uaccess.h" 40#include "asm/uaccess.h"
40#include "asm/irq.h" 41#include "asm/irq.h"
@@ -704,6 +705,7 @@ static int ubd_add(int n, char **error_out)
 	ubd_dev->size = ROUND_BLOCK(ubd_dev->size);
 
 	INIT_LIST_HEAD(&ubd_dev->restart);
+	sg_init_table(&ubd_dev->sg, MAX_SG);
 
 	err = -ENOMEM;
 	ubd_dev->queue = blk_init_queue(do_ubd_request, &ubd_dev->lock);
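The architecture hunks above are all part of the same sweep: with chained scatterlists (see the include/linux/scatterlist.h entry in the diffstat), `sg + 1` and `sg->page` are no longer valid ways to reach the next entry or the page. A hedged sketch of the expected calling pattern (handle_page() is a hypothetical consumer):

	#include <linux/scatterlist.h>

	static void walk_sg(struct scatterlist *sgl, int nents)
	{
		struct scatterlist *s;
		int i;

		/* for_each_sg() advances with sg_next(), which follows chain
		 * links instead of assuming entries are contiguous in memory. */
		for_each_sg(sgl, s, nents, i)
			handle_page(sg_page(s), s->offset, s->length);
	}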
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index af0253f94a9a..8bb482ff091b 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -25,7 +25,7 @@
 #include <linux/kdebug.h>
 #include <asm/smp.h>
 
-#ifdef X86_32
+#ifdef CONFIG_X86_32
 #include <mach_ipi.h>
 #else
 #include <asm/mach_apic.h>
@@ -41,7 +41,7 @@ static int crash_nmi_callback(struct notifier_block *self,
 		unsigned long val, void *data)
 {
 	struct pt_regs *regs;
-#ifdef X86_32
+#ifdef CONFIG_X86_32
 	struct pt_regs fixed_regs;
 #endif
 	int cpu;
@@ -60,7 +60,7 @@ static int crash_nmi_callback(struct notifier_block *self,
 		return NOTIFY_STOP;
 	local_irq_disable();
 
-#ifdef X86_32
+#ifdef CONFIG_X86_32
 	if (!user_mode_vm(regs)) {
 		crash_fixup_ss_esp(&fixed_regs, regs);
 		regs = &fixed_regs;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c56e9ee64964..ae7e0161ce46 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -338,7 +338,6 @@ static int __dma_map_cont(struct scatterlist *start, int nelems,
 
 	BUG_ON(s != start && s->offset);
 	if (s == start) {
-		*sout = *s;
 		sout->dma_address = iommu_bus_base;
 		sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
 		sout->dma_length = s->length;
@@ -365,7 +364,7 @@ static inline int dma_map_cont(struct scatterlist *start, int nelems,
 {
 	if (!need) {
 		BUG_ON(nelems != 1);
-		*sout = *start;
+		sout->dma_address = start->dma_address;
 		sout->dma_length = start->length;
 		return 0;
 	}
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d2235db4085f..a55b0902f9d3 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -56,6 +56,7 @@
 #include <linux/lguest.h>
 #include <linux/lguest_launcher.h>
 #include <linux/virtio_console.h>
+#include <linux/pm.h>
 #include <asm/paravirt.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -98,7 +99,7 @@ static cycle_t clock_base;
  * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
  * them as a batch when lazy_mode is eventually turned off.  Because hypercalls
  * are reasonably expensive, batching them up makes sense.  For example, a
- * large mmap might update dozens of page table entries: that code calls
+ * large munmap might update dozens of page table entries: that code calls
  * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
  * lguest_leave_lazy_mode().
  *
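The comment being corrected here (mmap to munmap) describes the batching machinery around lazy_hcall(). Its shape, per the surrounding file, condensed as a reading aid rather than new API:

	static void lazy_hcall(unsigned long call, unsigned long arg1,
			       unsigned long arg2, unsigned long arg3)
	{
		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
			hcall(call, arg1, arg2, arg3);	    /* issue now */
		else
			async_hcall(call, arg1, arg2, arg3); /* queue for batch */
	}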
@@ -163,8 +164,8 @@ void async_hcall(unsigned long call,
 /*:*/
 
 /*G:033
- * Here are our first native-instruction replacements: four functions for
- * interrupt control.
+ * After that diversion we return to our first native-instruction
+ * replacements: four functions for interrupt control.
  *
  * The simplest way of implementing these would be to have "turn interrupts
  * off" and "turn interrupts on" hypercalls.  Unfortunately, this is too slow:
@@ -183,7 +184,7 @@ static unsigned long save_fl(void)
 	return lguest_data.irq_enabled;
 }
 
-/* "restore_flags" just sets the flags back to the value given. */
+/* restore_flags() just sets the flags back to the value given. */
 static void restore_fl(unsigned long flags)
 {
 	lguest_data.irq_enabled = flags;
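The four replacements this section refers to all come down to touching lguest_data.irq_enabled, a word the Host can inspect, instead of making a hypercall per operation. Sketched in the same pattern as save_fl()/restore_fl() above (illustrative, not a quote of the patch):

	static void irq_disable(void)
	{
		lguest_data.irq_enabled = 0;		/* the Host sees this */
	}

	static void irq_enable(void)
	{
		lguest_data.irq_enabled = X86_EFLAGS_IF; /* "interrupts on" */
	}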
@@ -356,7 +357,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
  * it.  The Host needs to know when the Guest wants to change them, so we have
  * a whole series of functions like read_cr0() and write_cr0().
  *
- * We start with CR0.  CR0 allows you to turn on and off all kinds of basic
+ * We start with cr0.  cr0 allows you to turn on and off all kinds of basic
  * features, but Linux only really cares about one: the horrifically-named Task
  * Switched (TS) bit at bit 3 (ie. 8)
  *
@@ -371,8 +372,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
 static unsigned long current_cr0, current_cr3;
 static void lguest_write_cr0(unsigned long val)
 {
-	/* 8 == TS bit. */
-	lazy_hcall(LHCALL_TS, val & 8, 0, 0);
+	lazy_hcall(LHCALL_TS, val & X86_CR0_TS, 0, 0);
 	current_cr0 = val;
 }
 
@@ -387,10 +387,10 @@ static unsigned long lguest_read_cr0(void)
 static void lguest_clts(void)
 {
 	lazy_hcall(LHCALL_TS, 0, 0, 0);
-	current_cr0 &= ~8U;
+	current_cr0 &= ~X86_CR0_TS;
 }
 
-/* CR2 is the virtual address of the last page fault, which the Guest only ever
+/* cr2 is the virtual address of the last page fault, which the Guest only ever
  * reads.  The Host kindly writes this into our "struct lguest_data", so we
  * just read it out of there. */
 static unsigned long lguest_read_cr2(void)
@@ -398,7 +398,7 @@ static unsigned long lguest_read_cr2(void)
 	return lguest_data.cr2;
 }
 
-/* CR3 is the current toplevel pagetable page: the principle is the same as
+/* cr3 is the current toplevel pagetable page: the principle is the same as
  * cr0.  Keep a local copy, and tell the Host when it changes. */
 static void lguest_write_cr3(unsigned long cr3)
 {
@@ -411,7 +411,7 @@ static unsigned long lguest_read_cr3(void)
 	return current_cr3;
 }
 
-/* CR4 is used to enable and disable PGE, but we don't care. */
+/* cr4 is used to enable and disable PGE, but we don't care. */
 static unsigned long lguest_read_cr4(void)
 {
 	return 0;
@@ -432,7 +432,7 @@ static void lguest_write_cr4(unsigned long val)
  * maps virtual addresses to physical addresses using "page tables".  We could
  * use one huge index of 1 million entries: each address is 4 bytes, so that's
  * 1024 pages just to hold the page tables.  But since most virtual addresses
- * are unused, we use a two level index which saves space.  The CR3 register
+ * are unused, we use a two level index which saves space.  The cr3 register
  * contains the physical address of the top level "page directory" page, which
  * contains physical addresses of up to 1024 second-level pages.  Each of these
  * second level pages contains up to 1024 physical addresses of actual pages,
@@ -440,7 +440,7 @@ static void lguest_write_cr4(unsigned long val)
440 * 440 *
441 * Here's a diagram, where arrows indicate physical addresses: 441 * Here's a diagram, where arrows indicate physical addresses:
442 * 442 *
443 * CR3 ---> +---------+ 443 * cr3 ---> +---------+
444 * | --------->+---------+ 444 * | --------->+---------+
445 * | | | PADDR1 | 445 * | | | PADDR1 |
446 * Top-level | | PADDR2 | 446 * Top-level | | PADDR2 |
@@ -498,8 +498,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
498 * 498 *
499 * ... except in early boot when the kernel sets up the initial pagetables, 499 * ... except in early boot when the kernel sets up the initial pagetables,
500 * which makes booting astonishingly slow. So we don't even tell the Host 500 * which makes booting astonishingly slow. So we don't even tell the Host
501 * anything changed until we've done the first page table switch. 501 * anything changed until we've done the first page table switch. */
502 */
503static void lguest_set_pte(pte_t *ptep, pte_t pteval) 502static void lguest_set_pte(pte_t *ptep, pte_t pteval)
504{ 503{
505 *ptep = pteval; 504 *ptep = pteval;
@@ -720,10 +719,10 @@ static void lguest_time_init(void)
720 /* Set up the timer interrupt (0) to go to our simple timer routine */ 719 /* Set up the timer interrupt (0) to go to our simple timer routine */
721 set_irq_handler(0, lguest_time_irq); 720 set_irq_handler(0, lguest_time_irq);
722 721
723 /* Our clock structure look like arch/i386/kernel/tsc.c if we can use 722 /* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can
724 * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either 723 * use the TSC, otherwise it's a dumb nanosecond-resolution clock.
725 * way, the "rating" is initialized so high that it's always chosen 724 * Either way, the "rating" is set so high that it's always chosen over
726 * over any other clocksource. */ 725 * any other clocksource. */
727 if (lguest_data.tsc_khz) 726 if (lguest_data.tsc_khz)
728 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, 727 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
729 lguest_clock.shift); 728 lguest_clock.shift);
@@ -749,7 +748,7 @@ static void lguest_time_init(void)
749 * to work. They're pretty simple. 748 * to work. They're pretty simple.
750 */ 749 */
751 750
752/* The Guest needs to tell the host what stack it expects traps to use. For 751/* The Guest needs to tell the Host what stack it expects traps to use. For
753 * native hardware, this is part of the Task State Segment mentioned above in 752 * native hardware, this is part of the Task State Segment mentioned above in
754 * lguest_load_tr_desc(), but to help hypervisors there's this special call. 753 * lguest_load_tr_desc(), but to help hypervisors there's this special call.
755 * 754 *
@@ -850,13 +849,16 @@ static __init char *lguest_memory_setup(void)
850 return "LGUEST"; 849 return "LGUEST";
851} 850}
852 851
853/* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to 852/* We will eventually use the virtio console device to produce console output,
854 * produce console output. */ 853 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
854 * console output. */
855static __init int early_put_chars(u32 vtermno, const char *buf, int count) 855static __init int early_put_chars(u32 vtermno, const char *buf, int count)
856{ 856{
857 char scratch[17]; 857 char scratch[17];
858 unsigned int len = count; 858 unsigned int len = count;
859 859
860 /* We use a nul-terminated string, so we have to make a copy. Icky,
861 * huh? */
860 if (len > sizeof(scratch) - 1) 862 if (len > sizeof(scratch) - 1)
861 len = sizeof(scratch) - 1; 863 len = sizeof(scratch) - 1;
862 scratch[len] = '\0'; 864 scratch[len] = '\0';
@@ -883,7 +885,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
883 * Our current solution is to allow the paravirt back end to optionally patch 885 * Our current solution is to allow the paravirt back end to optionally patch
884 * over the indirect calls to replace them with something more efficient. We 886 * over the indirect calls to replace them with something more efficient. We
885 * patch the four most commonly called functions: disable interrupts, enable 887 * patch the four most commonly called functions: disable interrupts, enable
886 * interrupts, restore interrupts and save interrupts. We usually have 10 888 * interrupts, restore interrupts and save interrupts. We usually have 6 or 10
887 * bytes to patch into: the Guest versions of these operations are small enough 889 * bytes to patch into: the Guest versions of these operations are small enough
888 * that we can fit comfortably. 890 * that we can fit comfortably.
889 * 891 *
@@ -1015,7 +1017,7 @@ __init void lguest_init(void)
1015 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1017 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
1016 1018
1017 /* The Host uses the top of the Guest's virtual address space for the 1019 /* The Host uses the top of the Guest's virtual address space for the
1018 * Host<->Guest Switcher, and it tells us how much it needs in 1020 * Host<->Guest Switcher, and it tells us how big that is in
1019 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ 1021 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
1020 reserve_top_address(lguest_data.reserve_mem); 1022 reserve_top_address(lguest_data.reserve_mem);
1021 1023
@@ -1065,6 +1067,6 @@ __init void lguest_init(void)
1065/* 1067/*
1066 * This marks the end of stage II of our journey, The Guest. 1068 * This marks the end of stage II of our journey, The Guest.
1067 * 1069 *
1068 * It is now time for us to explore the nooks and crannies of the three Guest 1070 * It is now time for us to explore the layer of virtual drivers and complete
1069 * devices and complete our understanding of the Guest in "make Drivers". 1071 * our understanding of the Guest in "make Drivers".
1070 */ 1072 */
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index ebc6ac733899..95b6fbcded63 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -6,7 +6,7 @@
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7 7
8/*G:020 This is where we begin: head.S notes that the boot header's platform 8/*G:020 This is where we begin: head.S notes that the boot header's platform
9 * type field is "1" (lguest), so calls us here. The boot header is in %esi. 9 * type field is "1" (lguest), so calls us here.
10 * 10 *
11 * WARNING: be very careful here! We're running at addresses equal to physical 11 * WARNING: be very careful here! We're running at addresses equal to physical
 12 * addresses (around 0), not above PAGE_OFFSET as most code expects 12 * addresses (around 0), not above PAGE_OFFSET as most code expects
@@ -17,13 +17,15 @@
17 * boot. */ 17 * boot. */
18.section .init.text, "ax", @progbits 18.section .init.text, "ax", @progbits
19ENTRY(lguest_entry) 19ENTRY(lguest_entry)
20 /* Make initial hypercall now, so we can set up the pagetables. */ 20 /* We make the "initialization" hypercall now to tell the Host about
21 * us, and also find out where it put our page tables. */
21 movl $LHCALL_LGUEST_INIT, %eax 22 movl $LHCALL_LGUEST_INIT, %eax
22 movl $lguest_data - __PAGE_OFFSET, %edx 23 movl $lguest_data - __PAGE_OFFSET, %edx
23 int $LGUEST_TRAP_ENTRY 24 int $LGUEST_TRAP_ENTRY
24 25
25 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl 26 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
 26 * instruction uses %esi implicitly. */ 27 * instruction uses %esi implicitly as the source for the copy we're
28 * about to do. */
27 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi 29 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
28 30
29 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries. 31 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
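For reference, the C-level wrapper for the trap this assembly issues looks roughly like the following sketch (register bindings match the movl sequence above: call number in %eax, first argument in %edx; __stringify() comes from linux/stringify.h):

	static inline unsigned long hcall(unsigned long call, unsigned long arg1,
					  unsigned long arg2, unsigned long arg3)
	{
		/* "int" triggers the trap the Host catches; the call number
		 * goes in via %eax and the result comes back the same way. */
		asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
			     : "=a"(call)
			     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
			     : "memory");
		return call;
	}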
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
index 503dfc05111b..33563ee8eb0f 100644
--- a/arch/x86/mm/fault_32.c
+++ b/arch/x86/mm/fault_32.c
@@ -550,7 +550,7 @@ no_context:
550 page &= PAGE_MASK; 550 page &= PAGE_MASK;
551 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) 551 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
552 & (PTRS_PER_PMD - 1)]; 552 & (PTRS_PER_PMD - 1)];
553 printk(KERN_ALERT "*pde = %016Lx ", page); 553 printk(KERN_CONT "*pde = %016Lx ", page);
554 page &= ~_PAGE_NX; 554 page &= ~_PAGE_NX;
555 } 555 }
556#else 556#else
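The KERN_ALERT to KERN_CONT change matters because this printk() continues a line opened by the printk() just before it; with KERN_ALERT it would start a fresh log record mid-message. The general pattern, with illustrative values (pdpt stands in for whatever the preceding code printed):

	printk(KERN_ALERT "*pdpt = %016Lx ", pdpt);	/* opens the line */
	printk(KERN_CONT "*pde = %016Lx ", page);	/* appends to it  */
	printk(KERN_CONT "\n");				/* finishes it    */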
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index de5ba479c224..b01dee3ae7f3 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1366,9 +1366,7 @@ new_segment:
1366 sg = sg_next(sg); 1366 sg = sg_next(sg);
1367 } 1367 }
1368 1368
1369 sg_set_page(sg, bvec->bv_page); 1369 sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
1370 sg->length = nbytes;
1371 sg->offset = bvec->bv_offset;
1372 nsegs++; 1370 nsegs++;
1373 } 1371 }
1374 bvprv = bvec; 1372 bvprv = bvec;
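This hunk, like most of the sg_set_page() conversions below, tracks the scatterlist API change: the helper now takes length and offset, so callers stop poking sg->length and sg->offset by hand (which would bypass the marker bits that chained sg entries now carry). The helper's shape in the new include/linux/scatterlist.h is, roughly:

	static inline void sg_set_page(struct scatterlist *sg, struct page *page,
				       unsigned int len, unsigned int offset)
	{
		sg_assign_page(sg, page);	/* preserves chain/end markers */
		sg->offset = offset;
		sg->length = len;
	}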
diff --git a/crypto/hmac.c b/crypto/hmac.c
index e4eb6ac53b5c..6691981bda11 100644
--- a/crypto/hmac.c
+++ b/crypto/hmac.c
@@ -160,8 +160,7 @@ static int hmac_digest(struct hash_desc *pdesc, struct scatterlist *sg,
160 160
161 sg_set_buf(sg1, ipad, bs); 161 sg_set_buf(sg1, ipad, bs);
162 162
163 sg_set_page(&sg[1], (void *) sg); 163 sg_set_page(&sg[1], (void *) sg, 0, 0);
164 sg1[1].length = 0;
165 sg_set_buf(sg2, opad, bs + ds); 164 sg_set_buf(sg2, opad, bs + ds);
166 165
167 err = crypto_hash_digest(&desc, sg1, nbytes + bs, digest); 166 err = crypto_hash_digest(&desc, sg1, nbytes + bs, digest);
diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/sleep/proc.c
index 3839efd5eaea..1538355c266b 100644
--- a/drivers/acpi/sleep/proc.c
+++ b/drivers/acpi/sleep/proc.c
@@ -194,6 +194,23 @@ static int get_date_field(char **p, u32 * value)
194 return result; 194 return result;
195} 195}
196 196
197/* Read a possibly BCD register, always return binary */
198static u32 cmos_bcd_read(int offset, int rtc_control)
199{
200 u32 val = CMOS_READ(offset);
201 if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
202 BCD_TO_BIN(val);
203 return val;
204}
205
206/* Write binary value into possibly BCD register */
207static void cmos_bcd_write(u32 val, int offset, int rtc_control)
208{
209 if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
210 BIN_TO_BCD(val);
211 CMOS_WRITE(val, offset);
212}
213
197static ssize_t 214static ssize_t
198acpi_system_write_alarm(struct file *file, 215acpi_system_write_alarm(struct file *file,
199 const char __user * buffer, size_t count, loff_t * ppos) 216 const char __user * buffer, size_t count, loff_t * ppos)
@@ -258,35 +275,18 @@ acpi_system_write_alarm(struct file *file,
258 spin_lock_irq(&rtc_lock); 275 spin_lock_irq(&rtc_lock);
259 276
260 rtc_control = CMOS_READ(RTC_CONTROL); 277 rtc_control = CMOS_READ(RTC_CONTROL);
261 if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
262 BIN_TO_BCD(yr);
263 BIN_TO_BCD(mo);
264 BIN_TO_BCD(day);
265 BIN_TO_BCD(hr);
266 BIN_TO_BCD(min);
267 BIN_TO_BCD(sec);
268 }
269 278
270 if (adjust) { 279 if (adjust) {
271 yr += CMOS_READ(RTC_YEAR); 280 yr += cmos_bcd_read(RTC_YEAR, rtc_control);
272 mo += CMOS_READ(RTC_MONTH); 281 mo += cmos_bcd_read(RTC_MONTH, rtc_control);
273 day += CMOS_READ(RTC_DAY_OF_MONTH); 282 day += cmos_bcd_read(RTC_DAY_OF_MONTH, rtc_control);
274 hr += CMOS_READ(RTC_HOURS); 283 hr += cmos_bcd_read(RTC_HOURS, rtc_control);
275 min += CMOS_READ(RTC_MINUTES); 284 min += cmos_bcd_read(RTC_MINUTES, rtc_control);
276 sec += CMOS_READ(RTC_SECONDS); 285 sec += cmos_bcd_read(RTC_SECONDS, rtc_control);
277 } 286 }
278 287
279 spin_unlock_irq(&rtc_lock); 288 spin_unlock_irq(&rtc_lock);
280 289
281 if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
282 BCD_TO_BIN(yr);
283 BCD_TO_BIN(mo);
284 BCD_TO_BIN(day);
285 BCD_TO_BIN(hr);
286 BCD_TO_BIN(min);
287 BCD_TO_BIN(sec);
288 }
289
290 if (sec > 59) { 290 if (sec > 59) {
291 min++; 291 min++;
292 sec -= 60; 292 sec -= 60;
@@ -307,14 +307,6 @@ acpi_system_write_alarm(struct file *file,
307 yr++; 307 yr++;
308 mo -= 12; 308 mo -= 12;
309 } 309 }
310 if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
311 BIN_TO_BCD(yr);
312 BIN_TO_BCD(mo);
313 BIN_TO_BCD(day);
314 BIN_TO_BCD(hr);
315 BIN_TO_BCD(min);
316 BIN_TO_BCD(sec);
317 }
318 310
319 spin_lock_irq(&rtc_lock); 311 spin_lock_irq(&rtc_lock);
320 /* 312 /*
@@ -326,9 +318,9 @@ acpi_system_write_alarm(struct file *file,
326 CMOS_READ(RTC_INTR_FLAGS); 318 CMOS_READ(RTC_INTR_FLAGS);
327 319
328 /* write the fields the rtc knows about */ 320 /* write the fields the rtc knows about */
329 CMOS_WRITE(hr, RTC_HOURS_ALARM); 321 cmos_bcd_write(hr, RTC_HOURS_ALARM, rtc_control);
330 CMOS_WRITE(min, RTC_MINUTES_ALARM); 322 cmos_bcd_write(min, RTC_MINUTES_ALARM, rtc_control);
331 CMOS_WRITE(sec, RTC_SECONDS_ALARM); 323 cmos_bcd_write(sec, RTC_SECONDS_ALARM, rtc_control);
332 324
333 /* 325 /*
334 * If the system supports an enhanced alarm it will have non-zero 326 * If the system supports an enhanced alarm it will have non-zero
@@ -336,11 +328,11 @@ acpi_system_write_alarm(struct file *file,
336 * to the RTC area of memory. 328 * to the RTC area of memory.
337 */ 329 */
338 if (acpi_gbl_FADT.day_alarm) 330 if (acpi_gbl_FADT.day_alarm)
339 CMOS_WRITE(day, acpi_gbl_FADT.day_alarm); 331 cmos_bcd_write(day, acpi_gbl_FADT.day_alarm, rtc_control);
340 if (acpi_gbl_FADT.month_alarm) 332 if (acpi_gbl_FADT.month_alarm)
341 CMOS_WRITE(mo, acpi_gbl_FADT.month_alarm); 333 cmos_bcd_write(mo, acpi_gbl_FADT.month_alarm, rtc_control);
342 if (acpi_gbl_FADT.century) 334 if (acpi_gbl_FADT.century)
343 CMOS_WRITE(yr / 100, acpi_gbl_FADT.century); 335 cmos_bcd_write(yr / 100, acpi_gbl_FADT.century, rtc_control);
344 /* enable the rtc alarm interrupt */ 336 /* enable the rtc alarm interrupt */
345 rtc_control |= RTC_AIE; 337 rtc_control |= RTC_AIE;
346 CMOS_WRITE(rtc_control, RTC_CONTROL); 338 CMOS_WRITE(rtc_control, RTC_CONTROL);
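The helpers above fold the repeated "convert if this RTC register is in BCD mode" dance into one place. For readers unfamiliar with BCD: each byte stores two decimal digits, so 59 seconds sits in a register as 0x59. The conversion macros are roughly as follows (per include/linux/bcd.h of this era; shown for illustration):

	#define BCD2BIN(val)	(((val) & 0x0f) + ((val) >> 4) * 10)
	#define BIN2BCD(val)	((((val) / 10) << 4) + (val) % 10)

	u32 sec = 0x59;		/* BCD-encoded "59" as read from CMOS */
	sec = BCD2BIN(sec);	/* sec == 59, plain binary            */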
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 95229e77bffe..49cf4cf1a5a2 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -41,6 +41,7 @@
41#include <linux/interrupt.h> 41#include <linux/interrupt.h>
42#include <linux/dma-mapping.h> 42#include <linux/dma-mapping.h>
43#include <linux/device.h> 43#include <linux/device.h>
44#include <linux/dmi.h>
44#include <scsi/scsi_host.h> 45#include <scsi/scsi_host.h>
45#include <scsi/scsi_cmnd.h> 46#include <scsi/scsi_cmnd.h>
46#include <linux/libata.h> 47#include <linux/libata.h>
@@ -241,6 +242,7 @@ static void ahci_pmp_attach(struct ata_port *ap);
241static void ahci_pmp_detach(struct ata_port *ap); 242static void ahci_pmp_detach(struct ata_port *ap);
242static void ahci_error_handler(struct ata_port *ap); 243static void ahci_error_handler(struct ata_port *ap);
243static void ahci_vt8251_error_handler(struct ata_port *ap); 244static void ahci_vt8251_error_handler(struct ata_port *ap);
245static void ahci_p5wdh_error_handler(struct ata_port *ap);
244static void ahci_post_internal_cmd(struct ata_queued_cmd *qc); 246static void ahci_post_internal_cmd(struct ata_queued_cmd *qc);
245static int ahci_port_resume(struct ata_port *ap); 247static int ahci_port_resume(struct ata_port *ap);
246static unsigned int ahci_fill_sg(struct ata_queued_cmd *qc, void *cmd_tbl); 248static unsigned int ahci_fill_sg(struct ata_queued_cmd *qc, void *cmd_tbl);
@@ -339,6 +341,40 @@ static const struct ata_port_operations ahci_vt8251_ops = {
339 .port_stop = ahci_port_stop, 341 .port_stop = ahci_port_stop,
340}; 342};
341 343
344static const struct ata_port_operations ahci_p5wdh_ops = {
345 .check_status = ahci_check_status,
346 .check_altstatus = ahci_check_status,
347 .dev_select = ata_noop_dev_select,
348
349 .tf_read = ahci_tf_read,
350
351 .qc_defer = sata_pmp_qc_defer_cmd_switch,
352 .qc_prep = ahci_qc_prep,
353 .qc_issue = ahci_qc_issue,
354
355 .irq_clear = ahci_irq_clear,
356
357 .scr_read = ahci_scr_read,
358 .scr_write = ahci_scr_write,
359
360 .freeze = ahci_freeze,
361 .thaw = ahci_thaw,
362
363 .error_handler = ahci_p5wdh_error_handler,
364 .post_internal_cmd = ahci_post_internal_cmd,
365
366 .pmp_attach = ahci_pmp_attach,
367 .pmp_detach = ahci_pmp_detach,
368
369#ifdef CONFIG_PM
370 .port_suspend = ahci_port_suspend,
371 .port_resume = ahci_port_resume,
372#endif
373
374 .port_start = ahci_port_start,
375 .port_stop = ahci_port_stop,
376};
377
342#define AHCI_HFLAGS(flags) .private_data = (void *)(flags) 378#define AHCI_HFLAGS(flags) .private_data = (void *)(flags)
343 379
344static const struct ata_port_info ahci_port_info[] = { 380static const struct ata_port_info ahci_port_info[] = {
@@ -1213,6 +1249,53 @@ static int ahci_vt8251_hardreset(struct ata_link *link, unsigned int *class,
1213 return rc ?: -EAGAIN; 1249 return rc ?: -EAGAIN;
1214} 1250}
1215 1251
1252static int ahci_p5wdh_hardreset(struct ata_link *link, unsigned int *class,
1253 unsigned long deadline)
1254{
1255 struct ata_port *ap = link->ap;
1256 struct ahci_port_priv *pp = ap->private_data;
1257 u8 *d2h_fis = pp->rx_fis + RX_FIS_D2H_REG;
1258 struct ata_taskfile tf;
1259 int rc;
1260
1261 ahci_stop_engine(ap);
1262
1263 /* clear D2H reception area to properly wait for D2H FIS */
1264 ata_tf_init(link->device, &tf);
1265 tf.command = 0x80;
1266 ata_tf_to_fis(&tf, 0, 0, d2h_fis);
1267
1268 rc = sata_link_hardreset(link, sata_ehc_deb_timing(&link->eh_context),
1269 deadline);
1270
1271 ahci_start_engine(ap);
1272
1273 if (rc || ata_link_offline(link))
1274 return rc;
1275
1276 /* spec mandates ">= 2ms" before checking status */
1277 msleep(150);
1278
1279 /* The pseudo configuration device on SIMG4726 attached to
1280 * ASUS P5W-DH Deluxe doesn't send signature FIS after
1281 * hardreset if no device is attached to the first downstream
1282 * port && the pseudo device locks up on SRST w/ PMP==0. To
1283 * work around this, wait for !BSY only briefly. If BSY isn't
1284 * cleared, perform CLO and proceed to IDENTIFY (achieved by
1285 * ATA_LFLAG_NO_SRST and ATA_LFLAG_ASSUME_ATA).
1286 *
1287 * Wait for two seconds. Devices attached to downstream port
1288 * which can't process the following IDENTIFY after this will
1289 * have to be reset again. For most cases, this should
1290 * suffice while making probing snappish enough.
1291 */
1292 rc = ata_wait_ready(ap, jiffies + 2 * HZ);
1293 if (rc)
1294 ahci_kick_engine(ap, 0);
1295
1296 return 0;
1297}
1298
1216static void ahci_postreset(struct ata_link *link, unsigned int *class) 1299static void ahci_postreset(struct ata_link *link, unsigned int *class)
1217{ 1300{
1218 struct ata_port *ap = link->ap; 1301 struct ata_port *ap = link->ap;
@@ -1670,6 +1753,19 @@ static void ahci_vt8251_error_handler(struct ata_port *ap)
1670 ahci_postreset); 1753 ahci_postreset);
1671} 1754}
1672 1755
1756static void ahci_p5wdh_error_handler(struct ata_port *ap)
1757{
1758 if (!(ap->pflags & ATA_PFLAG_FROZEN)) {
1759 /* restart engine */
1760 ahci_stop_engine(ap);
1761 ahci_start_engine(ap);
1762 }
1763
1764 /* perform recovery */
1765 ata_do_eh(ap, ata_std_prereset, ahci_softreset, ahci_p5wdh_hardreset,
1766 ahci_postreset);
1767}
1768
1673static void ahci_post_internal_cmd(struct ata_queued_cmd *qc) 1769static void ahci_post_internal_cmd(struct ata_queued_cmd *qc)
1674{ 1770{
1675 struct ata_port *ap = qc->ap; 1771 struct ata_port *ap = qc->ap;
@@ -1955,6 +2051,51 @@ static void ahci_print_info(struct ata_host *host)
1955 ); 2051 );
1956} 2052}
1957 2053
2054/* On ASUS P5W DH Deluxe, the second port of PCI device 00:1f.2 is
2055 * hardwired to on-board SIMG 4726. The chipset is ICH8 and doesn't
2056 * support PMP and the 4726 either directly exports the device
2057 * attached to the first downstream port or acts as a hardware storage
 2058 * controller and emulates a single ATA device (can be RAID 0/1 or some
2059 * other configuration).
2060 *
2061 * When there's no device attached to the first downstream port of the
2062 * 4726, "Config Disk" appears, which is a pseudo ATA device to
2063 * configure the 4726. However, ATA emulation of the device is very
2064 * lame. It doesn't send signature D2H Reg FIS after the initial
 2065 * hardreset, pukes on SRST w/ PMP==0 and has a bunch of other issues.
2066 *
2067 * The following function works around the problem by always using
2068 * hardreset on the port and not depending on receiving signature FIS
2069 * afterward. If signature FIS isn't received soon, ATA class is
2070 * assumed without follow-up softreset.
2071 */
2072static void ahci_p5wdh_workaround(struct ata_host *host)
2073{
2074 static struct dmi_system_id sysids[] = {
2075 {
2076 .ident = "P5W DH Deluxe",
2077 .matches = {
2078 DMI_MATCH(DMI_SYS_VENDOR,
2079 "ASUSTEK COMPUTER INC"),
2080 DMI_MATCH(DMI_PRODUCT_NAME, "P5W DH Deluxe"),
2081 },
2082 },
2083 { }
2084 };
2085 struct pci_dev *pdev = to_pci_dev(host->dev);
2086
2087 if (pdev->bus->number == 0 && pdev->devfn == PCI_DEVFN(0x1f, 2) &&
2088 dmi_check_system(sysids)) {
2089 struct ata_port *ap = host->ports[1];
2090
2091 dev_printk(KERN_INFO, &pdev->dev, "enabling ASUS P5W DH "
2092 "Deluxe on-board SIMG4726 workaround\n");
2093
2094 ap->ops = &ahci_p5wdh_ops;
2095 ap->link.flags |= ATA_LFLAG_NO_SRST | ATA_LFLAG_ASSUME_ATA;
2096 }
2097}
2098
1958static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) 2099static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
1959{ 2100{
1960 static int printed_version; 2101 static int printed_version;
@@ -2024,6 +2165,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
2024 ap->ops = &ata_dummy_port_ops; 2165 ap->ops = &ata_dummy_port_ops;
2025 } 2166 }
2026 2167
2168 /* apply workaround for ASUS P5W DH Deluxe mainboard */
2169 ahci_p5wdh_workaround(host);
2170
2027 /* initialize adapter */ 2171 /* initialize adapter */
2028 rc = ahci_configure_dma_masks(pdev, hpriv->cap & HOST_CAP_64); 2172 rc = ahci_configure_dma_masks(pdev, hpriv->cap & HOST_CAP_64);
2029 if (rc) 2173 if (rc)
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 2d147b51c978..081e3dfb64d4 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -68,7 +68,8 @@ const unsigned long sata_deb_timing_long[] = { 100, 2000, 5000 };
68static unsigned int ata_dev_init_params(struct ata_device *dev, 68static unsigned int ata_dev_init_params(struct ata_device *dev,
69 u16 heads, u16 sectors); 69 u16 heads, u16 sectors);
70static unsigned int ata_dev_set_xfermode(struct ata_device *dev); 70static unsigned int ata_dev_set_xfermode(struct ata_device *dev);
71static unsigned int ata_dev_set_AN(struct ata_device *dev, u8 enable); 71static unsigned int ata_dev_set_feature(struct ata_device *dev,
72 u8 enable, u8 feature);
72static void ata_dev_xfermask(struct ata_device *dev); 73static void ata_dev_xfermask(struct ata_device *dev);
73static unsigned long ata_dev_blacklisted(const struct ata_device *dev); 74static unsigned long ata_dev_blacklisted(const struct ata_device *dev);
74 75
@@ -1799,13 +1800,7 @@ int ata_dev_read_id(struct ata_device *dev, unsigned int *p_class,
1799 * SET_FEATURES spin-up subcommand before it will accept 1800 * SET_FEATURES spin-up subcommand before it will accept
1800 * anything other than the original IDENTIFY command. 1801 * anything other than the original IDENTIFY command.
1801 */ 1802 */
1802 ata_tf_init(dev, &tf); 1803 err_mask = ata_dev_set_feature(dev, SETFEATURES_SPINUP, 0);
1803 tf.command = ATA_CMD_SET_FEATURES;
1804 tf.feature = SETFEATURES_SPINUP;
1805 tf.protocol = ATA_PROT_NODATA;
1806 tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
1807 err_mask = ata_exec_internal(dev, &tf, NULL,
1808 DMA_NONE, NULL, 0, 0);
1809 if (err_mask && id[2] != 0x738c) { 1804 if (err_mask && id[2] != 0x738c) {
1810 rc = -EIO; 1805 rc = -EIO;
1811 reason = "SPINUP failed"; 1806 reason = "SPINUP failed";
@@ -2075,7 +2070,8 @@ int ata_dev_configure(struct ata_device *dev)
2075 unsigned int err_mask; 2070 unsigned int err_mask;
2076 2071
2077 /* issue SET feature command to turn this on */ 2072 /* issue SET feature command to turn this on */
2078 err_mask = ata_dev_set_AN(dev, SETFEATURES_SATA_ENABLE); 2073 err_mask = ata_dev_set_feature(dev,
2074 SETFEATURES_SATA_ENABLE, SATA_AN);
2079 if (err_mask) 2075 if (err_mask)
2080 ata_dev_printk(dev, KERN_ERR, 2076 ata_dev_printk(dev, KERN_ERR,
2081 "failed to enable ATAPI AN " 2077 "failed to enable ATAPI AN "
@@ -2886,6 +2882,13 @@ static int ata_dev_set_mode(struct ata_device *dev)
2886 dev->pio_mode <= XFER_PIO_2) 2882 dev->pio_mode <= XFER_PIO_2)
2887 err_mask &= ~AC_ERR_DEV; 2883 err_mask &= ~AC_ERR_DEV;
2888 2884
2885 /* Early MWDMA devices do DMA but don't allow DMA mode setting.
2886 Don't fail an MWDMA0 set IFF the device indicates it is in MWDMA0 */
2887 if (dev->xfer_shift == ATA_SHIFT_MWDMA &&
2888 dev->dma_mode == XFER_MW_DMA_0 &&
2889 (dev->id[63] >> 8) & 1)
2890 err_mask &= ~AC_ERR_DEV;
2891
2889 if (err_mask) { 2892 if (err_mask) {
2890 ata_dev_printk(dev, KERN_ERR, "failed to set xfermode " 2893 ata_dev_printk(dev, KERN_ERR, "failed to set xfermode "
2891 "(err_mask=0x%x)\n", err_mask); 2894 "(err_mask=0x%x)\n", err_mask);
@@ -3947,9 +3950,6 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
3947 { "_NEC DV5800A", NULL, ATA_HORKAGE_NODMA }, 3950 { "_NEC DV5800A", NULL, ATA_HORKAGE_NODMA },
3948 { "SAMSUNG CD-ROM SN-124", "N001", ATA_HORKAGE_NODMA }, 3951 { "SAMSUNG CD-ROM SN-124", "N001", ATA_HORKAGE_NODMA },
3949 { "Seagate STT20000A", NULL, ATA_HORKAGE_NODMA }, 3952 { "Seagate STT20000A", NULL, ATA_HORKAGE_NODMA },
3950 { "IOMEGA ZIP 250 ATAPI", NULL, ATA_HORKAGE_NODMA }, /* temporary fix */
3951 { "IOMEGA ZIP 250 ATAPI Floppy",
3952 NULL, ATA_HORKAGE_NODMA },
3953 /* Odd clown on sil3726/4726 PMPs */ 3953 /* Odd clown on sil3726/4726 PMPs */
3954 { "Config Disk", NULL, ATA_HORKAGE_NODMA | 3954 { "Config Disk", NULL, ATA_HORKAGE_NODMA |
3955 ATA_HORKAGE_SKIP_PM }, 3955 ATA_HORKAGE_SKIP_PM },
@@ -4007,7 +4007,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
4007 { } 4007 { }
4008}; 4008};
4009 4009
4010int strn_pattern_cmp(const char *patt, const char *name, int wildchar) 4010static int strn_pattern_cmp(const char *patt, const char *name, int wildchar)
4011{ 4011{
4012 const char *p; 4012 const char *p;
4013 int len; 4013 int len;
@@ -4181,15 +4181,14 @@ static unsigned int ata_dev_set_xfermode(struct ata_device *dev)
4181 DPRINTK("EXIT, err_mask=%x\n", err_mask); 4181 DPRINTK("EXIT, err_mask=%x\n", err_mask);
4182 return err_mask; 4182 return err_mask;
4183} 4183}
4184
4185/** 4184/**
4186 * ata_dev_set_AN - Issue SET FEATURES - SATA FEATURES 4185 * ata_dev_set_feature - Issue SET FEATURES - SATA FEATURES
4187 * @dev: Device to which command will be sent 4186 * @dev: Device to which command will be sent
4188 * @enable: Whether to enable or disable the feature 4187 * @enable: Whether to enable or disable the feature
4188 * @feature: The sector count represents the feature to set
4189 * 4189 *
4190 * Issue SET FEATURES - SATA FEATURES command to device @dev 4190 * Issue SET FEATURES - SATA FEATURES command to device @dev
 4191 * on port @ap with sector count set to indicate Asynchronous 4191 * on port @ap with sector count set to @feature.
4192 * Notification feature
4193 * 4192 *
4194 * LOCKING: 4193 * LOCKING:
4195 * PCI/etc. bus probe sem. 4194 * PCI/etc. bus probe sem.
@@ -4197,7 +4196,8 @@ static unsigned int ata_dev_set_xfermode(struct ata_device *dev)
4197 * RETURNS: 4196 * RETURNS:
4198 * 0 on success, AC_ERR_* mask otherwise. 4197 * 0 on success, AC_ERR_* mask otherwise.
4199 */ 4198 */
4200static unsigned int ata_dev_set_AN(struct ata_device *dev, u8 enable) 4199static unsigned int ata_dev_set_feature(struct ata_device *dev, u8 enable,
4200 u8 feature)
4201{ 4201{
4202 struct ata_taskfile tf; 4202 struct ata_taskfile tf;
4203 unsigned int err_mask; 4203 unsigned int err_mask;
@@ -4210,7 +4210,7 @@ static unsigned int ata_dev_set_AN(struct ata_device *dev, u8 enable)
4210 tf.feature = enable; 4210 tf.feature = enable;
4211 tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE; 4211 tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
4212 tf.protocol = ATA_PROT_NODATA; 4212 tf.protocol = ATA_PROT_NODATA;
4213 tf.nsect = SATA_AN; 4213 tf.nsect = feature;
4214 4214
4215 err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0); 4215 err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
4216 4216
@@ -4689,8 +4689,8 @@ static int ata_sg_setup(struct ata_queued_cmd *qc)
4689 * data in this function or read data in ata_sg_clean. 4689 * data in this function or read data in ata_sg_clean.
4690 */ 4690 */
4691 offset = lsg->offset + lsg->length - qc->pad_len; 4691 offset = lsg->offset + lsg->length - qc->pad_len;
4692 sg_set_page(psg, nth_page(sg_page(lsg), offset >> PAGE_SHIFT)); 4692 sg_set_page(psg, nth_page(sg_page(lsg), offset >> PAGE_SHIFT),
4693 psg->offset = offset_in_page(offset); 4693 qc->pad_len, offset_in_page(offset));
4694 4694
4695 if (qc->tf.flags & ATA_TFLAG_WRITE) { 4695 if (qc->tf.flags & ATA_TFLAG_WRITE) {
4696 void *addr = kmap_atomic(sg_page(psg), KM_IRQ0); 4696 void *addr = kmap_atomic(sg_page(psg), KM_IRQ0);
@@ -6921,7 +6921,7 @@ int ata_host_activate(struct ata_host *host, int irq,
6921 * LOCKING: 6921 * LOCKING:
6922 * Kernel thread context (may sleep). 6922 * Kernel thread context (may sleep).
6923 */ 6923 */
6924void ata_port_detach(struct ata_port *ap) 6924static void ata_port_detach(struct ata_port *ap)
6925{ 6925{
6926 unsigned long flags; 6926 unsigned long flags;
6927 struct ata_link *link; 6927 struct ata_link *link;
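With the generalized ata_dev_set_feature(), the two former call sites collapse to one-liners; the subcommand travels in the taskfile's feature field and the SATA feature selector in the sector count, as the hunks above show:

	/* Spin up a drive that needs it before further IDENTIFY commands: */
	err_mask = ata_dev_set_feature(dev, SETFEATURES_SPINUP, 0);

	/* Enable Asynchronous Notification on an ATAPI device: */
	err_mask = ata_dev_set_feature(dev, SETFEATURES_SATA_ENABLE, SATA_AN);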
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 93e2b545b439..8cb35bb87605 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2071,7 +2071,7 @@ int ata_eh_reset(struct ata_link *link, int classify,
2071 int try = 0; 2071 int try = 0;
2072 struct ata_device *dev; 2072 struct ata_device *dev;
2073 unsigned long deadline; 2073 unsigned long deadline;
2074 unsigned int action; 2074 unsigned int tmp_action;
2075 ata_reset_fn_t reset; 2075 ata_reset_fn_t reset;
2076 unsigned long flags; 2076 unsigned long flags;
2077 int rc; 2077 int rc;
@@ -2086,14 +2086,14 @@ int ata_eh_reset(struct ata_link *link, int classify,
2086 /* Determine which reset to use and record in ehc->i.action. 2086 /* Determine which reset to use and record in ehc->i.action.
2087 * prereset() may examine and modify it. 2087 * prereset() may examine and modify it.
2088 */ 2088 */
2089 action = ehc->i.action;
2090 ehc->i.action &= ~ATA_EH_RESET_MASK;
2091 if (softreset && (!hardreset || (!(link->flags & ATA_LFLAG_NO_SRST) && 2089 if (softreset && (!hardreset || (!(link->flags & ATA_LFLAG_NO_SRST) &&
2092 !sata_set_spd_needed(link) && 2090 !sata_set_spd_needed(link) &&
2093 !(action & ATA_EH_HARDRESET)))) 2091 !(ehc->i.action & ATA_EH_HARDRESET))))
2094 ehc->i.action |= ATA_EH_SOFTRESET; 2092 tmp_action = ATA_EH_SOFTRESET;
2095 else 2093 else
2096 ehc->i.action |= ATA_EH_HARDRESET; 2094 tmp_action = ATA_EH_HARDRESET;
2095
2096 ehc->i.action = (ehc->i.action & ~ATA_EH_RESET_MASK) | tmp_action;
2097 2097
2098 if (prereset) { 2098 if (prereset) {
2099 rc = prereset(link, jiffies + ATA_EH_PRERESET_TIMEOUT); 2099 rc = prereset(link, jiffies + ATA_EH_PRERESET_TIMEOUT);
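The bug here: the old code cleared ATA_EH_RESET_MASK from ehc->i.action up front and then consulted a stale snapshot of it. The rewrite decides from live state and folds the result in as a single final assignment. Schematically (want_softreset stands in for the full condition above):

	tmp_action = want_softreset ? ATA_EH_SOFTRESET : ATA_EH_HARDRESET;
	ehc->i.action = (ehc->i.action & ~ATA_EH_RESET_MASK) | tmp_action;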
diff --git a/drivers/ata/pata_icside.c b/drivers/ata/pata_icside.c
index be30923566c5..842fe08a3c13 100644
--- a/drivers/ata/pata_icside.c
+++ b/drivers/ata/pata_icside.c
@@ -332,12 +332,13 @@ static void ata_dummy_noret(struct ata_port *port)
332{ 332{
333} 333}
334 334
335static void pata_icside_postreset(struct ata_port *ap, unsigned int *classes) 335static void pata_icside_postreset(struct ata_link *link, unsigned int *classes)
336{ 336{
337 struct ata_port *ap = link->ap;
337 struct pata_icside_state *state = ap->host->private_data; 338 struct pata_icside_state *state = ap->host->private_data;
338 339
339 if (classes[0] != ATA_DEV_NONE || classes[1] != ATA_DEV_NONE) 340 if (classes[0] != ATA_DEV_NONE || classes[1] != ATA_DEV_NONE)
340 return ata_std_postreset(ap, classes); 341 return ata_std_postreset(link, classes);
341 342
342 state->port[ap->port_no].disabled = 1; 343 state->port[ap->port_no].disabled = 1;
343 344
@@ -395,29 +396,30 @@ static struct ata_port_operations pata_icside_port_ops = {
395 396
396static void __devinit 397static void __devinit
397pata_icside_setup_ioaddr(struct ata_port *ap, void __iomem *base, 398pata_icside_setup_ioaddr(struct ata_port *ap, void __iomem *base,
398 const struct portinfo *info) 399 struct pata_icside_info *info,
400 const struct portinfo *port)
399{ 401{
400 struct ata_ioports *ioaddr = &ap->ioaddr; 402 struct ata_ioports *ioaddr = &ap->ioaddr;
401 void __iomem *cmd = base + info->dataoffset; 403 void __iomem *cmd = base + port->dataoffset;
402 404
403 ioaddr->cmd_addr = cmd; 405 ioaddr->cmd_addr = cmd;
404 ioaddr->data_addr = cmd + (ATA_REG_DATA << info->stepping); 406 ioaddr->data_addr = cmd + (ATA_REG_DATA << port->stepping);
405 ioaddr->error_addr = cmd + (ATA_REG_ERR << info->stepping); 407 ioaddr->error_addr = cmd + (ATA_REG_ERR << port->stepping);
406 ioaddr->feature_addr = cmd + (ATA_REG_FEATURE << info->stepping); 408 ioaddr->feature_addr = cmd + (ATA_REG_FEATURE << port->stepping);
407 ioaddr->nsect_addr = cmd + (ATA_REG_NSECT << info->stepping); 409 ioaddr->nsect_addr = cmd + (ATA_REG_NSECT << port->stepping);
408 ioaddr->lbal_addr = cmd + (ATA_REG_LBAL << info->stepping); 410 ioaddr->lbal_addr = cmd + (ATA_REG_LBAL << port->stepping);
409 ioaddr->lbam_addr = cmd + (ATA_REG_LBAM << info->stepping); 411 ioaddr->lbam_addr = cmd + (ATA_REG_LBAM << port->stepping);
410 ioaddr->lbah_addr = cmd + (ATA_REG_LBAH << info->stepping); 412 ioaddr->lbah_addr = cmd + (ATA_REG_LBAH << port->stepping);
411 ioaddr->device_addr = cmd + (ATA_REG_DEVICE << info->stepping); 413 ioaddr->device_addr = cmd + (ATA_REG_DEVICE << port->stepping);
412 ioaddr->status_addr = cmd + (ATA_REG_STATUS << info->stepping); 414 ioaddr->status_addr = cmd + (ATA_REG_STATUS << port->stepping);
413 ioaddr->command_addr = cmd + (ATA_REG_CMD << info->stepping); 415 ioaddr->command_addr = cmd + (ATA_REG_CMD << port->stepping);
414 416
415 ioaddr->ctl_addr = base + info->ctrloffset; 417 ioaddr->ctl_addr = base + port->ctrloffset;
416 ioaddr->altstatus_addr = ioaddr->ctl_addr; 418 ioaddr->altstatus_addr = ioaddr->ctl_addr;
417 419
418 ata_port_desc(ap, "cmd 0x%lx ctl 0x%lx", 420 ata_port_desc(ap, "cmd 0x%lx ctl 0x%lx",
419 info->raw_base + info->dataoffset, 421 info->raw_base + port->dataoffset,
420 info->raw_base + info->ctrloffset); 422 info->raw_base + port->ctrloffset);
421 423
422 if (info->raw_ioc_base) 424 if (info->raw_ioc_base)
423 ata_port_desc(ap, "iocbase 0x%lx", info->raw_ioc_base); 425 ata_port_desc(ap, "iocbase 0x%lx", info->raw_ioc_base);
@@ -441,7 +443,7 @@ static int __devinit pata_icside_register_v5(struct pata_icside_info *info)
441 info->nr_ports = 1; 443 info->nr_ports = 1;
442 info->port[0] = &pata_icside_portinfo_v5; 444 info->port[0] = &pata_icside_portinfo_v5;
443 445
444 info->raw_base = ecard_resource_start(ec, ECARD_RES_MEMC); 446 info->raw_base = ecard_resource_start(info->ec, ECARD_RES_MEMC);
445 447
446 return 0; 448 return 0;
447} 449}
@@ -522,7 +524,7 @@ static int __devinit pata_icside_add_ports(struct pata_icside_info *info)
522 ap->flags |= ATA_FLAG_SLAVE_POSS; 524 ap->flags |= ATA_FLAG_SLAVE_POSS;
523 ap->ops = &pata_icside_port_ops; 525 ap->ops = &pata_icside_port_ops;
524 526
525 pata_icside_setup_ioaddr(ap, info->base, info->port[i]); 527 pata_icside_setup_ioaddr(ap, info->base, info, info->port[i]);
526 } 528 }
527 529
528 return ata_host_activate(host, ec->irq, ata_interrupt, 0, 530 return ata_host_activate(host, ec->irq, ata_interrupt, 0,
diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index 2e0279fdd7aa..f1b422f7c749 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -365,9 +365,9 @@ static const struct pci_device_id nv_pci_tbl[] = {
365 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SATA2), SWNCQ }, 365 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SATA2), SWNCQ },
366 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SATA), SWNCQ }, 366 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SATA), SWNCQ },
367 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SATA2), SWNCQ }, 367 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SATA2), SWNCQ },
368 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA), SWNCQ }, 368 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA), GENERIC },
369 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA2), SWNCQ }, 369 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA2), GENERIC },
370 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA3), SWNCQ }, 370 { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_SATA3), GENERIC },
371 371
372 { } /* terminate list */ 372 { } /* terminate list */
373}; 373};
diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c
index 1b58b010797f..241167878edf 100644
--- a/drivers/block/cryptoloop.c
+++ b/drivers/block/cryptoloop.c
@@ -150,13 +150,8 @@ cryptoloop_transfer(struct loop_device *lo, int cmd,
150 u32 iv[4] = { 0, }; 150 u32 iv[4] = { 0, };
151 iv[0] = cpu_to_le32(IV & 0xffffffff); 151 iv[0] = cpu_to_le32(IV & 0xffffffff);
152 152
153 sg_set_page(&sg_in, in_page); 153 sg_set_page(&sg_in, in_page, sz, in_offs);
154 sg_in.offset = in_offs; 154 sg_set_page(&sg_out, out_page, sz, out_offs);
155 sg_in.length = sz;
156
157 sg_set_page(&sg_out, out_page);
158 sg_out.offset = out_offs;
159 sg_out.length = sz;
160 155
161 desc.info = iv; 156 desc.info = iv;
162 err = encdecfunc(&desc, &sg_out, &sg_in, sz); 157 err = encdecfunc(&desc, &sg_out, &sg_in, sz);
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 7276f7d207c2..fac4c6cd04f7 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -15,6 +15,7 @@
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/list.h> 17#include <linux/list.h>
18#include <linux/scatterlist.h>
18 19
19#include <asm/vio.h> 20#include <asm/vio.h>
20#include <asm/ldc.h> 21#include <asm/ldc.h>
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 14143f2c484d..08e909dc7944 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -1428,9 +1428,9 @@ static void ub_state_sense(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
1428 scmd->state = UB_CMDST_INIT; 1428 scmd->state = UB_CMDST_INIT;
1429 scmd->nsg = 1; 1429 scmd->nsg = 1;
1430 sg = &scmd->sgv[0]; 1430 sg = &scmd->sgv[0];
1431 sg_set_page(sg, virt_to_page(sc->top_sense)); 1431 sg_init_table(sg, UB_MAX_REQ_SG);
1432 sg->offset = (unsigned long)sc->top_sense & (PAGE_SIZE-1); 1432 sg_set_page(sg, virt_to_page(sc->top_sense), UB_SENSE_SIZE,
1433 sg->length = UB_SENSE_SIZE; 1433 (unsigned long)sc->top_sense & (PAGE_SIZE-1));
1434 scmd->len = UB_SENSE_SIZE; 1434 scmd->len = UB_SENSE_SIZE;
1435 scmd->lun = cmd->lun; 1435 scmd->lun = cmd->lun;
1436 scmd->done = ub_top_sense_done; 1436 scmd->done = ub_top_sense_done;
@@ -1864,9 +1864,8 @@ static int ub_sync_read_cap(struct ub_dev *sc, struct ub_lun *lun,
1864 cmd->state = UB_CMDST_INIT; 1864 cmd->state = UB_CMDST_INIT;
1865 cmd->nsg = 1; 1865 cmd->nsg = 1;
1866 sg = &cmd->sgv[0]; 1866 sg = &cmd->sgv[0];
1867 sg_set_page(sg, virt_to_page(p)); 1867 sg_init_table(sg, UB_MAX_REQ_SG);
1868 sg->offset = (unsigned long)p & (PAGE_SIZE-1); 1868 sg_set_page(sg, virt_to_page(p), 8, (unsigned long)p & (PAGE_SIZE-1));
1869 sg->length = 8;
1870 cmd->len = 8; 1869 cmd->len = 8;
1871 cmd->lun = lun; 1870 cmd->lun = lun;
1872 cmd->done = ub_probe_done; 1871 cmd->done = ub_probe_done;
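The added sg_init_table() calls are not cosmetic: with the new chainable scatterlists, a table must be initialized so its final entry carries the termination marker that sg_next() relies on. The idiom, with buf and len as placeholders:

	struct scatterlist sg[UB_MAX_REQ_SG];

	sg_init_table(sg, ARRAY_SIZE(sg));	/* zero + mark the last entry */
	sg_set_page(&sg[0], virt_to_page(buf), len,
		    (unsigned long)buf & (PAGE_SIZE - 1));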
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index a901eee64ba5..3cf7129d83e6 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -4,7 +4,9 @@
4#include <linux/hdreg.h> 4#include <linux/hdreg.h>
5#include <linux/virtio.h> 5#include <linux/virtio.h>
6#include <linux/virtio_blk.h> 6#include <linux/virtio_blk.h>
7#include <linux/virtio_blk.h> 7#include <linux/scatterlist.h>
8
9#define VIRTIO_MAX_SG (3+MAX_PHYS_SEGMENTS)
8 10
9static unsigned char virtblk_index = 'a'; 11static unsigned char virtblk_index = 'a';
10struct virtio_blk 12struct virtio_blk
@@ -23,7 +25,7 @@ struct virtio_blk
23 mempool_t *pool; 25 mempool_t *pool;
24 26
25 /* Scatterlist: can be too big for stack. */ 27 /* Scatterlist: can be too big for stack. */
26 struct scatterlist sg[3+MAX_PHYS_SEGMENTS]; 28 struct scatterlist sg[VIRTIO_MAX_SG];
27}; 29};
28 30
29struct virtblk_req 31struct virtblk_req
@@ -94,8 +96,8 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
94 if (blk_barrier_rq(vbr->req)) 96 if (blk_barrier_rq(vbr->req))
95 vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER; 97 vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
96 98
97 /* We have to zero this, otherwise blk_rq_map_sg gets upset. */ 99 /* This init could be done at vblk creation time */
98 memset(vblk->sg, 0, sizeof(vblk->sg)); 100 sg_init_table(vblk->sg, VIRTIO_MAX_SG);
99 sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr)); 101 sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr));
100 num = blk_rq_map_sg(q, vbr->req, vblk->sg+1); 102 num = blk_rq_map_sg(q, vbr->req, vblk->sg+1);
101 sg_set_buf(&vblk->sg[num+1], &vbr->in_hdr, sizeof(vbr->in_hdr)); 103 sg_set_buf(&vblk->sg[num+1], &vbr->in_hdr, sizeof(vbr->in_hdr));
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 880b5dce3a62..d8bb44b98a6a 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -41,9 +41,9 @@
41#include <linux/completion.h> 41#include <linux/completion.h>
42#include <linux/proc_fs.h> 42#include <linux/proc_fs.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/scatterlist.h>
44 45
45#include <asm/vio.h> 46#include <asm/vio.h>
46#include <asm/scatterlist.h>
47#include <asm/iseries/hv_types.h> 47#include <asm/iseries/hv_types.h>
48#include <asm/iseries/hv_lp_event.h> 48#include <asm/iseries/hv_lp_event.h>
49#include <asm/iseries/vio.h> 49#include <asm/iseries/vio.h>
@@ -258,6 +258,7 @@ static int send_request(struct request *req)
258 cmd = viomajorsubtype_cdio | viocdwrite; 258 cmd = viomajorsubtype_cdio | viocdwrite;
259 } 259 }
260 260
261 sg_init_table(&sg, 1);
261 if (blk_rq_map_sg(req->q, req, &sg) == 0) { 262 if (blk_rq_map_sg(req->q, req, &sg) == 0) {
262 printk(VIOCD_KERN_WARNING 263 printk(VIOCD_KERN_WARNING
263 "error setting up scatter/gather list\n"); 264 "error setting up scatter/gather list\n");
diff --git a/drivers/ieee1394/dma.c b/drivers/ieee1394/dma.c
index 3051e312fdc8..f5f4983dfbf3 100644
--- a/drivers/ieee1394/dma.c
+++ b/drivers/ieee1394/dma.c
@@ -111,8 +111,8 @@ int dma_region_alloc(struct dma_region *dma, unsigned long n_bytes,
111 unsigned long va = 111 unsigned long va =
112 (unsigned long)dma->kvirt + (i << PAGE_SHIFT); 112 (unsigned long)dma->kvirt + (i << PAGE_SHIFT);
113 113
114 sg_set_page(&dma->sglist[i], vmalloc_to_page((void *)va)); 114 sg_set_page(&dma->sglist[i], vmalloc_to_page((void *)va),
115 dma->sglist[i].length = PAGE_SIZE; 115 PAGE_SIZE, 0);
116 } 116 }
117 117
118 /* map sglist to the IOMMU */ 118 /* map sglist to the IOMMU */
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 14159ff29408..4e3128ff73c1 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -171,9 +171,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
171 if (vma_list && 171 if (vma_list &&
172 !is_vm_hugetlb_page(vma_list[i + off])) 172 !is_vm_hugetlb_page(vma_list[i + off]))
173 umem->hugetlb = 0; 173 umem->hugetlb = 0;
174 sg_set_page(&chunk->page_list[i], page_list[i + off]); 174 sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0);
175 chunk->page_list[i].offset = 0;
176 chunk->page_list[i].length = PAGE_SIZE;
177 } 175 }
178 176
179 chunk->nmap = ib_dma_map_sg(context->device, 177 chunk->nmap = ib_dma_map_sg(context->device,
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index 007b38157fc4..1f4d27d7c16d 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -113,9 +113,7 @@ static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_m
113 if (!page) 113 if (!page)
114 return -ENOMEM; 114 return -ENOMEM;
115 115
116 sg_set_page(mem, page); 116 sg_set_page(mem, page, PAGE_SIZE << order, 0);
117 mem->length = PAGE_SIZE << order;
118 mem->offset = 0;
119 return 0; 117 return 0;
120} 118}
121 119
@@ -481,9 +479,8 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
481 if (ret < 0) 479 if (ret < 0)
482 goto out; 480 goto out;
483 481
484 sg_set_page(&db_tab->page[i].mem, pages[0]); 482 sg_set_page(&db_tab->page[i].mem, pages[0], MTHCA_ICM_PAGE_SIZE,
485 db_tab->page[i].mem.length = MTHCA_ICM_PAGE_SIZE; 483 uaddr & ~PAGE_MASK);
486 db_tab->page[i].mem.offset = uaddr & ~PAGE_MASK;
487 484
488 ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); 485 ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
489 if (ret < 0) { 486 if (ret < 0) {
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 35d19ae58de7..cb4c67025d52 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -128,9 +128,12 @@ static void unmap_switcher(void)
128 __free_pages(switcher_page[i], 0); 128 __free_pages(switcher_page[i], 0);
129} 129}
130 130
131/*L:305 131/*H:032
132 * Dealing With Guest Memory. 132 * Dealing With Guest Memory.
133 * 133 *
134 * Before we go too much further into the Host, we need to grok the routines
135 * we use to deal with Guest memory.
136 *
134 * When the Guest gives us (what it thinks is) a physical address, we can use 137 * When the Guest gives us (what it thinks is) a physical address, we can use
135 * the normal copy_from_user() & copy_to_user() on the corresponding place in 138 * the normal copy_from_user() & copy_to_user() on the corresponding place in
136 * the memory region allocated by the Launcher. 139 * the memory region allocated by the Launcher.
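The helpers this comment introduces amount to a bounds check plus an ordinary user-copy, since a Guest "physical" address is just an offset into the Launcher's mapping. A sketch following the shape of the core.c helpers (lg->mem_base is where that mapping starts):

	void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
	{
		if (!lguest_address_ok(lg, addr, bytes)
		    || copy_from_user(b, lg->mem_base + addr, bytes) != 0) {
			/* Zero the buffer so the caller never sees stale
			 * kernel memory, then terminate the naughty Guest. */
			memset(b, 0, bytes);
			kill_guest(lg, "bad read address %#lx len %u",
				   addr, bytes);
		}
	}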
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 9d5184c7c14a..b478affe8f91 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -90,6 +90,7 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args)
90 lg->pending_notify = args->arg1; 90 lg->pending_notify = args->arg1;
91 break; 91 break;
92 default: 92 default:
93 /* It should be an architecture-specific hypercall. */
93 if (lguest_arch_do_hcall(lg, args)) 94 if (lguest_arch_do_hcall(lg, args))
94 kill_guest(lg, "Bad hypercall %li\n", args->arg0); 95 kill_guest(lg, "Bad hypercall %li\n", args->arg0);
95 } 96 }
@@ -157,7 +158,6 @@ static void do_async_hcalls(struct lguest *lg)
157 * Guest makes a hypercall, we end up here to set things up: */ 158 * Guest makes a hypercall, we end up here to set things up: */
158static void initialize(struct lguest *lg) 159static void initialize(struct lguest *lg)
159{ 160{
160
161 /* You can't do anything until you're initialized. The Guest knows the 161 /* You can't do anything until you're initialized. The Guest knows the
162 * rules, so we're unforgiving here. */ 162 * rules, so we're unforgiving here. */
163 if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { 163 if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) {
@@ -174,7 +174,8 @@ static void initialize(struct lguest *lg)
174 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) 174 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
175 kill_guest(lg, "bad guest page %p", lg->lguest_data); 175 kill_guest(lg, "bad guest page %p", lg->lguest_data);
176 176
177 /* We write the current time into the Guest's data page once now. */ 177 /* We write the current time into the Guest's data page once so it can
178 * set its clock. */
178 write_timestamp(lg); 179 write_timestamp(lg);
179 180
180 /* page_tables.c will also do some setup. */ 181 /* page_tables.c will also do some setup. */
@@ -182,8 +183,8 @@ static void initialize(struct lguest *lg)
182 183
183 /* This is the one case where the above accesses might have been the 184 /* This is the one case where the above accesses might have been the
184 * first write to a Guest page. This may have caused a copy-on-write 185 * first write to a Guest page. This may have caused a copy-on-write
 185 * fault, but the Guest might be referring to the old (read-only) 186 * fault, but the old (read-only) page might still be in the Guest
186 * page. */ 187 * pagetable. */
187 guest_pagetable_clear_all(lg); 188 guest_pagetable_clear_all(lg);
188} 189}
189 190
@@ -220,7 +221,7 @@ void do_hypercalls(struct lguest *lg)
220 * Normally it doesn't matter: the Guest will run again and 221 * Normally it doesn't matter: the Guest will run again and
221 * update the trap number before we come back here. 222 * update the trap number before we come back here.
222 * 223 *
223 * However, if we are signalled or the Guest sends DMA to the 224 * However, if we are signalled or the Guest sends I/O to the
224 * Launcher, the run_guest() loop will exit without running the 225 * Launcher, the run_guest() loop will exit without running the
225 * Guest. When it comes back it would try to re-run the 226 * Guest. When it comes back it would try to re-run the
226 * hypercall. */ 227 * hypercall. */
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 82966982cb38..2b66f79c208b 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -92,8 +92,8 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
92 92
93 /* Remember that we never let the Guest actually disable interrupts, so 93 /* Remember that we never let the Guest actually disable interrupts, so
94 * the "Interrupt Flag" bit is always set. We copy that bit from the 94 * the "Interrupt Flag" bit is always set. We copy that bit from the
95 * Guest's "irq_enabled" field into the eflags word: the Guest copies 95 * Guest's "irq_enabled" field into the eflags word: we saw the Guest
96 * it back in "lguest_iret". */ 96 * copy it back in "lguest_iret". */
97 eflags = lg->regs->eflags; 97 eflags = lg->regs->eflags;
98 if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 98 if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0
99 && !(irq_enable & X86_EFLAGS_IF)) 99 && !(irq_enable & X86_EFLAGS_IF))
@@ -124,7 +124,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
124 kill_guest(lg, "Disabling interrupts"); 124 kill_guest(lg, "Disabling interrupts");
125} 125}
126 126
127/*H:200 127/*H:205
128 * Virtual Interrupts. 128 * Virtual Interrupts.
129 * 129 *
130 * maybe_do_interrupt() gets called before every entry to the Guest, to see if 130 * maybe_do_interrupt() gets called before every entry to the Guest, to see if
@@ -256,19 +256,21 @@ int deliver_trap(struct lguest *lg, unsigned int num)
256 * bogus one in): if we fail here, the Guest will be killed. */ 256 * bogus one in): if we fail here, the Guest will be killed. */
257 if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) 257 if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b))
258 return 0; 258 return 0;
259 set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, has_err(num)); 259 set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b,
260 has_err(num));
260 return 1; 261 return 1;
261} 262}
262 263
263/*H:250 Here's the hard part: returning to the Host every time a trap happens 264/*H:250 Here's the hard part: returning to the Host every time a trap happens
264 * and then calling deliver_trap() and re-entering the Guest is slow. 265 * and then calling deliver_trap() and re-entering the Guest is slow.
265 * Particularly because Guest userspace system calls are traps (trap 128). 266 * Particularly because Guest userspace system calls are traps (usually trap
267 * 128).
266 * 268 *
267 * So we'd like to set up the IDT to tell the CPU to deliver traps directly 269 * So we'd like to set up the IDT to tell the CPU to deliver traps directly
268 * into the Guest. This is possible, but the complexities cause the size of 270 * into the Guest. This is possible, but the complexities cause the size of
269 * this file to double! However, 150 lines of code is worth writing for taking 271 * this file to double! However, 150 lines of code is worth writing for taking
270 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all 272 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all
271 * the other hypervisors would tease it. 273 * the other hypervisors would beat it up at lunchtime.
272 * 274 *
273 * This routine indicates if a particular trap number could be delivered 275 * This routine indicates if a particular trap number could be delivered
274 * directly. */ 276 * directly. */
@@ -331,7 +333,7 @@ void pin_stack_pages(struct lguest *lg)
331 * change stacks on each context switch. */ 333 * change stacks on each context switch. */
332void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) 334void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
333{ 335{
 334 /* You are not allowd have a stack segment with privilege level 0: bad 336 /* You are not allowed to have a stack segment with privilege level 0: bad
335 * Guest! */ 337 * Guest! */
336 if ((seg & 0x3) != GUEST_PL) 338 if ((seg & 0x3) != GUEST_PL)
337 kill_guest(lg, "bad stack segment %i", seg); 339 kill_guest(lg, "bad stack segment %i", seg);
@@ -350,7 +352,7 @@ void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
350 * part of the Host: page table handling. */ 352 * part of the Host: page table handling. */
351 353
352/*H:235 This is the routine which actually checks the Guest's IDT entry and 354/*H:235 This is the routine which actually checks the Guest's IDT entry and
353 * transfers it into our entry in "struct lguest": */ 355 * transfers it into the entry in "struct lguest": */
354static void set_trap(struct lguest *lg, struct desc_struct *trap, 356static void set_trap(struct lguest *lg, struct desc_struct *trap,
355 unsigned int num, u32 lo, u32 hi) 357 unsigned int num, u32 lo, u32 hi)
356{ 358{
@@ -456,6 +458,18 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
456 } 458 }
457} 459}
458 460
461/*H:200
462 * The Guest Clock.
463 *
464 * There are two sources of virtual interrupts. We saw one in lguest_user.c:
465 * the Launcher sending interrupts for virtual devices. The other is the Guest
466 * timer interrupt.
467 *
468 * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to
469 * the next timer interrupt (in nanoseconds). We use the high-resolution timer
470 * infrastructure to set a callback at that time.
471 *
472 * 0 means "turn off the clock". */
459void guest_set_clockevent(struct lguest *lg, unsigned long delta) 473void guest_set_clockevent(struct lguest *lg, unsigned long delta)
460{ 474{
461 ktime_t expires; 475 ktime_t expires;
@@ -466,20 +480,27 @@ void guest_set_clockevent(struct lguest *lg, unsigned long delta)
466 return; 480 return;
467 } 481 }
468 482
483 /* We use wallclock time here, so the Guest might not be running for
484 * all the time between now and the timer interrupt it asked for. This
485 * is almost always the right thing to do. */
469 expires = ktime_add_ns(ktime_get_real(), delta); 486 expires = ktime_add_ns(ktime_get_real(), delta);
470 hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); 487 hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS);
471} 488}
472 489
490/* This is the function called when the Guest's timer expires. */
473static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) 491static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
474{ 492{
475 struct lguest *lg = container_of(timer, struct lguest, hrt); 493 struct lguest *lg = container_of(timer, struct lguest, hrt);
476 494
495 /* Remember the first interrupt is the timer interrupt. */
477 set_bit(0, lg->irqs_pending); 496 set_bit(0, lg->irqs_pending);
497 /* If the Guest is actually stopped, we need to wake it up. */
478 if (lg->halted) 498 if (lg->halted)
479 wake_up_process(lg->tsk); 499 wake_up_process(lg->tsk);
480 return HRTIMER_NORESTART; 500 return HRTIMER_NORESTART;
481} 501}
482 502
503/* This sets up the timer for this Guest. */
483void init_clockdev(struct lguest *lg) 504void init_clockdev(struct lguest *lg)
484{ 505{
485 hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); 506 hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
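(As a sketch of the Guest's end of this interface: its clockevent driver
programs the next tick with the same hypercall.  The wrapper below is
illustrative rather than the exact code in arch/x86/lguest/boot.c; it assumes
the hcall() helper and LHCALL_SET_CLOCKEVENT from the lguest headers, and a
clockevent device configured so "delta" is in nanoseconds.)

static int example_set_next_event(unsigned long delta,
				  struct clock_event_device *evt)
{
	/* Hand the nanosecond delta to the Host; 0 would mean "off". */
	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
	return 0;
}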
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index d9144beca82c..86924891b5eb 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -74,9 +74,6 @@ struct lguest
74 u32 pgdidx; 74 u32 pgdidx;
75 struct pgdir pgdirs[4]; 75 struct pgdir pgdirs[4];
76 76
77 /* Cached wakeup: we hold a reference to this task. */
78 struct task_struct *wake;
79
80 unsigned long noirq_start, noirq_end; 77 unsigned long noirq_start, noirq_end;
81 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ 78 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
82 79
@@ -103,7 +100,7 @@ int lguest_address_ok(const struct lguest *lg,
103void __lgread(struct lguest *, void *, unsigned long, unsigned); 100void __lgread(struct lguest *, void *, unsigned long, unsigned);
104void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); 101void __lgwrite(struct lguest *, unsigned long, const void *, unsigned);
105 102
106/*L:306 Using memory-copy operations like that is usually inconvient, so we 103/*H:035 Using memory-copy operations like that is usually inconvenient, so we
107 * have the following helper macros which read and write a specific type (often 104 * have the following helper macros which read and write a specific type (often
108 * an unsigned long). 105 * an unsigned long).
109 * 106 *
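(To make the shape concrete, here is a minimal sketch of such typed wrappers;
the real macros live in this same header, and the _example suffix marks these
as illustration only.  A GCC statement expression reads sizeof(type) bytes
from the Guest address and yields the value:)

#define lgread_example(lg, addr, type)					\
	({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; })

#define lgwrite_example(lg, addr, type, val)				\
	do {								\
		typeof(val) _v = (val);					\
		__lgwrite((lg), (addr), &_v, sizeof(_v));		\
	} while (0)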
@@ -191,7 +188,7 @@ void write_timestamp(struct lguest *lg);
191 * Let's step aside for the moment, to study one important routine that's used 188 * Let's step aside for the moment, to study one important routine that's used
192 * widely in the Host code. 189 * widely in the Host code.
193 * 190 *
194 * There are many cases where the Guest does something invalid, like pass crap 191 * There are many cases where the Guest can do something invalid, like pass crap
195 * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite 192 * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite
196 * acceptable to simply terminate the Guest and give the Launcher a nicely 193 * acceptable to simply terminate the Guest and give the Launcher a nicely
197 * formatted reason. It's also simpler for the Guest itself, which doesn't 194 * formatted reason. It's also simpler for the Guest itself, which doesn't
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 71c64837b437..8904f72f97c6 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -53,7 +53,8 @@ struct lguest_device {
53 * Device configurations 53 * Device configurations
54 * 54 *
55 * The configuration information for a device consists of a series of fields. 55 * The configuration information for a device consists of a series of fields.
56 * The device will look for these fields during setup. 56 * We don't really care what they are: the Launcher set them up, and the driver
57 * will look at them during setup.
57 * 58 *
58 * For us these fields come immediately after that device's descriptor in the 59 * For us these fields come immediately after that device's descriptor in the
59 * lguest_devices page. 60 * lguest_devices page.
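(A hedged sketch of the address arithmetic that comment implies; the helper
name is invented, and the real accessors in this file know about the other
per-device data sharing the page:)

static void *example_device_fields(struct lguest_device_desc *desc)
{
	/* The fields begin right past the descriptor itself. */
	return (void *)(desc + 1);
}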
@@ -122,8 +123,8 @@ static void lg_set_status(struct virtio_device *vdev, u8 status)
122 * The other piece of infrastructure virtio needs is a "virtqueue": a way of 123 * The other piece of infrastructure virtio needs is a "virtqueue": a way of
123 * the Guest device registering buffers for the other side to read from or 124 * the Guest device registering buffers for the other side to read from or
124 * write into (ie. send and receive buffers). Each device can have multiple 125 * write into (ie. send and receive buffers). Each device can have multiple
125 * virtqueues: for example the console has one queue for sending and one for 126 * virtqueues: for example the console driver uses one queue for sending and
126 * receiving. 127 * another for receiving.
127 * 128 *
128 * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue 129 * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
129 * already exists in virtio_ring.c. We just need to connect it up. 130 * already exists in virtio_ring.c. We just need to connect it up.
@@ -158,7 +159,7 @@ static void lg_notify(struct virtqueue *vq)
158 * 159 *
159 * This is kind of an ugly duckling. It'd be nicer to have a standard 160 * This is kind of an ugly duckling. It'd be nicer to have a standard
160 * representation of a virtqueue in the configuration space, but it seems that 161 * representation of a virtqueue in the configuration space, but it seems that
161 * everyone wants to do it differently. The KVM guys want the Guest to 162 * everyone wants to do it differently. The KVM coders want the Guest to
162 * allocate its own pages and tell the Host where they are, but for lguest it's 163 * allocate its own pages and tell the Host where they are, but for lguest it's
163 * simpler for the Host to simply tell us where the pages are. 164 * simpler for the Host to simply tell us where the pages are.
164 * 165 *
@@ -284,6 +285,8 @@ static void add_lguest_device(struct lguest_device_desc *d)
284{ 285{
285 struct lguest_device *ldev; 286 struct lguest_device *ldev;
286 287
288 /* Start with zeroed memory; Linux's device layer seems to count on
289 * it. */
287 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); 290 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
288 if (!ldev) { 291 if (!ldev) {
289 printk(KERN_EMERG "Cannot allocate lguest dev %u\n", 292 printk(KERN_EMERG "Cannot allocate lguest dev %u\n",
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ee405b38383d..9d716fa42cad 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -8,20 +8,22 @@
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include "lg.h" 9#include "lg.h"
10 10
11/*L:315 To force the Guest to stop running and return to the Launcher, the 11/*L:055 When something happens, the Waker process needs a way to stop the
12 * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The 12 * kernel running the Guest and return to the Launcher. So the Waker writes
13 * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ 13 * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher
14 * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
15 * the Waker. */
14static int break_guest_out(struct lguest *lg, const unsigned long __user *input) 16static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
15{ 17{
16 unsigned long on; 18 unsigned long on;
17 19
18 /* Fetch whether they're turning break on or off.. */ 20 /* Fetch whether they're turning break on or off. */
19 if (get_user(on, input) != 0) 21 if (get_user(on, input) != 0)
20 return -EFAULT; 22 return -EFAULT;
21 23
22 if (on) { 24 if (on) {
23 lg->break_out = 1; 25 lg->break_out = 1;
24 /* Pop it out (may be running on different CPU) */ 26 /* Pop it out of the Guest (it may be running on a different CPU) */
25 wake_up_process(lg->tsk); 27 wake_up_process(lg->tsk);
26 /* Wait for them to reset it */ 28 /* Wait for them to reset it */
27 return wait_event_interruptible(lg->break_wq, !lg->break_out); 29 return wait_event_interruptible(lg->break_wq, !lg->break_out);
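(Seen from userspace, the Waker's half of this dance might look like the
sketch below; error handling is elided, "fd" is the open /dev/lguest, and
LHREQ_BREAK comes from linux/lguest_launcher.h:)

static void example_break(int fd, unsigned long on)
{
	unsigned long args[2] = { LHREQ_BREAK, on };

	/* "1" stops the Guest and wakes the Launcher; "0" releases us. */
	write(fd, args, sizeof(args));
}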
@@ -58,7 +60,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
58 if (!lg) 60 if (!lg)
59 return -EINVAL; 61 return -EINVAL;
60 62
61 /* If you're not the task which owns the guest, go away. */ 63 /* If you're not the task which owns the Guest, go away. */
62 if (current != lg->tsk) 64 if (current != lg->tsk)
63 return -EPERM; 65 return -EPERM;
64 66
@@ -92,8 +94,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
92 * base: The start of the Guest-physical memory inside the Launcher memory. 94 * base: The start of the Guest-physical memory inside the Launcher memory.
93 * 95 *
94 * pfnlimit: The highest (Guest-physical) page number the Guest should be 96 * pfnlimit: The highest (Guest-physical) page number the Guest should be
95 * allowed to access. The Launcher has to live in Guest memory, so it sets 97 * allowed to access. The Guest memory lives inside the Launcher, so it sets
96 * this to ensure the Guest can't reach it. 98 * this to ensure the Guest can only reach its own memory.
97 * 99 *
98 * pgdir: The (Guest-physical) address of the top of the initial Guest 100 * pgdir: The (Guest-physical) address of the top of the initial Guest
99 * pagetables (which are set up by the Launcher). 101 * pagetables (which are set up by the Launcher).
@@ -189,7 +191,7 @@ unlock:
189} 191}
190 192
191/*L:010 The first operation the Launcher does must be a write. All writes 193/*L:010 The first operation the Launcher does must be a write. All writes
192 * start with a 32 bit number: for the first write this must be 194 * start with an unsigned long number: for the first write this must be
193 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 195 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
194 * writes of other values to send interrupts. */ 196 * writes of other values to send interrupts. */
195static ssize_t write(struct file *file, const char __user *in, 197static ssize_t write(struct file *file, const char __user *in,
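(A hedged Launcher-side sketch of that rule: the very first write carries
LHREQ_INITIALIZE and the arguments described above.  The argument order here
(base, pfnlimit, pgdir, start) is taken on trust, so check
linux/lguest_launcher.h before copying it:)

unsigned long args[] = { LHREQ_INITIALIZE, (unsigned long)guest_base,
			 pfnlimit, pgdir, start };

if (write(lguest_fd, args, sizeof(args)) != sizeof(args))
	err(1, "Writing LHREQ_INITIALIZE");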
@@ -275,8 +277,7 @@ static int close(struct inode *inode, struct file *file)
275 * The Launcher is the Host userspace program which sets up, runs and services 277 * The Launcher is the Host userspace program which sets up, runs and services
276 * the Guest. In fact, many comments in the Drivers which refer to "the Host" 278 * the Guest. In fact, many comments in the Drivers which refer to "the Host"
277 * doing things are inaccurate: the Launcher does all the device handling for 279 * doing things are inaccurate: the Launcher does all the device handling for
278 * the Guest. The Guest can't tell what's done by the the Launcher and what by 280 * the Guest, but the Guest can't know that.
279 * the Host.
280 * 281 *
281 * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we 282 * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
282 * shall see more of that later. 283 * shall see more of that later.
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 2a45f0691c9b..fffabb327157 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -26,7 +26,8 @@
26 * 26 *
27 * We use two-level page tables for the Guest. If you're not entirely 27 * We use two-level page tables for the Guest. If you're not entirely
28 * comfortable with virtual addresses, physical addresses and page tables then 28 * comfortable with virtual addresses, physical addresses and page tables then
29 * I recommend you review lguest.c's "Page Table Handling" (with diagrams!). 29 * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with
30 * diagrams!).
30 * 31 *
31 * The Guest keeps page tables, but we maintain the actual ones here: these are 32 * The Guest keeps page tables, but we maintain the actual ones here: these are
32 * called "shadow" page tables. Which is a very Guest-centric name: these are 33 * called "shadow" page tables. Which is a very Guest-centric name: these are
@@ -36,11 +37,11 @@
36 * 37 *
37 * Anyway, this is the most complicated part of the Host code. There are seven 38 * Anyway, this is the most complicated part of the Host code. There are seven
38 * parts to this: 39 * parts to this:
39 * (i) Setting up a page table entry for the Guest when it faults, 40 * (i) Looking up a page table entry when the Guest faults,
40 * (ii) Setting up the page table entry for the Guest stack, 41 * (ii) Making sure the Guest stack is mapped,
41 * (iii) Setting up a page table entry when the Guest tells us it has changed, 42 * (iii) Setting up a page table entry when the Guest tells us one has changed,
42 * (iv) Switching page tables, 43 * (iv) Switching page tables,
43 * (v) Flushing (thowing away) page tables, 44 * (v) Flushing (throwing away) page tables,
44 * (vi) Mapping the Switcher when the Guest is about to run, 45 * (vi) Mapping the Switcher when the Guest is about to run,
45 * (vii) Setting up the page tables initially. 46 * (vii) Setting up the page tables initially.
46 :*/ 47 :*/
@@ -57,16 +58,15 @@
57static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); 58static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
58#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 59#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
59 60
60/*H:320 With our shadow and Guest types established, we need to deal with 61/*H:320 The page table code is curly enough to need helper functions to keep it
61 * them: the page table code is curly enough to need helper functions to keep 62 * clear and clean.
62 * it clear and clean.
63 * 63 *
64 * There are two functions which return pointers to the shadow (aka "real") 64 * There are two functions which return pointers to the shadow (aka "real")
65 * page tables. 65 * page tables.
66 * 66 *
67 * spgd_addr() takes the virtual address and returns a pointer to the top-level 67 * spgd_addr() takes the virtual address and returns a pointer to the top-level
68 * page directory entry for that address. Since we keep track of several page 68 * page directory entry (PGD) for that address. Since we keep track of several
69 * tables, the "i" argument tells us which one we're interested in (it's 69 * page tables, the "i" argument tells us which one we're interested in (it's
70 * usually the current one). */ 70 * usually the current one). */
71static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) 71static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
72{ 72{
@@ -81,9 +81,9 @@ static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
81 return &lg->pgdirs[i].pgdir[index]; 81 return &lg->pgdirs[i].pgdir[index];
82} 82}
83 83
84/* This routine then takes the PGD entry given above, which contains the 84/* This routine then takes the page directory entry returned above, which
85 * address of the PTE page. It then returns a pointer to the PTE entry for the 85 * contains the address of the page table entry (PTE) page. It then returns a
86 * given address. */ 86 * pointer to the PTE entry for the given address. */
87static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) 87static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
88{ 88{
89 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 89 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
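(The arithmetic under both helpers is standard i386 two-level paging with 4K
pages and 1024-entry tables; a sketch of the index extraction:)

static unsigned int example_pgd_index(unsigned long vaddr)
{
	return vaddr >> 22;		/* top 10 bits pick the PGD slot */
}

static unsigned int example_pte_index(unsigned long vaddr)
{
	return (vaddr >> 12) & 1023;	/* next 10 bits pick the PTE */
}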
@@ -191,7 +191,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd)
191} 191}
192 192
193/*H:330 193/*H:330
194 * (i) Setting up a page table entry for the Guest when it faults 194 * (i) Looking up a page table entry when the Guest faults.
195 * 195 *
196 * We saw this call in run_guest(): when we see a page fault in the Guest, we 196 * We saw this call in run_guest(): when we see a page fault in the Guest, we
197 * come here. That's because we only set up the shadow page tables lazily as 197 * come here. That's because we only set up the shadow page tables lazily as
@@ -199,7 +199,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd)
199 * and return to the Guest without it knowing. 199 * and return to the Guest without it knowing.
200 * 200 *
201 * If we fixed up the fault (ie. we mapped the address), this routine returns 201 * If we fixed up the fault (ie. we mapped the address), this routine returns
202 * true. */ 202 * true. Otherwise, it was a real fault and we need to tell the Guest. */
203int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) 203int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
204{ 204{
205 pgd_t gpgd; 205 pgd_t gpgd;
@@ -246,16 +246,16 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
246 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 246 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
247 return 0; 247 return 0;
248 248
249 /* User access to a kernel page? (bit 3 == user access) */ 249 /* User access to a kernel-only page? (bit 3 == user access) */
250 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 250 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
251 return 0; 251 return 0;
252 252
253 /* Check that the Guest PTE flags are OK, and the page number is below 253 /* Check that the Guest PTE flags are OK, and the page number is below
254 * the pfn_limit (ie. not mapping the Launcher binary). */ 254 * the pfn_limit (ie. not mapping the Launcher binary). */
255 check_gpte(lg, gpte); 255 check_gpte(lg, gpte);
256
256 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 257 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
257 gpte = pte_mkyoung(gpte); 258 gpte = pte_mkyoung(gpte);
258
259 if (errcode & 2) 259 if (errcode & 2)
260 gpte = pte_mkdirty(gpte); 260 gpte = pte_mkdirty(gpte);
261 261
@@ -272,23 +272,28 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
272 else 272 else
273 /* If this is a read, don't set the "writable" bit in the page 273 /* If this is a read, don't set the "writable" bit in the page
274 * table entry, even if the Guest says it's writable. That way 274 * table entry, even if the Guest says it's writable. That way
275 * we come back here when a write does actually ocur, so we can 275 * we will come back here when a write does actually occur, so
276 * update the Guest's _PAGE_DIRTY flag. */ 276 * we can update the Guest's _PAGE_DIRTY flag. */
277 *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); 277 *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);
278 278
279 /* Finally, we write the Guest PTE entry back: we've set the 279 /* Finally, we write the Guest PTE entry back: we've set the
280 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 280 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
281 lgwrite(lg, gpte_ptr, pte_t, gpte); 281 lgwrite(lg, gpte_ptr, pte_t, gpte);
282 282
283 /* We succeeded in mapping the page! */ 283 /* The fault is fixed, the page table is populated, the mapping
284 * manipulated, the result returned and the code complete. A small
285 * delay and a trace of alliteration are the only indications the Guest
286 * has that a page fault occurred at all. */
284 return 1; 287 return 1;
285} 288}
286 289
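(For reference, those errcode tests follow the x86 page fault error code
layout; a sketch of the decoding, counting bits from zero:)

static int example_fault_was_write(int errcode)
{
	return errcode & 2;	/* bit 1 set: a write, not a read */
}

static int example_fault_was_user(int errcode)
{
	return errcode & 4;	/* bit 2 set: userspace, not kernel */
}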
287/*H:360 (ii) Setting up the page table entry for the Guest stack. 290/*H:360
291 * (ii) Making sure the Guest stack is mapped.
288 * 292 *
289 * Remember pin_stack_pages() which makes sure the stack is mapped? It could 293 * Remember that direct traps into the Guest need a mapped Guest kernel stack.
290 * simply call demand_page(), but as we've seen that logic is quite long, and 294 * pin_stack_pages() calls us here: we could simply call demand_page(), but as
291 * usually the stack pages are already mapped anyway, so it's not required. 295 * we've seen that logic is quite long, and usually the stack pages are already
296 * mapped, so it's overkill.
292 * 297 *
293 * This is a quick version which answers the question: is this virtual address 298 * This is a quick version which answers the question: is this virtual address
294 * mapped by the shadow page tables, and is it writable? */ 299 * mapped by the shadow page tables, and is it writable? */
@@ -297,7 +302,7 @@ static int page_writable(struct lguest *lg, unsigned long vaddr)
297 pgd_t *spgd; 302 pgd_t *spgd;
298 unsigned long flags; 303 unsigned long flags;
299 304
300 /* Look at the top level entry: is it present? */ 305 /* Look at the current top level entry: is it present? */
301 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 306 spgd = spgd_addr(lg, lg->pgdidx, vaddr);
302 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) 307 if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
303 return 0; 308 return 0;
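(The hunk cuts the function short; sketched from the description above, the
remainder fetches the PTE and insists it is both present and writable:)

	flags = pte_flags(*(spte_addr(lg, *spgd, vaddr)));

	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}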
@@ -333,15 +338,14 @@ static void release_pgd(struct lguest *lg, pgd_t *spgd)
333 release_pte(ptepage[i]); 338 release_pte(ptepage[i]);
334 /* Now we can free the page of PTEs */ 339 /* Now we can free the page of PTEs */
335 free_page((long)ptepage); 340 free_page((long)ptepage);
336 /* And zero out the PGD entry we we never release it twice. */ 341 /* And zero out the PGD entry so we never release it twice. */
337 *spgd = __pgd(0); 342 *spgd = __pgd(0);
338 } 343 }
339} 344}
340 345
341/*H:440 (v) Flushing (thowing away) page tables, 346/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()
342 * 347 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
343 * We saw flush_user_mappings() called when we re-used a top-level pgdir page. 348 * It simply releases every PTE page from 0 up to the Guest's kernel address. */
344 * It simply releases every PTE page from 0 up to the kernel address. */
345static void flush_user_mappings(struct lguest *lg, int idx) 349static void flush_user_mappings(struct lguest *lg, int idx)
346{ 350{
347 unsigned int i; 351 unsigned int i;
@@ -350,8 +354,10 @@ static void flush_user_mappings(struct lguest *lg, int idx)
350 release_pgd(lg, lg->pgdirs[idx].pgdir + i); 354 release_pgd(lg, lg->pgdirs[idx].pgdir + i);
351} 355}
352 356
353/* The Guest also has a hypercall to do this manually: it's used when a large 357/*H:440 (v) Flushing (throwing away) page tables,
354 * number of mappings have been changed. */ 358 *
359 * The Guest has a hypercall to throw away the page tables: it's used when a
360 * large number of mappings have been changed. */
355void guest_pagetable_flush_user(struct lguest *lg) 361void guest_pagetable_flush_user(struct lguest *lg)
356{ 362{
357 /* Drop the userspace part of the current page table. */ 363 /* Drop the userspace part of the current page table. */
@@ -423,8 +429,9 @@ static unsigned int new_pgdir(struct lguest *lg,
423 429
424/*H:430 (iv) Switching page tables 430/*H:430 (iv) Switching page tables
425 * 431 *
426 * This is what happens when the Guest changes page tables (ie. changes the 432 * Now we've seen all the page table setting and manipulation, let's see
427 * top-level pgdir). This happens on almost every context switch. */ 433 * what happens when the Guest changes page tables (ie. changes the top-level
434 * pgdir). This occurs on almost every context switch. */
428void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) 435void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
429{ 436{
430 int newpgdir, repin = 0; 437 int newpgdir, repin = 0;
@@ -443,7 +450,8 @@ void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
443} 450}
444 451
445/*H:470 Finally, a routine which throws away everything: all PGD entries in all 452/*H:470 Finally, a routine which throws away everything: all PGD entries in all
446 * the shadow page tables. This is used when we destroy the Guest. */ 453 * the shadow page tables, including the Guest's kernel mappings. This is used
454 * when we destroy the Guest. */
447static void release_all_pagetables(struct lguest *lg) 455static void release_all_pagetables(struct lguest *lg)
448{ 456{
449 unsigned int i, j; 457 unsigned int i, j;
@@ -458,13 +466,22 @@ static void release_all_pagetables(struct lguest *lg)
458 466
459/* We also throw away everything when a Guest tells us it's changed a kernel 467/* We also throw away everything when a Guest tells us it's changed a kernel
460 * mapping. Since kernel mappings are in every page table, it's easiest to 468 * mapping. Since kernel mappings are in every page table, it's easiest to
461 * throw them all away. This is amazingly slow, but thankfully rare. */ 469 * throw them all away. This traps the Guest in amber for a while as
470 * everything faults back in, but it's rare. */
462void guest_pagetable_clear_all(struct lguest *lg) 471void guest_pagetable_clear_all(struct lguest *lg)
463{ 472{
464 release_all_pagetables(lg); 473 release_all_pagetables(lg);
465 /* We need the Guest kernel stack mapped again. */ 474 /* We need the Guest kernel stack mapped again. */
466 pin_stack_pages(lg); 475 pin_stack_pages(lg);
467} 476}
477/*:*/
478/*M:009 Since we throw away all mappings when a kernel mapping changes, our
479 * performance sucks for Guests using highmem. In fact, a Guest with
480 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
481 * usually slower than a Guest with less memory.
482 *
483 * This, of course, cannot be fixed. It would take some kind of... well, I
484 * don't know, but the term "puissant code-fu" comes to mind. :*/
468 485
469/*H:420 This is the routine which actually sets the page table entry for the 486/*H:420 This is the routine which actually sets the page table entry for the
470 * "idx"'th shadow page table. 487 * "idx"'th shadow page table.
@@ -483,7 +500,7 @@ void guest_pagetable_clear_all(struct lguest *lg)
483static void do_set_pte(struct lguest *lg, int idx, 500static void do_set_pte(struct lguest *lg, int idx,
484 unsigned long vaddr, pte_t gpte) 501 unsigned long vaddr, pte_t gpte)
485{ 502{
486 /* Look up the matching shadow page directot entry. */ 503 /* Look up the matching shadow page directory entry. */
487 pgd_t *spgd = spgd_addr(lg, idx, vaddr); 504 pgd_t *spgd = spgd_addr(lg, idx, vaddr);
488 505
489 /* If the top level isn't present, there's no entry to update. */ 506 /* If the top level isn't present, there's no entry to update. */
@@ -500,7 +517,8 @@ static void do_set_pte(struct lguest *lg, int idx,
500 *spte = gpte_to_spte(lg, gpte, 517 *spte = gpte_to_spte(lg, gpte,
501 pte_flags(gpte) & _PAGE_DIRTY); 518 pte_flags(gpte) & _PAGE_DIRTY);
502 } else 519 } else
503 /* Otherwise we can demand_page() it in later. */ 520 /* Otherwise kill it and we can demand_page() it in
521 * later. */
504 *spte = __pte(0); 522 *spte = __pte(0);
505 } 523 }
506} 524}
@@ -535,7 +553,7 @@ void guest_set_pte(struct lguest *lg,
535} 553}
536 554
537/*H:400 555/*H:400
538 * (iii) Setting up a page table entry when the Guest tells us it has changed. 556 * (iii) Setting up a page table entry when the Guest tells us one has changed.
539 * 557 *
540 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal 558 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
541 * with the other side of page tables while we're here: what happens when the 559 * with the other side of page tables while we're here: what happens when the
@@ -612,9 +630,10 @@ void free_guest_pagetable(struct lguest *lg)
612 630
613/*H:480 (vi) Mapping the Switcher when the Guest is about to run. 631/*H:480 (vi) Mapping the Switcher when the Guest is about to run.
614 * 632 *
615 * The Switcher and the two pages for this CPU need to be available to the 633 * The Switcher and the two pages for this CPU need to be visible in the
616 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages 634 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
617 * for each CPU already set up, we just need to hook them in. */ 635 * for each CPU already set up, we just need to hook them in now that we know
636 * which Guest is about to run on this CPU. */
618void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) 637void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
619{ 638{
620 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 639 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
@@ -677,6 +696,18 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
677 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); 696 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
678} 697}
679 698
699/* We've made it through the page table code. Perhaps our tired brains are
700 * still processing the details, or perhaps we're simply glad it's over.
701 *
702 * If nothing else, note that all this complexity in juggling shadow page
703 * tables in sync with the Guest's page tables is for one reason: for most
704 * Guests this page table dance determines how bad performance will be. This
705 * is why Xen uses exotic direct Guest pagetable manipulation, and why both
706 * Intel and AMD have implemented shadow page table support directly into
707 * hardware.
708 *
709 * There is just one file remaining in the Host. */
710
680/*H:510 At boot or module load time, init_pagetables() allocates and populates 711/*H:510 At boot or module load time, init_pagetables() allocates and populates
681 * the Switcher PTE page for each CPU. */ 712 * the Switcher PTE page for each CPU. */
682__init int init_pagetables(struct page **switcher_page, unsigned int pages) 713__init int init_pagetables(struct page **switcher_page, unsigned int pages)
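(A hedged sketch of the body such a routine needs: one zeroed PTE page per
possible CPU, each filled in by populate_switcher_pte_page() above.  The real
code must also free the pages already allocated when one allocation fails:)

{
	unsigned int i;

	for_each_possible_cpu(i) {
		switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!switcher_pte_page(i))
			return -ENOMEM;
		populate_switcher_pte_page(i, switcher_page, pages);
	}
	return 0;
}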
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index c2434ec99f7b..9e189cbec7dd 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -12,8 +12,6 @@
12#include "lg.h" 12#include "lg.h"
13 13
14/*H:600 14/*H:600
15 * We've almost completed the Host; there's just one file to go!
16 *
17 * Segments & The Global Descriptor Table 15 * Segments & The Global Descriptor Table
18 * 16 *
19 * (That title sounds like a bad Nerdcore group. Not to suggest that there are 17 * (That title sounds like a bad Nerdcore group. Not to suggest that there are
@@ -55,7 +53,7 @@ static int ignored_gdt(unsigned int num)
55 || num == GDT_ENTRY_DOUBLEFAULT_TSS); 53 || num == GDT_ENTRY_DOUBLEFAULT_TSS);
56} 54}
57 55
58/*H:610 Once the GDT has been changed, we fix the new entries up a little. We 56/*H:630 Once the Guest gives us new GDT entries, we fix them up a little. We
59 * don't care if they're invalid: the worst that can happen is a General 57 * don't care if they're invalid: the worst that can happen is a General
60 * Protection Fault in the Switcher when it restores a Guest segment register 58 * Protection Fault in the Switcher when it restores a Guest segment register
61 * which tries to use that entry. Then we kill the Guest for causing such a 59 * which tries to use that entry. Then we kill the Guest for causing such a
@@ -84,25 +82,33 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
84 } 82 }
85} 83}
86 84
87/* This routine is called at boot or modprobe time for each CPU to set up the 85/*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep
88 * "constant" GDT entries for Guests running on that CPU. */ 86 * a GDT for each CPU, and copy across the Guest's entries each time we want to
87 * run the Guest on that CPU.
88 *
89 * This routine is called at boot or modprobe time for each CPU to set up the
90 * constant GDT entries: the ones which are the same no matter what Guest we're
91 * running. */
89void setup_default_gdt_entries(struct lguest_ro_state *state) 92void setup_default_gdt_entries(struct lguest_ro_state *state)
90{ 93{
91 struct desc_struct *gdt = state->guest_gdt; 94 struct desc_struct *gdt = state->guest_gdt;
92 unsigned long tss = (unsigned long)&state->guest_tss; 95 unsigned long tss = (unsigned long)&state->guest_tss;
93 96
94 /* The hypervisor segments are full 0-4G segments, privilege level 0 */ 97 /* The Switcher segments are full 0-4G segments, privilege level 0 */
95 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 98 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
96 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 99 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
97 100
98 /* The TSS segment refers to the TSS entry for this CPU, so we cannot 101 /* The TSS segment refers to the TSS entry for this particular CPU.
99 * copy it from the Guest. Forgive the magic flags */ 102 * Forgive the magic flags: the 0x8900 means the entry is Present, it's
103 * privilege level 0 Available 386 TSS system segment, and the 0x67
104 * means Saturn is eclipsed by Mercury in the twelfth house. */
100 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); 105 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
101 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) 106 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
102 | ((tss >> 16) & 0x000000FF); 107 | ((tss >> 16) & 0x000000FF);
103} 108}
104 109
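(For the curious, the astrology decodes as standard i386 descriptor packing,
nothing lguest-specific:

	.a bits  0-15: limit 0x67 = 103, the size of a 386 TSS minus one
	.a bits 16-31: TSS base address, bits  0-15
	.b bits  0- 7: TSS base address, bits 16-23
	.b bits  8-15: access byte 0x89: Present, DPL 0, available 386 TSS
	.b bits 24-31: TSS base address, bits 24-31)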
105/* This routine is called before the Guest is run for the first time. */ 110/* This routine sets up the initial Guest GDT for booting. All entries start
111 * as 0 (unusable). */
106void setup_guest_gdt(struct lguest *lg) 112void setup_guest_gdt(struct lguest *lg)
107{ 113{
108 /* Start with full 0-4G segments... */ 114 /* Start with full 0-4G segments... */
@@ -114,13 +120,8 @@ void setup_guest_gdt(struct lguest *lg)
114 lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 120 lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
115} 121}
116 122
117/* Like the IDT, we never simply use the GDT the Guest gives us. We set up the 123/*H:650 An optimization of copy_gdt(), for just the three "thread-local storage"
118 * GDTs for each CPU, then we copy across the entries each time we want to run 124 * entries. */
119 * a different Guest on that CPU. */
120
121/* A partial GDT load, for the three "thead-local storage" entries. Otherwise
122 * it's just like load_guest_gdt(). So much, in fact, it would probably be
123 * neater to have a single hypercall to cover both. */
124void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) 125void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
125{ 126{
126 unsigned int i; 127 unsigned int i;
@@ -129,7 +130,9 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
129 gdt[i] = lg->arch.gdt[i]; 130 gdt[i] = lg->arch.gdt[i];
130} 131}
131 132
132/* This is the full version */ 133/*H:640 When the Guest is run on a different CPU, or the GDT entries have
134 * changed, copy_gdt() is called to copy the Guest's GDT entries across to this
135 * CPU's GDT. */
133void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) 136void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
134{ 137{
135 unsigned int i; 138 unsigned int i;
@@ -141,7 +144,8 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
141 gdt[i] = lg->arch.gdt[i]; 144 gdt[i] = lg->arch.gdt[i];
142} 145}
143 146
144/* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */ 147/*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT).
148 * We copy it from the Guest and tweak the entries. */
145void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) 149void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
146{ 150{
147 /* We assume the Guest has the same number of GDT entries as the 151 /* We assume the Guest has the same number of GDT entries as the
@@ -157,16 +161,22 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
157 lg->changed |= CHANGED_GDT; 161 lg->changed |= CHANGED_GDT;
158} 162}
159 163
164/* This is the fast-track version for just changing the three TLS entries.
165 * Remember that this happens on every context switch, so it's worth
166 * optimizing. But wouldn't it be neater to have a single hypercall to cover
167 * both cases? */
160void guest_load_tls(struct lguest *lg, unsigned long gtls) 168void guest_load_tls(struct lguest *lg, unsigned long gtls)
161{ 169{
162 struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; 170 struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN];
163 171
164 __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 172 __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
165 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 173 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
174 /* Note that just the TLS entries have changed. */
166 lg->changed |= CHANGED_GDT_TLS; 175 lg->changed |= CHANGED_GDT_TLS;
167} 176}
177/*:*/
168 178
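(The Guest's end of this fast path is a single hypercall on each context
switch; a sketch, noting that the real caller lives in the Guest's paravirt
code and may batch the call lazily:)

static void example_load_tls(struct thread_struct *t, unsigned int cpu)
{
	/* Point the Host at our three TLS descriptors for this CPU. */
	hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
}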
169/* 179/*H:660
170 * With this, we have finished the Host. 180 * With this, we have finished the Host.
171 * 181 *
172 * Five of the seven parts of our task are complete. You have made it through 182 * Five of the seven parts of our task are complete. You have made it through
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 9eed12d5a395..482aec2a9631 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -63,7 +63,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
63static DEFINE_PER_CPU(struct lguest *, last_guest); 63static DEFINE_PER_CPU(struct lguest *, last_guest);
64 64
65/*S:010 65/*S:010
66 * We are getting close to the Switcher. 66 * We approach the Switcher.
67 * 67 *
68 * Remember that each CPU has two pages which are visible to the Guest when it 68 * Remember that each CPU has two pages which are visible to the Guest when it
69 * runs on that CPU. This has to contain the state for that Guest: we copy the 69 * runs on that CPU. This has to contain the state for that Guest: we copy the
@@ -134,7 +134,7 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
134 * 134 *
135 * The lcall also pushes the old code segment (KERNEL_CS) onto the 135 * The lcall also pushes the old code segment (KERNEL_CS) onto the
136 * stack, then the address of this call. This stack layout happens to 136 * stack, then the address of this call. This stack layout happens to
137 * exactly match the stack of an interrupt... */ 137 * exactly match the stack layout created by an interrupt... */
138 asm volatile("pushf; lcall *lguest_entry" 138 asm volatile("pushf; lcall *lguest_entry"
139 /* This is how we tell GCC that %eax ("a") and %ebx ("b") 139 /* This is how we tell GCC that %eax ("a") and %ebx ("b")
140 * are changed by this routine. The "=" means output. */ 140 * are changed by this routine. The "=" means output. */
@@ -151,40 +151,46 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
151} 151}
152/*:*/ 152/*:*/
153 153
154/*M:002 There are hooks in the scheduler which we can register to tell when we
155 * get kicked off the CPU (preempt_notifier_register()). This would allow us
156 * to lazily disable SYSENTER which would regain some performance, and should
157 * also simplify copy_in_guest_info(). Note that we'd still need to restore
158 * things when we exit to Launcher userspace, but that's fairly easy.
159 *
160 * The hooks were designed for KVM, but we can also put them to good use. :*/
161
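(A sketch of what taking up that offer could look like.  The preempt notifier
API is real (see linux/preempt.h), but the wiring below is imagined, since
lguest doesn't do this yet:)

static void example_sched_in(struct preempt_notifier *pn, int cpu)
{
	/* Back on a CPU: redo anything we let lapse, eg. disable SYSENTER. */
}

static void example_sched_out(struct preempt_notifier *pn,
			      struct task_struct *next)
{
	/* Being preempted: give the Host its SYSENTER MSR back. */
}

static struct preempt_ops example_preempt_ops = {
	.sched_in = example_sched_in,
	.sched_out = example_sched_out,
};

/* Registered once per Guest task, something like:
 *	preempt_notifier_init(&notifier, &example_preempt_ops);
 *	preempt_notifier_register(&notifier); */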
154/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts 162/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts
155 * are disabled: we own the CPU. */ 163 * are disabled: we own the CPU. */
156void lguest_arch_run_guest(struct lguest *lg) 164void lguest_arch_run_guest(struct lguest *lg)
157{ 165{
158 /* Remember the awfully-named TS bit? If the Guest has asked 166 /* Remember the awfully-named TS bit? If the Guest has asked to set it
159 * to set it we set it now, so we can trap and pass that trap 167 * we set it now, so we can trap and pass that trap to the Guest if it
160 * to the Guest if it uses the FPU. */ 168 * uses the FPU. */
161 if (lg->ts) 169 if (lg->ts)
162 lguest_set_ts(); 170 lguest_set_ts();
163 171
164 /* SYSENTER is an optimized way of doing system calls. We 172 /* SYSENTER is an optimized way of doing system calls. We can't allow
165 * can't allow it because it always jumps to privilege level 0. 173 * it because it always jumps to privilege level 0. A normal Guest
166 * A normal Guest won't try it because we don't advertise it in 174 * won't try it because we don't advertise it in CPUID, but a malicious
167 * CPUID, but a malicious Guest (or malicious Guest userspace 175 * Guest (or malicious Guest userspace program) could, so we tell the
168 * program) could, so we tell the CPU to disable it before 176 * CPU to disable it before running the Guest. */
169 * running the Guest. */
170 if (boot_cpu_has(X86_FEATURE_SEP)) 177 if (boot_cpu_has(X86_FEATURE_SEP))
171 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 178 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
172 179
173 /* Now we actually run the Guest. It will pop back out when 180 /* Now we actually run the Guest. It will return when something
174 * something interesting happens, and we can examine its 181 * interesting happens, and we can examine its registers to see what it
175 * registers to see what it was doing. */ 182 * was doing. */
176 run_guest_once(lg, lguest_pages(raw_smp_processor_id())); 183 run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
177 184
178 /* The "regs" pointer contains two extra entries which are not 185 /* Note that the "regs" pointer contains two extra entries which are
179 * really registers: a trap number which says what interrupt or 186 * not really registers: a trap number which says what interrupt or
180 * trap made the switcher code come back, and an error code 187 * trap made the switcher code come back, and an error code which some
181 * which some traps set. */ 188 * traps set. */
182 189
183 /* If the Guest page faulted, then the cr2 register will tell 190 /* If the Guest page faulted, then the cr2 register will tell us the
184 * us the bad virtual address. We have to grab this now, 191 * bad virtual address. We have to grab this now, because once we
185 * because once we re-enable interrupts an interrupt could 192 * re-enable interrupts an interrupt could fault and thus overwrite
186 * fault and thus overwrite cr2, or we could even move off to a 193 * cr2, or we could even move off to a different CPU. */
187 * different CPU. */
188 if (lg->regs->trapnum == 14) 194 if (lg->regs->trapnum == 14)
189 lg->arch.last_pagefault = read_cr2(); 195 lg->arch.last_pagefault = read_cr2();
190 /* Similarly, if we took a trap because the Guest used the FPU, 196 /* Similarly, if we took a trap because the Guest used the FPU,
@@ -197,14 +203,15 @@ void lguest_arch_run_guest(struct lguest *lg)
197 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 203 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
198} 204}
199 205
200/*H:130 Our Guest is usually so well behaved; it never tries to do things it 206/*H:130 Now we've examined the hypercall code; our Guest can make requests.
201 * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't 207 * Our Guest is usually so well behaved; it never tries to do things it isn't
202 * quite complete, because it doesn't contain replacements for the Intel I/O 208 * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual
203 * instructions. As a result, the Guest sometimes fumbles across one during 209 * infrastructure isn't quite complete, because it doesn't contain replacements
204 * the boot process as it probes for various things which are usually attached 210 * for the Intel I/O instructions. As a result, the Guest sometimes fumbles
205 * to a PC. 211 * across one during the boot process as it probes for various things which are
212 * usually attached to a PC.
206 * 213 *
207 * When the Guest uses one of these instructions, we get trap #13 (General 214 * When the Guest uses one of these instructions, we get a trap (General
208 * Protection Fault) and come here. We see if it's one of those troublesome 215 * Protection Fault) and come here. We see if it's one of those troublesome
209 * instructions and skip over it. We return true if we did. */ 216 * instructions and skip over it. We return true if we did. */
210static int emulate_insn(struct lguest *lg) 217static int emulate_insn(struct lguest *lg)
@@ -275,43 +282,43 @@ static int emulate_insn(struct lguest *lg)
275void lguest_arch_handle_trap(struct lguest *lg) 282void lguest_arch_handle_trap(struct lguest *lg)
276{ 283{
277 switch (lg->regs->trapnum) { 284 switch (lg->regs->trapnum) {
278 case 13: /* We've intercepted a GPF. */ 285 case 13: /* We've intercepted a General Protection Fault. */
279 /* Check if this was one of those annoying IN or OUT 286 /* Check if this was one of those annoying IN or OUT
280 * instructions which we need to emulate. If so, we 287 * instructions which we need to emulate. If so, we just go
281 * just go back into the Guest after we've done it. */ 288 * back into the Guest after we've done it. */
282 if (lg->regs->errcode == 0) { 289 if (lg->regs->errcode == 0) {
283 if (emulate_insn(lg)) 290 if (emulate_insn(lg))
284 return; 291 return;
285 } 292 }
286 break; 293 break;
287 case 14: /* We've intercepted a page fault. */ 294 case 14: /* We've intercepted a Page Fault. */
288 /* The Guest accessed a virtual address that wasn't 295 /* The Guest accessed a virtual address that wasn't mapped.
289 * mapped. This happens a lot: we don't actually set 296 * This happens a lot: we don't actually set up most of the
290 * up most of the page tables for the Guest at all when 297 * page tables for the Guest at all when we start: as it runs
291 * we start: as it runs it asks for more and more, and 298 * it asks for more and more, and we set them up as
292 * we set them up as required. In this case, we don't 299 * required. In this case, we don't even tell the Guest that
293 * even tell the Guest that the fault happened. 300 * the fault happened.
294 * 301 *
295 * The errcode tells whether this was a read or a 302 * The errcode tells whether this was a read or a write, and
296 * write, and whether kernel or userspace code. */ 303 * whether kernel or userspace code. */
297 if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) 304 if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
298 return; 305 return;
299 306
300 /* OK, it's really not there (or not OK): the Guest 307 /* OK, it's really not there (or not OK): the Guest needs to
301 * needs to know. We write out the cr2 value so it 308 * know. We write out the cr2 value so it knows where the
302 * knows where the fault occurred. 309 * fault occurred.
303 * 310 *
304 * Note that if the Guest were really messed up, this 311 * Note that if the Guest were really messed up, this could
305 * could happen before it's done the INITIALIZE 312 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
306 * hypercall, so lg->lguest_data will be NULL */ 313 * lg->lguest_data could be NULL */
307 if (lg->lguest_data && 314 if (lg->lguest_data &&
308 put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) 315 put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
309 kill_guest(lg, "Writing cr2"); 316 kill_guest(lg, "Writing cr2");
310 break; 317 break;
311 case 7: /* We've intercepted a Device Not Available fault. */ 318 case 7: /* We've intercepted a Device Not Available fault. */
312 /* If the Guest doesn't want to know, we already 319 /* If the Guest doesn't want to know, we already restored the
313 * restored the Floating Point Unit, so we just 320 * Floating Point Unit, so we just continue without telling
314 * continue without telling it. */ 321 * it. */
315 if (!lg->ts) 322 if (!lg->ts)
316 return; 323 return;
317 break; 324 break;
@@ -536,9 +543,6 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
536 543
537 return 0; 544 return 0;
538} 545}
539/* Now we've examined the hypercall code; our Guest can make requests. There
540 * is one other way we can do things for the Guest, as we see in
541 * emulate_insn(). :*/
542 546
543/*L:030 lguest_arch_setup_regs() 547/*L:030 lguest_arch_setup_regs()
544 * 548 *
@@ -562,7 +566,7 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
562 * is supposed to always be "1". Bit 9 (0x200) controls whether 566 * is supposed to always be "1". Bit 9 (0x200) controls whether
563 * interrupts are enabled. We always leave interrupts enabled while 567 * interrupts are enabled. We always leave interrupts enabled while
564 * running the Guest. */ 568 * running the Guest. */
565 regs->eflags = 0x202; 569 regs->eflags = X86_EFLAGS_IF | 0x2;
566 570
567 /* The "Extended Instruction Pointer" register says where the Guest is 571 /* The "Extended Instruction Pointer" register says where the Guest is
568 * running. */ 572 * running. */
@@ -570,8 +574,8 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
570 574
571 /* %esi points to our boot information, at physical address 0, so don't 575 /* %esi points to our boot information, at physical address 0, so don't
572 * touch it. */ 576 * touch it. */
577
573 /* There are a couple of GDT entries the Guest expects when first 578 /* There are a couple of GDT entries the Guest expects when first
574 * booting. */ 579 * booting. */
575
576 setup_guest_gdt(lg); 580 setup_guest_gdt(lg);
577} 581}
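(For the record, the rewritten constant is unchanged: X86_EFLAGS_IF is 0x200
and the always-one reserved bit is 0x2, so the Guest still starts with eflags
0x202.  The symbolic form merely says why.)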
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S
index 1010b90b11fc..0af8baaa0d4a 100644
--- a/drivers/lguest/x86/switcher_32.S
+++ b/drivers/lguest/x86/switcher_32.S
@@ -6,6 +6,37 @@
6 * are feeling invigorated and refreshed then the next, more challenging stage 6 * are feeling invigorated and refreshed then the next, more challenging stage
7 * can be found in "make Guest". :*/ 7 * can be found in "make Guest". :*/
8 8
9/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
10 * gain at least 1% more performance. Since neither LOC nor performance can be
11 * measured beforehand, it generally means implementing a feature then deciding
12 * if it's worth it. And once it's implemented, who can say no?
13 *
14 * This is why I haven't implemented this idea myself. I want to, but I
15 * haven't. You could, though.
16 *
17 * The main place where lguest performance sucks is Guest page faulting. When
18 * a Guest userspace process hits an unmapped page we switch back to the Host,
19 * walk the page tables, find it's not mapped, switch back to the Guest page
20 * fault handler, which calls a hypercall to set the page table entry, then
21 * finally returns to userspace. That's two round-trips.
22 *
23 * If we had a small walker in the Switcher, we could quickly check the Guest
24 * page table and if the page isn't mapped, immediately reflect the fault back
25 * into the Guest. This means the Switcher would have to know the top of the
26 * Guest page table and the page fault handler address.
27 *
28 * For simplicity, the Guest should only handle the case where the privilege
29 * level of the fault is 3 and probably only not present or write faults. It
30 * should also detect recursive faults, and hand the original fault to the
31 * Host (which is actually really easy).
32 *
33 * Two questions remain. Would the performance gain outweigh the complexity?
34 * And who would write the verse documenting it? :*/
35
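(Sketched in C-flavoured pseudocode, since the real thing would be Switcher
assembler.  Everything here is invented to match the prose above:)

//	if (fault_privilege_level() == 3 && !in_recursive_fault()) {
//		pte = walk_guest_pagetable(guest_pgdir_top, fault_address);
//		if (!(pte & _PAGE_PRESENT))
//			reflect_fault_to_guest(guest_page_fault_handler);
//	}
//	// Anything else: bail to the Host, exactly as we do today.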
36/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their
37 * code). It's worth doing though, since it would let us use oprofile in the
38 * Host when a Guest is running. :*/
39
9/*S:100 40/*S:100
10 * Welcome to the Switcher itself! 41 * Welcome to the Switcher itself!
11 * 42 *
@@ -88,7 +119,7 @@ ENTRY(switch_to_guest)
88 119
89 // All saved and there's now five steps before us: 120 // All saved and there's now five steps before us:
90 // Stack, GDT, IDT, TSS 121 // Stack, GDT, IDT, TSS
91 // And last of all the page tables are flipped. 122 // Then last of all the page tables are flipped.
92 123
93 // Yet beware that our stack pointer must be 124 // Yet beware that our stack pointer must be
94 // Always valid lest an NMI hits 125 // Always valid lest an NMI hits
@@ -103,25 +134,25 @@ ENTRY(switch_to_guest)
103 lgdt LGUEST_PAGES_guest_gdt_desc(%eax) 134 lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
104 135
105 // The Guest's IDT we did partially 136 // The Guest's IDT we did partially
106 // Move to the "struct lguest_pages" as well. 137 // Copy to "struct lguest_pages" as well.
107 lidt LGUEST_PAGES_guest_idt_desc(%eax) 138 lidt LGUEST_PAGES_guest_idt_desc(%eax)
108 139
109 // The TSS entry which controls traps 140 // The TSS entry which controls traps
110 // Must be loaded up with "ltr" now: 141 // Must be loaded up with "ltr" now:
142 // The GDT entry that TSS uses
143 // Changes type when we load it: damn Intel!
111 // For after we switch over our page tables 144 // For after we switch over our page tables
112 // It (as the rest) will be writable no more. 145 // That entry will be read-only: we'd crash.
113 // (The GDT entry TSS needs
114 // Changes type when we load it: damn Intel!)
115 movl $(GDT_ENTRY_TSS*8), %edx 146 movl $(GDT_ENTRY_TSS*8), %edx
116 ltr %dx 147 ltr %dx
117 148
118 // Look back now, before we take this last step! 149 // Look back now, before we take this last step!
119 // The Host's TSS entry was also marked used; 150 // The Host's TSS entry was also marked used;
120 // Let's clear it again, ere we return. 151 // Let's clear it again for our return.
121 // The GDT descriptor of the Host 152 // The GDT descriptor of the Host
122 // Points to the table after two "size" bytes 153 // Points to the table after two "size" bytes
123 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx 154 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
124 // Clear the type field of "used" (byte 5, bit 2) 155 // Clear "used" from type field (byte 5, bit 2)
125 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) 156 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
126 157
127 // Once our page table's switched, the Guest is live! 158 // Once our page table's switched, the Guest is live!
@@ -131,7 +162,7 @@ ENTRY(switch_to_guest)
131 162
132 // The page table change did one tricky thing: 163 // The page table change did one tricky thing:
133 // The Guest's register page has been mapped 164 // The Guest's register page has been mapped
134 // Writable onto our %esp (stack) -- 165 // Writable under our %esp (stack) --
135 // We can simply pop off all Guest regs. 166 // We can simply pop off all Guest regs.
136 popl %eax 167 popl %eax
137 popl %ebx 168 popl %ebx
@@ -152,16 +183,15 @@ ENTRY(switch_to_guest)
152 addl $8, %esp 183 addl $8, %esp
153 184
154 // The last five stack slots hold return address 185 // The last five stack slots hold return address
155 // And everything needed to change privilege 186 // And everything needed to switch privilege
156 // Into the Guest privilege level of 1, 187 // From Switcher's level 0 to Guest's 1,
157 // And the stack where the Guest had last left it. 188 // And the stack where the Guest had last left it.
158 // Interrupts are turned back on: we are Guest. 189 // Interrupts are turned back on: we are Guest.
159 iret 190 iret
160 191
161// There are two paths where we switch to the Host 192// We treat two paths to switch back to the Host
193// Yet both must save Guest state and restore Host
162// So we put the routine in a macro. 194// So we put the routine in a macro.
163// We are on our way home, back to the Host
164// Interrupted out of the Guest, we come here.
165#define SWITCH_TO_HOST \ 195#define SWITCH_TO_HOST \
166 /* We save the Guest state: all registers first \ 196 /* We save the Guest state: all registers first \
167 * Laid out just as "struct lguest_regs" defines */ \ 197 * Laid out just as "struct lguest_regs" defines */ \
@@ -194,7 +224,7 @@ ENTRY(switch_to_guest)
194 movl %esp, %eax; \ 224 movl %esp, %eax; \
195 andl $(~(1 << PAGE_SHIFT - 1)), %eax; \ 225 andl $(~(1 << PAGE_SHIFT - 1)), %eax; \
196 /* Save our trap number: the switch will obscure it \ 226 /* Save our trap number: the switch will obscure it \
197 * (The Guest regs are not mapped here in the Host) \ 227 * (In the Host the Guest regs are not mapped here) \
198 * %ebx holds it safe for deliver_to_host */ \ 228 * %ebx holds it safe for deliver_to_host */ \
199 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ 229 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
200 /* The Host GDT, IDT and stack! \ 230 /* The Host GDT, IDT and stack! \
@@ -210,9 +240,9 @@ ENTRY(switch_to_guest)
210 /* Switch to Host's GDT, IDT. */ \ 240 /* Switch to Host's GDT, IDT. */ \
211 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ 241 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
212 lidt LGUEST_PAGES_host_idt_desc(%eax); \ 242 lidt LGUEST_PAGES_host_idt_desc(%eax); \
213 /* Restore the Host's stack where it's saved regs lie */ \ 243 /* Restore the Host's stack where its saved regs lie */ \
214 movl LGUEST_PAGES_host_sp(%eax), %esp; \ 244 movl LGUEST_PAGES_host_sp(%eax), %esp; \
215 /* Last the TSS: our Host is complete */ \ 245 /* Last the TSS: our Host is returned */ \
216 movl $(GDT_ENTRY_TSS*8), %edx; \ 246 movl $(GDT_ENTRY_TSS*8), %edx; \
217 ltr %dx; \ 247 ltr %dx; \
218 /* Restore now the regs saved right at the first. */ \ 248 /* Restore now the regs saved right at the first. */ \
@@ -222,14 +252,15 @@ ENTRY(switch_to_guest)
222 popl %ds; \ 252 popl %ds; \
223 popl %es 253 popl %es
224 254
225// Here's where we come when the Guest has just trapped: 255// The first path is trod when the Guest has trapped:
226// (Which trap we'll see has been pushed on the stack). 256// (Which trap it was has been pushed on the stack).
227// We need only switch back, and the Host will decode 257// We need only switch back, and the Host will decode
228// Why we came home, and what needs to be done. 258// Why we came home, and what needs to be done.
229return_to_host: 259return_to_host:
230 SWITCH_TO_HOST 260 SWITCH_TO_HOST
231 iret 261 iret
232 262
263// We are led to the second path like so:
233// An interrupt, with some cause external 264// An interrupt, with some cause external
234// Has jerked us rudely from the Guest's code 265// Has jerked us rudely from the Guest's code
235// Again we must return home to the Host 266// Again we must return home to the Host
@@ -238,7 +269,7 @@ deliver_to_host:
238 // But now we must go home via that place 269 // But now we must go home via that place
239 // Where that interrupt was supposed to go 270 // Where that interrupt was supposed to go
240 // Had we not been ensconced, running the Guest. 271 // Had we not been ensconced, running the Guest.
241 // Here we see the cleverness of our stack: 272 // Here we see the trickiness of run_guest_once():
242 // The Host stack is formed like an interrupt 273 // The Host stack is formed like an interrupt
243 // With EIP, CS and EFLAGS layered. 274 // With EIP, CS and EFLAGS layered.
244 // Interrupt handlers end with "iret" 275 // Interrupt handlers end with "iret"
@@ -263,7 +294,7 @@ deliver_to_host:
263 xorw %ax, %ax 294 xorw %ax, %ax
264 orl %eax, %edx 295 orl %eax, %edx
265 // Now the address of the handler's in %edx 296 // Now the address of the handler's in %edx
266 // We call it now: its "iret" takes us home. 297 // We call it now: its "iret" drops us home.
267 jmp *%edx 298 jmp *%edx
268 299
269// Every interrupt can come to us here 300// Every interrupt can come to us here
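A note for readers tracing deliver_to_host above: the trickiness is that run_guest_once() left the Host stack holding exactly what a hardware interrupt would have pushed, so jumping to the Host's own handler lets that handler's closing "iret" unwind straight into the Host. A purely illustrative C view of that stack top (this struct is not defined anywhere in the kernel):

	/* Top of the Host stack as deliver_to_host expects it: the three
	 * words a no-privilege-change x86 interrupt pushes.  The Host
	 * handler's final "iret" pops them and resumes at eip. */
	struct host_return_frame {
		unsigned long eip;	/* where the Host resumes */
		unsigned long cs;	/* Host kernel code segment */
		unsigned long eflags;	/* interrupts restored on iret */
	};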
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ac54f697c508..1c159ac68c98 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -351,14 +351,10 @@ static int crypt_convert(struct crypt_config *cc,
351 struct scatterlist sg_in, sg_out; 351 struct scatterlist sg_in, sg_out;
352 352
353 sg_init_table(&sg_in, 1); 353 sg_init_table(&sg_in, 1);
354 sg_set_page(&sg_in, bv_in->bv_page); 354 sg_set_page(&sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, bv_in->bv_offset + ctx->offset_in);
355 sg_in.offset = bv_in->bv_offset + ctx->offset_in;
356 sg_in.length = 1 << SECTOR_SHIFT;
357 355
358 sg_init_table(&sg_out, 1); 356 sg_init_table(&sg_out, 1);
359 sg_set_page(&sg_out, bv_out->bv_page); 357 sg_set_page(&sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, bv_out->bv_offset + ctx->offset_out);
360 sg_out.offset = bv_out->bv_offset + ctx->offset_out;
361 sg_out.length = 1 << SECTOR_SHIFT;
362 358
363 ctx->offset_in += sg_in.length; 359 ctx->offset_in += sg_in.length;
364 if (ctx->offset_in >= bv_in->bv_len) { 360 if (ctx->offset_in >= bv_in->bv_len) {
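This dm-crypt hunk shows the conversion made throughout this patch: sg_set_page() now takes the length and offset along with the page, so the separate field assignments fold into one call. A minimal sketch of the new convention, assuming the 2.6.24-era scatterlist API (names here are illustrative):

	#include <linux/scatterlist.h>

	/* Map one 512-byte sector of a page: sg_init_table() zeroes the
	 * list and marks its last entry, then sg_set_page() records the
	 * page, length and offset in a single call. */
	static void sg_one_sector(struct scatterlist *sg, struct page *page,
				  unsigned int offset)
	{
		sg_init_table(sg, 1);
		sg_set_page(sg, page, 512, offset);
	}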
diff --git a/drivers/media/common/saa7146_core.c b/drivers/media/common/saa7146_core.c
index 2b1f8b4be00a..cb034ead95ab 100644
--- a/drivers/media/common/saa7146_core.c
+++ b/drivers/media/common/saa7146_core.c
@@ -118,8 +118,7 @@ static struct scatterlist* vmalloc_to_sg(unsigned char *virt, int nr_pages)
118 if (NULL == pg) 118 if (NULL == pg)
119 goto err; 119 goto err;
120 BUG_ON(PageHighMem(pg)); 120 BUG_ON(PageHighMem(pg));
121 sg_set_page(&sglist[i], pg); 121 sg_set_page(&sglist[i], pg, PAGE_SIZE, 0);
122 sglist[i].length = PAGE_SIZE;
123 } 122 }
124 return sglist; 123 return sglist;
125 124
diff --git a/drivers/media/video/ivtv/ivtv-udma.c b/drivers/media/video/ivtv/ivtv-udma.c
index 912b424e5204..460db03b0ba0 100644
--- a/drivers/media/video/ivtv/ivtv-udma.c
+++ b/drivers/media/video/ivtv/ivtv-udma.c
@@ -49,8 +49,6 @@ int ivtv_udma_fill_sg_list (struct ivtv_user_dma *dma, struct ivtv_dma_page_info
49 unsigned int len = (i == dma_page->page_count - 1) ? 49 unsigned int len = (i == dma_page->page_count - 1) ?
50 dma_page->tail : PAGE_SIZE - offset; 50 dma_page->tail : PAGE_SIZE - offset;
51 51
52 dma->SGlist[map_offset].length = len;
53 dma->SGlist[map_offset].offset = offset;
54 if (PageHighMem(dma->map[map_offset])) { 52 if (PageHighMem(dma->map[map_offset])) {
55 void *src; 53 void *src;
56 54
@@ -63,10 +61,10 @@ int ivtv_udma_fill_sg_list (struct ivtv_user_dma *dma, struct ivtv_dma_page_info
63 memcpy(page_address(dma->bouncemap[map_offset]) + offset, src, len); 61 memcpy(page_address(dma->bouncemap[map_offset]) + offset, src, len);
64 kunmap_atomic(src, KM_BOUNCE_READ); 62 kunmap_atomic(src, KM_BOUNCE_READ);
65 local_irq_restore(flags); 63 local_irq_restore(flags);
66 sg_set_page(&dma->SGlist[map_offset], dma->bouncemap[map_offset]); 64 sg_set_page(&dma->SGlist[map_offset], dma->bouncemap[map_offset], len, offset);
67 } 65 }
68 else { 66 else {
69 sg_set_page(&dma->SGlist[map_offset], dma->map[map_offset]); 67 sg_set_page(&dma->SGlist[map_offset], dma->map[map_offset], len, offset);
70 } 68 }
71 offset = 0; 69 offset = 0;
72 map_offset++; 70 map_offset++;
diff --git a/drivers/media/video/videobuf-dma-sg.c b/drivers/media/video/videobuf-dma-sg.c
index 9ab94a749d81..44ee408e145f 100644
--- a/drivers/media/video/videobuf-dma-sg.c
+++ b/drivers/media/video/videobuf-dma-sg.c
@@ -67,8 +67,7 @@ videobuf_vmalloc_to_sg(unsigned char *virt, int nr_pages)
67 if (NULL == pg) 67 if (NULL == pg)
68 goto err; 68 goto err;
69 BUG_ON(PageHighMem(pg)); 69 BUG_ON(PageHighMem(pg));
70 sg_set_page(&sglist[i], pg); 70 sg_set_page(&sglist[i], pg, PAGE_SIZE, 0);
71 sglist[i].length = PAGE_SIZE;
72 } 71 }
73 return sglist; 72 return sglist;
74 73
@@ -95,16 +94,13 @@ videobuf_pages_to_sg(struct page **pages, int nr_pages, int offset)
95 if (PageHighMem(pages[0])) 94 if (PageHighMem(pages[0]))
96 /* DMA to highmem pages might not work */ 95 /* DMA to highmem pages might not work */
97 goto highmem; 96 goto highmem;
98 sg_set_page(&sglist[0], pages[0]); 97 sg_set_page(&sglist[0], pages[0], PAGE_SIZE - offset, offset);
99 sglist[0].offset = offset;
100 sglist[0].length = PAGE_SIZE - offset;
101 for (i = 1; i < nr_pages; i++) { 98 for (i = 1; i < nr_pages; i++) {
102 if (NULL == pages[i]) 99 if (NULL == pages[i])
103 goto nopage; 100 goto nopage;
104 if (PageHighMem(pages[i])) 101 if (PageHighMem(pages[i]))
105 goto highmem; 102 goto highmem;
106 sg_set_page(&sglist[i], pages[i]); 103 sg_set_page(&sglist[i], pages[i], PAGE_SIZE, 0);
107 sglist[i].length = PAGE_SIZE;
108 } 104 }
109 return sglist; 105 return sglist;
110 106
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index d602ba6d5417..682406168de9 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -284,6 +284,7 @@ static inline struct i2o_block_request *i2o_block_request_alloc(void)
284 return ERR_PTR(-ENOMEM); 284 return ERR_PTR(-ENOMEM);
285 285
286 INIT_LIST_HEAD(&ireq->queue); 286 INIT_LIST_HEAD(&ireq->queue);
287 sg_init_table(ireq->sg_table, I2O_MAX_PHYS_SEGMENTS);
287 288
288 return ireq; 289 return ireq;
289}; 290};
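The added sg_init_table() matters because scatterlists now carry termination markers: a table must be initialized before anyone walks it. A hedged sketch of the allocate-then-initialize pattern (the struct and segment count are made up):

	#include <linux/scatterlist.h>
	#include <linux/slab.h>

	#define EX_MAX_SEGS 16	/* illustrative segment limit */

	struct ex_request {
		struct scatterlist sg_table[EX_MAX_SEGS];
	};

	/* Initialize the embedded table as soon as the request exists, so
	 * every later user sees a properly terminated scatterlist. */
	static struct ex_request *ex_request_alloc(gfp_t gfp)
	{
		struct ex_request *ireq = kzalloc(sizeof(*ireq), gfp);

		if (!ireq)
			return NULL;
		sg_init_table(ireq->sg_table, EX_MAX_SEGS);
		return ireq;
	}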
diff --git a/drivers/mmc/host/au1xmmc.c b/drivers/mmc/host/au1xmmc.c
index bcbb6d247bf7..c77fadc0dfa3 100644
--- a/drivers/mmc/host/au1xmmc.c
+++ b/drivers/mmc/host/au1xmmc.c
@@ -40,13 +40,13 @@
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/interrupt.h> 41#include <linux/interrupt.h>
42#include <linux/dma-mapping.h> 42#include <linux/dma-mapping.h>
43#include <linux/scatterlist.h>
43 44
44#include <linux/mmc/host.h> 45#include <linux/mmc/host.h>
45#include <asm/io.h> 46#include <asm/io.h>
46#include <asm/mach-au1x00/au1000.h> 47#include <asm/mach-au1x00/au1000.h>
47#include <asm/mach-au1x00/au1xxx_dbdma.h> 48#include <asm/mach-au1x00/au1xxx_dbdma.h>
48#include <asm/mach-au1x00/au1100_mmc.h> 49#include <asm/mach-au1x00/au1100_mmc.h>
49#include <asm/scatterlist.h>
50 50
51#include <au1xxx.h> 51#include <au1xxx.h>
52#include "au1xmmc.h" 52#include "au1xmmc.h"
diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index d0eb0a2abf4d..95244a7e7353 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -20,11 +20,11 @@
20#include <linux/mmc/host.h> 20#include <linux/mmc/host.h>
21#include <linux/amba/bus.h> 21#include <linux/amba/bus.h>
22#include <linux/clk.h> 22#include <linux/clk.h>
23#include <linux/scatterlist.h>
23 24
24#include <asm/cacheflush.h> 25#include <asm/cacheflush.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include <asm/io.h> 27#include <asm/io.h>
27#include <asm/scatterlist.h>
28#include <asm/sizes.h> 28#include <asm/sizes.h>
29#include <asm/mach/mmc.h> 29#include <asm/mach/mmc.h>
30 30
@@ -167,7 +167,7 @@ mmci_data_irq(struct mmci_host *host, struct mmc_data *data,
167 * partially written to a page is properly coherent. 167 * partially written to a page is properly coherent.
168 */ 168 */
169 if (host->sg_len && data->flags & MMC_DATA_READ) 169 if (host->sg_len && data->flags & MMC_DATA_READ)
170 flush_dcache_page(host->sg_ptr->page); 170 flush_dcache_page(sg_page(host->sg_ptr));
171 } 171 }
172 if (status & MCI_DATAEND) { 172 if (status & MCI_DATAEND) {
173 mmci_stop_data(host); 173 mmci_stop_data(host);
@@ -319,7 +319,7 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
319 * page, ensure that the data cache is coherent. 319 * page, ensure that the data cache is coherent.
320 */ 320 */
321 if (status & MCI_RXACTIVE) 321 if (status & MCI_RXACTIVE)
322 flush_dcache_page(host->sg_ptr->page); 322 flush_dcache_page(sg_page(host->sg_ptr));
323 323
324 if (!mmci_next_sg(host)) 324 if (!mmci_next_sg(host))
325 break; 325 break;
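The flush_dcache_page(host->sg_ptr->page) to flush_dcache_page(sg_page(host->sg_ptr)) change reflects that the scatterlist page pointer can now carry chain and end markers in its low bits, so it must not be dereferenced directly. Roughly, under that assumption:

	#include <linux/highmem.h>
	#include <linux/scatterlist.h>

	/* sg_page() masks off the chain/termination bits that scatterlist
	 * chaining may store in the low bits of the page pointer. */
	static void flush_sg_entry(struct scatterlist *sg)
	{
		flush_dcache_page(sg_page(sg));
	}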
diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c
index 0601e01aa2c2..a25ee71998a9 100644
--- a/drivers/mmc/host/pxamci.c
+++ b/drivers/mmc/host/pxamci.c
@@ -29,7 +29,6 @@
29 29
30#include <asm/dma.h> 30#include <asm/dma.h>
31#include <asm/io.h> 31#include <asm/io.h>
32#include <asm/scatterlist.h>
33#include <asm/sizes.h> 32#include <asm/sizes.h>
34 33
35#include <asm/arch/pxa-regs.h> 34#include <asm/arch/pxa-regs.h>
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index d7c5b94d8c58..6b80bf77a4ef 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -17,8 +17,6 @@
17 17
18#include <linux/mmc/host.h> 18#include <linux/mmc/host.h>
19 19
20#include <asm/scatterlist.h>
21
22#include "sdhci.h" 20#include "sdhci.h"
23 21
24#define DRIVER_NAME "sdhci" 22#define DRIVER_NAME "sdhci"
diff --git a/drivers/mmc/host/wbsd.c b/drivers/mmc/host/wbsd.c
index fa4c8c53cc7a..4d5f37421874 100644
--- a/drivers/mmc/host/wbsd.c
+++ b/drivers/mmc/host/wbsd.c
@@ -33,10 +33,10 @@
33#include <linux/pnp.h> 33#include <linux/pnp.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35#include <linux/mmc/host.h> 35#include <linux/mmc/host.h>
36#include <linux/scatterlist.h>
36 37
37#include <asm/io.h> 38#include <asm/io.h>
38#include <asm/dma.h> 39#include <asm/dma.h>
39#include <asm/scatterlist.h>
40 40
41#include "wbsd.h" 41#include "wbsd.h"
42 42
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 6909becb10f6..6937ef0e7275 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -188,6 +188,7 @@ struct bond_parm_tbl arp_validate_tbl[] = {
188/*-------------------------- Forward declarations ---------------------------*/ 188/*-------------------------- Forward declarations ---------------------------*/
189 189
190static void bond_send_gratuitous_arp(struct bonding *bond); 190static void bond_send_gratuitous_arp(struct bonding *bond);
191static void bond_deinit(struct net_device *bond_dev);
191 192
192/*---------------------------- General routines -----------------------------*/ 193/*---------------------------- General routines -----------------------------*/
193 194
@@ -3681,7 +3682,7 @@ static int bond_open(struct net_device *bond_dev)
3681 } 3682 }
3682 3683
3683 if (bond->params.mode == BOND_MODE_8023AD) { 3684 if (bond->params.mode == BOND_MODE_8023AD) {
3684 INIT_DELAYED_WORK(&bond->ad_work, bond_alb_monitor); 3685 INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
3685 queue_delayed_work(bond->wq, &bond->ad_work, 0); 3686 queue_delayed_work(bond->wq, &bond->ad_work, 0);
3686 /* register to receive LACPDUs */ 3687 /* register to receive LACPDUs */
3687 bond_register_lacpdu(bond); 3688 bond_register_lacpdu(bond);
@@ -4449,7 +4450,7 @@ static int bond_init(struct net_device *bond_dev, struct bond_params *params)
4449/* De-initialize device specific data. 4450/* De-initialize device specific data.
4450 * Caller must hold rtnl_lock. 4451 * Caller must hold rtnl_lock.
4451 */ 4452 */
4452void bond_deinit(struct net_device *bond_dev) 4453static void bond_deinit(struct net_device *bond_dev)
4453{ 4454{
4454 struct bonding *bond = bond_dev->priv; 4455 struct bonding *bond = bond_dev->priv;
4455 4456
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index d1ed14bf1ccb..61c1b4536d34 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -302,7 +302,6 @@ int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_de
302int bond_create(char *name, struct bond_params *params, struct bonding **newbond); 302int bond_create(char *name, struct bond_params *params, struct bonding **newbond);
303void bond_destroy(struct bonding *bond); 303void bond_destroy(struct bonding *bond);
304int bond_release_and_destroy(struct net_device *bond_dev, struct net_device *slave_dev); 304int bond_release_and_destroy(struct net_device *bond_dev, struct net_device *slave_dev);
305void bond_deinit(struct net_device *bond_dev);
306int bond_create_sysfs(void); 305int bond_create_sysfs(void);
307void bond_destroy_sysfs(void); 306void bond_destroy_sysfs(void);
308void bond_destroy_sysfs_entry(struct bonding *bond); 307void bond_destroy_sysfs_entry(struct bonding *bond);
diff --git a/drivers/net/cpmac.c b/drivers/net/cpmac.c
index 57541d2d9e1e..6fd95a2c8cec 100644
--- a/drivers/net/cpmac.c
+++ b/drivers/net/cpmac.c
@@ -34,6 +34,7 @@
34#include <linux/skbuff.h> 34#include <linux/skbuff.h>
35#include <linux/mii.h> 35#include <linux/mii.h>
36#include <linux/phy.h> 36#include <linux/phy.h>
37#include <linux/phy_fixed.h>
37#include <linux/platform_device.h> 38#include <linux/platform_device.h>
38#include <linux/dma-mapping.h> 39#include <linux/dma-mapping.h>
39#include <asm/gpio.h> 40#include <asm/gpio.h>
@@ -53,12 +54,6 @@ MODULE_PARM_DESC(debug_level, "Number of NETIF_MSG bits to enable");
53MODULE_PARM_DESC(dumb_switch, "Assume switch is not connected to MDIO bus"); 54MODULE_PARM_DESC(dumb_switch, "Assume switch is not connected to MDIO bus");
54 55
55#define CPMAC_VERSION "0.5.0" 56#define CPMAC_VERSION "0.5.0"
56/* stolen from net/ieee80211.h */
57#ifndef MAC_FMT
58#define MAC_FMT "%02x:%02x:%02x:%02x:%02x:%02x"
59#define MAC_ARG(x) ((u8*)(x))[0], ((u8*)(x))[1], ((u8*)(x))[2], \
60 ((u8*)(x))[3], ((u8*)(x))[4], ((u8*)(x))[5]
61#endif
62/* frame size + 802.1q tag */ 57/* frame size + 802.1q tag */
63#define CPMAC_SKB_SIZE (ETH_FRAME_LEN + 4) 58#define CPMAC_SKB_SIZE (ETH_FRAME_LEN + 4)
64#define CPMAC_QUEUES 8 59#define CPMAC_QUEUES 8
@@ -211,6 +206,7 @@ struct cpmac_priv {
211 struct net_device *dev; 206 struct net_device *dev;
212 struct work_struct reset_work; 207 struct work_struct reset_work;
213 struct platform_device *pdev; 208 struct platform_device *pdev;
209 struct napi_struct napi;
214}; 210};
215 211
216static irqreturn_t cpmac_irq(int, void *); 212static irqreturn_t cpmac_irq(int, void *);
@@ -362,47 +358,48 @@ static void cpmac_set_multicast_list(struct net_device *dev)
362 } 358 }
363} 359}
364 360
365static struct sk_buff *cpmac_rx_one(struct net_device *dev, 361static struct sk_buff *cpmac_rx_one(struct cpmac_priv *priv,
366 struct cpmac_priv *priv,
367 struct cpmac_desc *desc) 362 struct cpmac_desc *desc)
368{ 363{
369 struct sk_buff *skb, *result = NULL; 364 struct sk_buff *skb, *result = NULL;
370 365
371 if (unlikely(netif_msg_hw(priv))) 366 if (unlikely(netif_msg_hw(priv)))
372 cpmac_dump_desc(dev, desc); 367 cpmac_dump_desc(priv->dev, desc);
373 cpmac_write(priv->regs, CPMAC_RX_ACK(0), (u32)desc->mapping); 368 cpmac_write(priv->regs, CPMAC_RX_ACK(0), (u32)desc->mapping);
374 if (unlikely(!desc->datalen)) { 369 if (unlikely(!desc->datalen)) {
375 if (netif_msg_rx_err(priv) && net_ratelimit()) 370 if (netif_msg_rx_err(priv) && net_ratelimit())
376 printk(KERN_WARNING "%s: rx: spurious interrupt\n", 371 printk(KERN_WARNING "%s: rx: spurious interrupt\n",
377 dev->name); 372 priv->dev->name);
378 return NULL; 373 return NULL;
379 } 374 }
380 375
381 skb = netdev_alloc_skb(dev, CPMAC_SKB_SIZE); 376 skb = netdev_alloc_skb(priv->dev, CPMAC_SKB_SIZE);
382 if (likely(skb)) { 377 if (likely(skb)) {
383 skb_reserve(skb, 2); 378 skb_reserve(skb, 2);
384 skb_put(desc->skb, desc->datalen); 379 skb_put(desc->skb, desc->datalen);
385 desc->skb->protocol = eth_type_trans(desc->skb, dev); 380 desc->skb->protocol = eth_type_trans(desc->skb, priv->dev);
386 desc->skb->ip_summed = CHECKSUM_NONE; 381 desc->skb->ip_summed = CHECKSUM_NONE;
387 dev->stats.rx_packets++; 382 priv->dev->stats.rx_packets++;
388 dev->stats.rx_bytes += desc->datalen; 383 priv->dev->stats.rx_bytes += desc->datalen;
389 result = desc->skb; 384 result = desc->skb;
390 dma_unmap_single(&dev->dev, desc->data_mapping, CPMAC_SKB_SIZE, 385 dma_unmap_single(&priv->dev->dev, desc->data_mapping,
391 DMA_FROM_DEVICE); 386 CPMAC_SKB_SIZE, DMA_FROM_DEVICE);
392 desc->skb = skb; 387 desc->skb = skb;
393 desc->data_mapping = dma_map_single(&dev->dev, skb->data, 388 desc->data_mapping = dma_map_single(&priv->dev->dev, skb->data,
394 CPMAC_SKB_SIZE, 389 CPMAC_SKB_SIZE,
395 DMA_FROM_DEVICE); 390 DMA_FROM_DEVICE);
396 desc->hw_data = (u32)desc->data_mapping; 391 desc->hw_data = (u32)desc->data_mapping;
397 if (unlikely(netif_msg_pktdata(priv))) { 392 if (unlikely(netif_msg_pktdata(priv))) {
398 printk(KERN_DEBUG "%s: received packet:\n", dev->name); 393 printk(KERN_DEBUG "%s: received packet:\n",
399 cpmac_dump_skb(dev, result); 394 priv->dev->name);
395 cpmac_dump_skb(priv->dev, result);
400 } 396 }
401 } else { 397 } else {
402 if (netif_msg_rx_err(priv) && net_ratelimit()) 398 if (netif_msg_rx_err(priv) && net_ratelimit())
403 printk(KERN_WARNING 399 printk(KERN_WARNING
404 "%s: low on skbs, dropping packet\n", dev->name); 400 "%s: low on skbs, dropping packet\n",
405 dev->stats.rx_dropped++; 401 priv->dev->name);
402 priv->dev->stats.rx_dropped++;
406 } 403 }
407 404
408 desc->buflen = CPMAC_SKB_SIZE; 405 desc->buflen = CPMAC_SKB_SIZE;
@@ -411,25 +408,25 @@ static struct sk_buff *cpmac_rx_one(struct net_device *dev,
411 return result; 408 return result;
412} 409}
413 410
414static int cpmac_poll(struct net_device *dev, int *budget) 411static int cpmac_poll(struct napi_struct *napi, int budget)
415{ 412{
416 struct sk_buff *skb; 413 struct sk_buff *skb;
417 struct cpmac_desc *desc; 414 struct cpmac_desc *desc;
418 int received = 0, quota = min(dev->quota, *budget); 415 int received = 0;
419 struct cpmac_priv *priv = netdev_priv(dev); 416 struct cpmac_priv *priv = container_of(napi, struct cpmac_priv, napi);
420 417
421 spin_lock(&priv->rx_lock); 418 spin_lock(&priv->rx_lock);
422 if (unlikely(!priv->rx_head)) { 419 if (unlikely(!priv->rx_head)) {
423 if (netif_msg_rx_err(priv) && net_ratelimit()) 420 if (netif_msg_rx_err(priv) && net_ratelimit())
424 printk(KERN_WARNING "%s: rx: polling, but no queue\n", 421 printk(KERN_WARNING "%s: rx: polling, but no queue\n",
425 dev->name); 422 priv->dev->name);
426 netif_rx_complete(dev); 423 netif_rx_complete(priv->dev, napi);
427 return 0; 424 return 0;
428 } 425 }
429 426
430 desc = priv->rx_head; 427 desc = priv->rx_head;
431 while ((received < quota) && ((desc->dataflags & CPMAC_OWN) == 0)) { 428 while (((desc->dataflags & CPMAC_OWN) == 0) && (received < budget)) {
432 skb = cpmac_rx_one(dev, priv, desc); 429 skb = cpmac_rx_one(priv, desc);
433 if (likely(skb)) { 430 if (likely(skb)) {
434 netif_receive_skb(skb); 431 netif_receive_skb(skb);
435 received++; 432 received++;
@@ -439,13 +436,11 @@ static int cpmac_poll(struct net_device *dev, int *budget)
439 436
440 priv->rx_head = desc; 437 priv->rx_head = desc;
441 spin_unlock(&priv->rx_lock); 438 spin_unlock(&priv->rx_lock);
442 *budget -= received;
443 dev->quota -= received;
444 if (unlikely(netif_msg_rx_status(priv))) 439 if (unlikely(netif_msg_rx_status(priv)))
445 printk(KERN_DEBUG "%s: poll processed %d packets\n", dev->name, 440 printk(KERN_DEBUG "%s: poll processed %d packets\n",
446 received); 441 priv->dev->name, received);
447 if (desc->dataflags & CPMAC_OWN) { 442 if (desc->dataflags & CPMAC_OWN) {
448 netif_rx_complete(dev); 443 netif_rx_complete(priv->dev, napi);
449 cpmac_write(priv->regs, CPMAC_RX_PTR(0), (u32)desc->mapping); 444 cpmac_write(priv->regs, CPMAC_RX_PTR(0), (u32)desc->mapping);
450 cpmac_write(priv->regs, CPMAC_RX_INT_ENABLE, 1); 445 cpmac_write(priv->regs, CPMAC_RX_INT_ENABLE, 1);
451 return 0; 446 return 0;
@@ -655,6 +650,7 @@ static void cpmac_hw_error(struct work_struct *work)
655 spin_unlock(&priv->rx_lock); 650 spin_unlock(&priv->rx_lock);
656 cpmac_clear_tx(priv->dev); 651 cpmac_clear_tx(priv->dev);
657 cpmac_hw_start(priv->dev); 652 cpmac_hw_start(priv->dev);
653 napi_enable(&priv->napi);
658 netif_start_queue(priv->dev); 654 netif_start_queue(priv->dev);
659} 655}
660 656
@@ -681,8 +677,10 @@ static irqreturn_t cpmac_irq(int irq, void *dev_id)
681 677
682 if (status & MAC_INT_RX) { 678 if (status & MAC_INT_RX) {
683 queue = (status >> 8) & 7; 679 queue = (status >> 8) & 7;
684 netif_rx_schedule(dev); 680 if (netif_rx_schedule_prep(dev, &priv->napi)) {
685 cpmac_write(priv->regs, CPMAC_RX_INT_CLEAR, 1 << queue); 681 cpmac_write(priv->regs, CPMAC_RX_INT_CLEAR, 1 << queue);
682 __netif_rx_schedule(dev, &priv->napi);
683 }
686 } 684 }
687 685
688 cpmac_write(priv->regs, CPMAC_MAC_EOI_VECTOR, 0); 686 cpmac_write(priv->regs, CPMAC_MAC_EOI_VECTOR, 0);
@@ -692,6 +690,7 @@ static irqreturn_t cpmac_irq(int irq, void *dev_id)
692 printk(KERN_ERR "%s: hw error, resetting...\n", 690 printk(KERN_ERR "%s: hw error, resetting...\n",
693 dev->name); 691 dev->name);
694 netif_stop_queue(dev); 692 netif_stop_queue(dev);
693 napi_disable(&priv->napi);
695 cpmac_hw_stop(dev); 694 cpmac_hw_stop(dev);
696 schedule_work(&priv->reset_work); 695 schedule_work(&priv->reset_work);
697 if (unlikely(netif_msg_hw(priv))) 696 if (unlikely(netif_msg_hw(priv)))
@@ -849,6 +848,15 @@ static void cpmac_adjust_link(struct net_device *dev)
849 spin_unlock(&priv->lock); 848 spin_unlock(&priv->lock);
850} 849}
851 850
851static int cpmac_link_update(struct net_device *dev,
852 struct fixed_phy_status *status)
853{
854 status->link = 1;
855 status->speed = 100;
856 status->duplex = 1;
857 return 0;
858}
859
852static int cpmac_open(struct net_device *dev) 860static int cpmac_open(struct net_device *dev)
853{ 861{
854 int i, size, res; 862 int i, size, res;
@@ -857,15 +865,6 @@ static int cpmac_open(struct net_device *dev)
857 struct cpmac_desc *desc; 865 struct cpmac_desc *desc;
858 struct sk_buff *skb; 866 struct sk_buff *skb;
859 867
860 priv->phy = phy_connect(dev, priv->phy_name, &cpmac_adjust_link,
861 0, PHY_INTERFACE_MODE_MII);
862 if (IS_ERR(priv->phy)) {
863 if (netif_msg_drv(priv))
864 printk(KERN_ERR "%s: Could not attach to PHY\n",
865 dev->name);
866 return PTR_ERR(priv->phy);
867 }
868
869 mem = platform_get_resource_byname(priv->pdev, IORESOURCE_MEM, "regs"); 868 mem = platform_get_resource_byname(priv->pdev, IORESOURCE_MEM, "regs");
870 if (!request_mem_region(mem->start, mem->end - mem->start, dev->name)) { 869 if (!request_mem_region(mem->start, mem->end - mem->start, dev->name)) {
871 if (netif_msg_drv(priv)) 870 if (netif_msg_drv(priv))
@@ -927,6 +926,7 @@ static int cpmac_open(struct net_device *dev)
927 INIT_WORK(&priv->reset_work, cpmac_hw_error); 926 INIT_WORK(&priv->reset_work, cpmac_hw_error);
928 cpmac_hw_start(dev); 927 cpmac_hw_start(dev);
929 928
929 napi_enable(&priv->napi);
930 priv->phy->state = PHY_CHANGELINK; 930 priv->phy->state = PHY_CHANGELINK;
931 phy_start(priv->phy); 931 phy_start(priv->phy);
932 932
@@ -951,8 +951,6 @@ fail_remap:
951 release_mem_region(mem->start, mem->end - mem->start); 951 release_mem_region(mem->start, mem->end - mem->start);
952 952
953fail_reserve: 953fail_reserve:
954 phy_disconnect(priv->phy);
955
956 return res; 954 return res;
957} 955}
958 956
@@ -965,9 +963,8 @@ static int cpmac_stop(struct net_device *dev)
965 netif_stop_queue(dev); 963 netif_stop_queue(dev);
966 964
967 cancel_work_sync(&priv->reset_work); 965 cancel_work_sync(&priv->reset_work);
966 napi_disable(&priv->napi);
968 phy_stop(priv->phy); 967 phy_stop(priv->phy);
969 phy_disconnect(priv->phy);
970 priv->phy = NULL;
971 968
972 cpmac_hw_stop(dev); 969 cpmac_hw_stop(dev);
973 970
@@ -1001,11 +998,13 @@ static int external_switch;
1001 998
1002static int __devinit cpmac_probe(struct platform_device *pdev) 999static int __devinit cpmac_probe(struct platform_device *pdev)
1003{ 1000{
1004 int rc, phy_id; 1001 int rc, phy_id, i;
1005 struct resource *mem; 1002 struct resource *mem;
1006 struct cpmac_priv *priv; 1003 struct cpmac_priv *priv;
1007 struct net_device *dev; 1004 struct net_device *dev;
1008 struct plat_cpmac_data *pdata; 1005 struct plat_cpmac_data *pdata;
1006 struct fixed_info *fixed_phy;
1007 DECLARE_MAC_BUF(mac);
1009 1008
1010 pdata = pdev->dev.platform_data; 1009 pdata = pdev->dev.platform_data;
1011 1010
@@ -1053,21 +1052,51 @@ static int __devinit cpmac_probe(struct platform_device *pdev)
1053 dev->set_multicast_list = cpmac_set_multicast_list; 1052 dev->set_multicast_list = cpmac_set_multicast_list;
1054 dev->tx_timeout = cpmac_tx_timeout; 1053 dev->tx_timeout = cpmac_tx_timeout;
1055 dev->ethtool_ops = &cpmac_ethtool_ops; 1054 dev->ethtool_ops = &cpmac_ethtool_ops;
1056 dev->poll = cpmac_poll;
1057 dev->weight = 64;
1058 dev->features |= NETIF_F_MULTI_QUEUE; 1055 dev->features |= NETIF_F_MULTI_QUEUE;
1059 1056
1057 netif_napi_add(dev, &priv->napi, cpmac_poll, 64);
1058
1060 spin_lock_init(&priv->lock); 1059 spin_lock_init(&priv->lock);
1061 spin_lock_init(&priv->rx_lock); 1060 spin_lock_init(&priv->rx_lock);
1062 priv->dev = dev; 1061 priv->dev = dev;
1063 priv->ring_size = 64; 1062 priv->ring_size = 64;
1064 priv->msg_enable = netif_msg_init(debug_level, 0xff); 1063 priv->msg_enable = netif_msg_init(debug_level, 0xff);
1065 memcpy(dev->dev_addr, pdata->dev_addr, sizeof(dev->dev_addr)); 1064 memcpy(dev->dev_addr, pdata->dev_addr, sizeof(dev->dev_addr));
1065
1066 if (phy_id == 31) { 1066 if (phy_id == 31) {
1067 snprintf(priv->phy_name, BUS_ID_SIZE, PHY_ID_FMT, 1067 snprintf(priv->phy_name, BUS_ID_SIZE, PHY_ID_FMT, cpmac_mii.id,
1068 cpmac_mii.id, phy_id); 1068 phy_id);
1069 } else 1069 } else {
1070 snprintf(priv->phy_name, BUS_ID_SIZE, "fixed@%d:%d", 100, 1); 1070 /* Let's try to get a free fixed phy... */
1071 for (i = 0; i < MAX_PHY_AMNT; i++) {
1072 fixed_phy = fixed_mdio_get_phydev(i);
1073 if (!fixed_phy)
1074 continue;
1075 if (!fixed_phy->phydev->attached_dev) {
1076 strncpy(priv->phy_name,
1077 fixed_phy->phydev->dev.bus_id,
1078 BUS_ID_SIZE);
1079 fixed_mdio_set_link_update(fixed_phy->phydev,
1080 &cpmac_link_update);
1081 goto phy_found;
1082 }
1083 }
1084 if (netif_msg_drv(priv))
1085 printk(KERN_ERR "%s: Could not find fixed PHY\n",
1086 dev->name);
1087 rc = -ENODEV;
1088 goto fail;
1089 }
1090
1091phy_found:
1092 priv->phy = phy_connect(dev, priv->phy_name, &cpmac_adjust_link, 0,
1093 PHY_INTERFACE_MODE_MII);
1094 if (IS_ERR(priv->phy)) {
1095 if (netif_msg_drv(priv))
1096 printk(KERN_ERR "%s: Could not attach to PHY\n",
1097 dev->name);
1098 return PTR_ERR(priv->phy);
1099 }
1071 1100
1072 if ((rc = register_netdev(dev))) { 1101 if ((rc = register_netdev(dev))) {
1073 printk(KERN_ERR "cpmac: error %i registering device %s\n", rc, 1102 printk(KERN_ERR "cpmac: error %i registering device %s\n", rc,
@@ -1077,9 +1106,9 @@ static int __devinit cpmac_probe(struct platform_device *pdev)
1077 1106
1078 if (netif_msg_probe(priv)) { 1107 if (netif_msg_probe(priv)) {
1079 printk(KERN_INFO 1108 printk(KERN_INFO
1080 "cpmac: device %s (regs: %p, irq: %d, phy: %s, mac: " 1109 "cpmac: device %s (regs: %p, irq: %d, phy: %s, "
1081 MAC_FMT ")\n", dev->name, (void *)mem->start, dev->irq, 1110 "mac: %s)\n", dev->name, (void *)mem->start, dev->irq,
1082 priv->phy_name, MAC_ARG(dev->dev_addr)); 1111 priv->phy_name, print_mac(mac, dev->dev_addr));
1083 } 1112 }
1084 return 0; 1113 return 0;
1085 1114
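The cpmac conversion follows the 2.6.24 NAPI rework: the per-device dev->poll and dev->weight pair becomes a struct napi_struct embedded in the private data and registered with netif_napi_add(), and the poll callback takes the napi pointer plus a budget instead of mutating dev->quota. A condensed sketch of the converted shape (driver names are illustrative):

	#include <linux/netdevice.h>

	struct ex_priv {
		struct napi_struct napi;
		struct net_device *dev;
	};

	/* New-style poll: consume at most "budget" packets; calling
	 * netif_rx_complete() and returning less than budget tells the
	 * core the ring is drained and interrupts may be re-enabled. */
	static int ex_poll(struct napi_struct *napi, int budget)
	{
		struct ex_priv *priv = container_of(napi, struct ex_priv, napi);
		int received = 0;

		/* ... netif_receive_skb() up to budget packets ... */
		if (received < budget)
			netif_rx_complete(priv->dev, napi);
		return received;
	}

	/* At probe time, this replaces dev->poll = ...; dev->weight = 64; */
	static void ex_setup_napi(struct net_device *dev, struct ex_priv *priv)
	{
		netif_napi_add(dev, &priv->napi, ex_poll, 64);
	}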
diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h
index b557bb44a36f..4b4b74e47a67 100644
--- a/drivers/net/ehea/ehea.h
+++ b/drivers/net/ehea/ehea.h
@@ -40,7 +40,7 @@
40#include <asm/io.h> 40#include <asm/io.h>
41 41
42#define DRV_NAME "ehea" 42#define DRV_NAME "ehea"
43#define DRV_VERSION "EHEA_0078" 43#define DRV_VERSION "EHEA_0079"
44 44
45/* eHEA capability flags */ 45/* eHEA capability flags */
46#define DLPAR_PORT_ADD_REM 1 46#define DLPAR_PORT_ADD_REM 1
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index 2809c99906e0..0a7e78925540 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -2329,7 +2329,7 @@ static void port_napi_disable(struct ehea_port *port)
2329{ 2329{
2330 int i; 2330 int i;
2331 2331
2332 for (i = 0; i < port->num_def_qps; i++) 2332 for (i = 0; i < port->num_def_qps + port->num_add_tx_qps; i++)
2333 napi_disable(&port->port_res[i].napi); 2333 napi_disable(&port->port_res[i].napi);
2334} 2334}
2335 2335
@@ -2337,7 +2337,7 @@ static void port_napi_enable(struct ehea_port *port)
2337{ 2337{
2338 int i; 2338 int i;
2339 2339
2340 for (i = 0; i < port->num_def_qps; i++) 2340 for (i = 0; i < port->num_def_qps + port->num_add_tx_qps; i++)
2341 napi_enable(&port->port_res[i].napi); 2341 napi_enable(&port->port_res[i].napi);
2342} 2342}
2343 2343
@@ -2373,8 +2373,6 @@ static int ehea_down(struct net_device *dev)
2373 ehea_drop_multicast_list(dev); 2373 ehea_drop_multicast_list(dev);
2374 ehea_free_interrupts(dev); 2374 ehea_free_interrupts(dev);
2375 2375
2376 port_napi_disable(port);
2377
2378 port->state = EHEA_PORT_DOWN; 2376 port->state = EHEA_PORT_DOWN;
2379 2377
2380 ret = ehea_clean_all_portres(port); 2378 ret = ehea_clean_all_portres(port);
@@ -2396,6 +2394,7 @@ static int ehea_stop(struct net_device *dev)
2396 flush_scheduled_work(); 2394 flush_scheduled_work();
2397 down(&port->port_lock); 2395 down(&port->port_lock);
2398 netif_stop_queue(dev); 2396 netif_stop_queue(dev);
2397 port_napi_disable(port);
2399 ret = ehea_down(dev); 2398 ret = ehea_down(dev);
2400 up(&port->port_lock); 2399 up(&port->port_lock);
2401 return ret; 2400 return ret;
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index 70ddf1acfd88..92ce2e38f0d5 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -5597,6 +5597,22 @@ static struct pci_device_id pci_tbl[] = {
5597 PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_31), 5597 PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_31),
5598 .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_HIGH_DMA|DEV_HAS_POWER_CNTRL|DEV_HAS_MSI|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT|DEV_HAS_CORRECT_MACADDR, 5598 .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_HIGH_DMA|DEV_HAS_POWER_CNTRL|DEV_HAS_MSI|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT|DEV_HAS_CORRECT_MACADDR,
5599 }, 5599 },
5600 { /* MCP77 Ethernet Controller */
5601 PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_32),
5602 .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT,
5603 },
5604 { /* MCP77 Ethernet Controller */
5605 PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_33),
5606 .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT,
5607 },
5608 { /* MCP77 Ethernet Controller */
5609 PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_34),
5610 .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT,
5611 },
5612 { /* MCP77 Ethernet Controller */
5613 PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_35),
5614 .driver_data = DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER|DEV_HAS_CHECKSUM|DEV_HAS_HIGH_DMA|DEV_HAS_MSI|DEV_HAS_POWER_CNTRL|DEV_HAS_PAUSEFRAME_TX|DEV_HAS_STATISTICS_V2|DEV_HAS_TEST_EXTENDED|DEV_HAS_MGMT_UNIT,
5615 },
5600 {0,}, 5616 {0,},
5601}; 5617};
5602 5618
diff --git a/drivers/net/ipg.c b/drivers/net/ipg.c
index 68887235d7e9..dbd23bb65d1e 100644
--- a/drivers/net/ipg.c
+++ b/drivers/net/ipg.c
@@ -55,6 +55,26 @@ MODULE_DESCRIPTION("IC Plus IP1000 Gigabit Ethernet Adapter Linux Driver "
55 DrvVer); 55 DrvVer);
56MODULE_LICENSE("GPL"); 56MODULE_LICENSE("GPL");
57 57
58//variable record -- index by leading revision/length
59//Revision/Length(=N*4), Address1, Data1, Address2, Data2,...,AddressN,DataN
60static unsigned short DefaultPhyParam[] = {
61 // 11/12/03 IP1000A v1-3 rev=0x40
62 /*--------------------------------------------------------------------------
63 (0x4000|(15*4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 22, 0x85bd, 24, 0xfff2,
64 27, 0x0c10, 28, 0x0c10, 29, 0x2c10, 31, 0x0003, 23, 0x92f6,
65 31, 0x0000, 23, 0x003d, 30, 0x00de, 20, 0x20e7, 9, 0x0700,
66 --------------------------------------------------------------------------*/
67 // 12/17/03 IP1000A v1-4 rev=0x40
68 (0x4000 | (07 * 4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 27, 0xeb8e, 31,
69 0x0000,
70 30, 0x005e, 9, 0x0700,
71 // 01/09/04 IP1000A v1-5 rev=0x41
72 (0x4100 | (07 * 4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 27, 0xeb8e, 31,
73 0x0000,
74 30, 0x005e, 9, 0x0700,
75 0x0000
76};
77
58static const char *ipg_brand_name[] = { 78static const char *ipg_brand_name[] = {
59 "IC PLUS IP1000 1000/100/10 based NIC", 79 "IC PLUS IP1000 1000/100/10 based NIC",
60 "Sundance Technology ST2021 based NIC", 80 "Sundance Technology ST2021 based NIC",
@@ -990,7 +1010,7 @@ static void ipg_nic_txcleanup(struct net_device *dev)
990} 1010}
991 1011
992/* Provides statistical information about the IPG NIC. */ 1012/* Provides statistical information about the IPG NIC. */
993struct net_device_stats *ipg_nic_get_stats(struct net_device *dev) 1013static struct net_device_stats *ipg_nic_get_stats(struct net_device *dev)
994{ 1014{
995 struct ipg_nic_private *sp = netdev_priv(dev); 1015 struct ipg_nic_private *sp = netdev_priv(dev);
996 void __iomem *ioaddr = sp->ioaddr; 1016 void __iomem *ioaddr = sp->ioaddr;
diff --git a/drivers/net/ipg.h b/drivers/net/ipg.h
index e418b9035cac..d5d092c9d0af 100644
--- a/drivers/net/ipg.h
+++ b/drivers/net/ipg.h
@@ -833,24 +833,4 @@ struct ipg_nic_private {
833 struct delayed_work task; 833 struct delayed_work task;
834}; 834};
835 835
836//variable record -- index by leading revision/length
837//Revision/Length(=N*4), Address1, Data1, Address2, Data2,...,AddressN,DataN
838unsigned short DefaultPhyParam[] = {
839 // 11/12/03 IP1000A v1-3 rev=0x40
840 /*--------------------------------------------------------------------------
841 (0x4000|(15*4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 22, 0x85bd, 24, 0xfff2,
842 27, 0x0c10, 28, 0x0c10, 29, 0x2c10, 31, 0x0003, 23, 0x92f6,
843 31, 0x0000, 23, 0x003d, 30, 0x00de, 20, 0x20e7, 9, 0x0700,
844 --------------------------------------------------------------------------*/
845 // 12/17/03 IP1000A v1-4 rev=0x40
846 (0x4000 | (07 * 4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 27, 0xeb8e, 31,
847 0x0000,
848 30, 0x005e, 9, 0x0700,
849 // 01/09/04 IP1000A v1-5 rev=0x41
850 (0x4100 | (07 * 4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 27, 0xeb8e, 31,
851 0x0000,
852 30, 0x005e, 9, 0x0700,
853 0x0000
854};
855
856#endif /* __LINUX_IPG_H */ 836#endif /* __LINUX_IPG_H */
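Moving DefaultPhyParam out of ipg.h and into ipg.c as static fixes a classic header pitfall: an array defined (not merely declared) in a header is emitted once per including translation unit, so a second includer produces a duplicate-symbol link failure. In miniature (file names hypothetical):

	/* bad.h -- every .c file including this defines the symbol:
	 *	unsigned short tbl[] = { 1, 2, 3 };
	 * two includers give "multiple definition of `tbl'" at link time.
	 * good.c -- one definition, internal linkage: */
	static unsigned short tbl[] = { 1, 2, 3 };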
diff --git a/drivers/net/mlx4/icm.c b/drivers/net/mlx4/icm.c
index 887633b207d9..2a5bef6388fe 100644
--- a/drivers/net/mlx4/icm.c
+++ b/drivers/net/mlx4/icm.c
@@ -101,9 +101,7 @@ static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_ma
101 if (!page) 101 if (!page)
102 return -ENOMEM; 102 return -ENOMEM;
103 103
104 sg_set_page(mem, page); 104 sg_set_page(mem, page, PAGE_SIZE << order, 0);
105 mem->length = PAGE_SIZE << order;
106 mem->offset = 0;
107 return 0; 105 return 0;
108} 106}
109 107
diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c
index 953117152bbd..87cde062fd63 100644
--- a/drivers/net/natsemi.c
+++ b/drivers/net/natsemi.c
@@ -864,6 +864,7 @@ static int __devinit natsemi_probe1 (struct pci_dev *pdev,
864 864
865 np = netdev_priv(dev); 865 np = netdev_priv(dev);
866 netif_napi_add(dev, &np->napi, natsemi_poll, 64); 866 netif_napi_add(dev, &np->napi, natsemi_poll, 64);
867 np->dev = dev;
867 868
868 np->pci_dev = pdev; 869 np->pci_dev = pdev;
869 pci_set_drvdata(pdev, dev); 870 pci_set_drvdata(pdev, dev);
diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c
index cd991a0f75bb..1ebe3259be0d 100644
--- a/drivers/net/usb/rndis_host.c
+++ b/drivers/net/usb/rndis_host.c
@@ -512,11 +512,19 @@ static int rndis_bind(struct usbnet *dev, struct usb_interface *intf)
512 } 512 }
513 tmp = le32_to_cpu(u.init_c->max_transfer_size); 513 tmp = le32_to_cpu(u.init_c->max_transfer_size);
514 if (tmp < dev->hard_mtu) { 514 if (tmp < dev->hard_mtu) {
515 dev_err(&intf->dev, 515 if (tmp <= net->hard_header_len) {
516 "dev can't take %u byte packets (max %u)\n", 516 dev_err(&intf->dev,
517 dev->hard_mtu, tmp); 517 "dev can't take %u byte packets (max %u)\n",
518 retval = -EINVAL; 518 dev->hard_mtu, tmp);
519 goto fail_and_release; 519 retval = -EINVAL;
520 goto fail_and_release;
521 }
522 dev->hard_mtu = tmp;
523 net->mtu = dev->hard_mtu - net->hard_header_len;
524 dev_warn(&intf->dev,
525 "dev can't take %u byte packets (max %u), "
526 "adjusting MTU to %u\n",
527 dev->hard_mtu, tmp, net->mtu);
520 } 528 }
521 529
522 /* REVISIT: peripheral "alignment" request is ignored ... */ 530 /* REVISIT: peripheral "alignment" request is ignored ... */
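The rndis_bind() change above turns a hard probe failure into an MTU adjustment. A worked example with made-up numbers, assuming Ethernet framing:

	/* hard_header_len = 14, requested hard_mtu = 1514 (1500 + 14).
	 * Device answers max_transfer_size = 1400:
	 *	before: bind fails with -EINVAL
	 *	after:  dev->hard_mtu = 1400; net->mtu = 1400 - 14 = 1386
	 * Only a max_transfer_size <= hard_header_len still aborts. */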
diff --git a/drivers/s390/scsi/zfcp_aux.c b/drivers/s390/scsi/zfcp_aux.c
index fd5d0c1570df..00118499018b 100644
--- a/drivers/s390/scsi/zfcp_aux.c
+++ b/drivers/s390/scsi/zfcp_aux.c
@@ -562,8 +562,6 @@ zfcp_sg_list_alloc(struct zfcp_sg_list *sg_list, size_t size)
562 sg_init_table(sg_list->sg, sg_list->count); 562 sg_init_table(sg_list->sg, sg_list->count);
563 563
564 for (i = 0, sg = sg_list->sg; i < sg_list->count; i++, sg++) { 564 for (i = 0, sg = sg_list->sg; i < sg_list->count; i++, sg++) {
565 sg->length = min(size, PAGE_SIZE);
566 sg->offset = 0;
567 address = (void *) get_zeroed_page(GFP_KERNEL); 565 address = (void *) get_zeroed_page(GFP_KERNEL);
568 if (address == NULL) { 566 if (address == NULL) {
569 sg_list->count = i; 567 sg_list->count = i;
@@ -571,7 +569,7 @@ zfcp_sg_list_alloc(struct zfcp_sg_list *sg_list, size_t size)
571 retval = -ENOMEM; 569 retval = -ENOMEM;
572 goto out; 570 goto out;
573 } 571 }
574 zfcp_address_to_sg(address, sg); 572 zfcp_address_to_sg(address, sg, min(size, PAGE_SIZE));
575 size -= sg->length; 573 size -= sg->length;
576 } 574 }
577 575
@@ -1518,13 +1516,13 @@ zfcp_gid_pn_buffers_alloc(struct zfcp_gid_pn_data **gid_pn, mempool_t *pool)
1518 return -ENOMEM; 1516 return -ENOMEM;
1519 1517
1520 memset(data, 0, sizeof(*data)); 1518 memset(data, 0, sizeof(*data));
1519 sg_init_table(&data->req , 1);
1520 sg_init_table(&data->resp , 1);
1521 data->ct.req = &data->req; 1521 data->ct.req = &data->req;
1522 data->ct.resp = &data->resp; 1522 data->ct.resp = &data->resp;
1523 data->ct.req_count = data->ct.resp_count = 1; 1523 data->ct.req_count = data->ct.resp_count = 1;
1524 zfcp_address_to_sg(&data->ct_iu_req, &data->req); 1524 zfcp_address_to_sg(&data->ct_iu_req, &data->req, sizeof(struct ct_iu_gid_pn_req));
1525 zfcp_address_to_sg(&data->ct_iu_resp, &data->resp); 1525 zfcp_address_to_sg(&data->ct_iu_resp, &data->resp, sizeof(struct ct_iu_gid_pn_resp));
1526 data->req.length = sizeof(struct ct_iu_gid_pn_req);
1527 data->resp.length = sizeof(struct ct_iu_gid_pn_resp);
1528 1526
1529 *gid_pn = data; 1527 *gid_pn = data;
1530 return 0; 1528 return 0;
diff --git a/drivers/s390/scsi/zfcp_def.h b/drivers/s390/scsi/zfcp_def.h
index 326e7ee232cb..0754542978b6 100644
--- a/drivers/s390/scsi/zfcp_def.h
+++ b/drivers/s390/scsi/zfcp_def.h
@@ -74,8 +74,7 @@ zfcp_sg_to_address(struct scatterlist *list)
74static inline void 74static inline void
75zfcp_address_to_sg(void *address, struct scatterlist *list) 75zfcp_address_to_sg(void *address, struct scatterlist *list, unsigned int size)
76{ 76{
77 sg_set_page(list, virt_to_page(address)); 77 sg_set_buf(list, address, size);
78 list->offset = ((unsigned long) address) & (PAGE_SIZE - 1);
79} 78}
80 79
81#define REQUEST_LIST_SIZE 128 80#define REQUEST_LIST_SIZE 128
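With the size passed in, zfcp_address_to_sg() reduces to sg_set_buf(), which derives the page and in-page offset from a kernel virtual address by itself. A sketch of the helper's effect, valid for lowmem buffers:

	#include <linux/scatterlist.h>

	/* sg_set_buf(sg, buf, len) behaves like:
	 *	sg_set_page(sg, virt_to_page(buf), len, offset_in_page(buf));
	 * so callers stop hand-computing the page/offset pair. */
	static void map_lowmem_buf(struct scatterlist *sg, void *buf,
				   unsigned int len)
	{
		sg_init_table(sg, 1);
		sg_set_buf(sg, buf, len);
	}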
diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index 9438d0b28799..5552b755c08a 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -322,9 +322,9 @@ zfcp_erp_adisc(struct zfcp_port *port)
322 if (address == NULL) 322 if (address == NULL)
323 goto nomem; 323 goto nomem;
324 324
325 zfcp_address_to_sg(address, send_els->req); 325 zfcp_address_to_sg(address, send_els->req, sizeof(struct zfcp_ls_adisc));
326 address += PAGE_SIZE >> 1; 326 address += PAGE_SIZE >> 1;
327 zfcp_address_to_sg(address, send_els->resp); 327 zfcp_address_to_sg(address, send_els->resp, sizeof(struct zfcp_ls_adisc_acc));
328 send_els->req_count = send_els->resp_count = 1; 328 send_els->req_count = send_els->resp_count = 1;
329 329
330 send_els->adapter = adapter; 330 send_els->adapter = adapter;
@@ -336,9 +336,6 @@ zfcp_erp_adisc(struct zfcp_port *port)
336 adisc = zfcp_sg_to_address(send_els->req); 336 adisc = zfcp_sg_to_address(send_els->req);
337 send_els->ls_code = adisc->code = ZFCP_LS_ADISC; 337 send_els->ls_code = adisc->code = ZFCP_LS_ADISC;
338 338
339 send_els->req->length = sizeof(struct zfcp_ls_adisc);
340 send_els->resp->length = sizeof(struct zfcp_ls_adisc_acc);
341
342 /* acc. to FC-FS, hard_nport_id in ADISC should not be set for ports 339 /* acc. to FC-FS, hard_nport_id in ADISC should not be set for ports
343 without FC-AL-2 capability, so we don't set it */ 340 without FC-AL-2 capability, so we don't set it */
344 adisc->wwpn = fc_host_port_name(adapter->scsi_host); 341 adisc->wwpn = fc_host_port_name(adapter->scsi_host);
diff --git a/drivers/scsi/atari_NCR5380.c b/drivers/scsi/atari_NCR5380.c
index d1780980fb20..a9680b5e8ac6 100644
--- a/drivers/scsi/atari_NCR5380.c
+++ b/drivers/scsi/atari_NCR5380.c
@@ -477,10 +477,9 @@ static void merge_contiguous_buffers(Scsi_Cmnd *cmd)
477 477
478 for (endaddr = virt_to_phys(cmd->SCp.ptr + cmd->SCp.this_residual - 1) + 1; 478 for (endaddr = virt_to_phys(cmd->SCp.ptr + cmd->SCp.this_residual - 1) + 1;
479 cmd->SCp.buffers_residual && 479 cmd->SCp.buffers_residual &&
480 virt_to_phys(page_address(cmd->SCp.buffer[1].page) + 480 virt_to_phys(sg_virt(&cmd->SCp.buffer[1])) == endaddr;) {
481 cmd->SCp.buffer[1].offset) == endaddr;) {
482 MER_PRINTK("VTOP(%p) == %08lx -> merging\n", 481 MER_PRINTK("VTOP(%p) == %08lx -> merging\n",
483 page_address(cmd->SCp.buffer[1].page), endaddr); 482 page_address(sg_page(&cmd->SCp.buffer[1])), endaddr);
484#if (NDEBUG & NDEBUG_MERGING) 483#if (NDEBUG & NDEBUG_MERGING)
485 ++cnt; 484 ++cnt;
486#endif 485#endif
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index 439b97a6a269..0841df01bc19 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -2890,7 +2890,7 @@ static struct ipr_sglist *ipr_alloc_ucode_buffer(int buf_len)
2890 return NULL; 2890 return NULL;
2891 } 2891 }
2892 2892
2893 sg_set_page(&scatterlist[i], page); 2893 sg_set_page(&scatterlist[i], page, 0, 0);
2894 } 2894 }
2895 2895
2896 return sglist; 2896 return sglist;
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 6ce4109efdf3..097a136398cb 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -79,9 +79,7 @@ static inline void
79iscsi_buf_init_sg(struct iscsi_buf *ibuf, struct scatterlist *sg) 79iscsi_buf_init_sg(struct iscsi_buf *ibuf, struct scatterlist *sg)
80{ 80{
81 sg_init_table(&ibuf->sg, 1); 81 sg_init_table(&ibuf->sg, 1);
82 sg_set_page(&ibuf->sg, sg_page(sg)); 82 sg_set_page(&ibuf->sg, sg_page(sg), sg->length, sg->offset);
83 ibuf->sg.offset = sg->offset;
84 ibuf->sg.length = sg->length;
85 /* 83 /*
86 * Fastpath: sg element fits into single page 84 * Fastpath: sg element fits into single page
87 */ 85 */
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index 1c5c4b68f20f..4652ad22516b 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -5256,8 +5256,7 @@ static int enlarge_buffer(struct osst_buffer *STbuffer, int need_dma)
5256 5256
5257 STbuffer->sg[0].offset = 0; 5257 STbuffer->sg[0].offset = 0;
5258 if (page != NULL) { 5258 if (page != NULL) {
5259 sg_set_page(&STbuffer->sg[0], page); 5259 sg_set_page(&STbuffer->sg[0], page, b_size, 0);
5260 STbuffer->sg[0].length = b_size;
5261 STbuffer->b_data = page_address(page); 5260 STbuffer->b_data = page_address(page);
5262 break; 5261 break;
5263 } 5262 }
@@ -5285,8 +5284,7 @@ static int enlarge_buffer(struct osst_buffer *STbuffer, int need_dma)
5285 normalize_buffer(STbuffer); 5284 normalize_buffer(STbuffer);
5286 return 0; 5285 return 0;
5287 } 5286 }
5288 sg_set_page(&STbuffer->sg[segs], page); 5287 sg_set_page(&STbuffer->sg[segs], page, (OS_FRAME_SIZE - got <= PAGE_SIZE / 2) ? (OS_FRAME_SIZE - got) : b_size, 0);
5289 STbuffer->sg[segs].length = (OS_FRAME_SIZE - got <= PAGE_SIZE / 2) ? (OS_FRAME_SIZE - got) : b_size;
5290 got += STbuffer->sg[segs].length; 5288 got += STbuffer->sg[segs].length;
5291 STbuffer->buffer_size = got; 5289 STbuffer->buffer_size = got;
5292 STbuffer->sg_segs = ++segs; 5290 STbuffer->sg_segs = ++segs;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index cc1971002846..b5fa4f091387 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1717,16 +1717,12 @@ st_map_user_pages(struct scatterlist *sgl, const unsigned int max_pages,
1717 goto out_unlock; */ 1717 goto out_unlock; */
1718 } 1718 }
1719 1719
1720 sg_set_page(sgl, pages[0]); 1720 sg_set_page(sgl, pages[0], 0, uaddr & ~PAGE_MASK);
1721 sgl[0].offset = uaddr & ~PAGE_MASK;
1722 if (nr_pages > 1) { 1721 if (nr_pages > 1) {
1723 sgl[0].length = PAGE_SIZE - sgl[0].offset; 1722 sgl[0].length = PAGE_SIZE - sgl[0].offset;
1724 count -= sgl[0].length; 1723 count -= sgl[0].length;
1725 for (i=1; i < nr_pages ; i++) { 1724 for (i=1; i < nr_pages ; i++)
1726 sg_set_page(&sgl[i], pages[i]); 1725 sg_set_page(&sgl[i], pages[i], count < PAGE_SIZE ? count : PAGE_SIZE, 0);
1727 sgl[i].length = count < PAGE_SIZE ? count : PAGE_SIZE;
1728 count -= PAGE_SIZE;
1729 }
1730 } 1726 }
1731 else { 1727 else {
1732 sgl[0].length = count; 1728 sgl[0].length = count;
@@ -1854,8 +1850,7 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size)
1854 scatter_elem_sz_prev = ret_sz; 1850 scatter_elem_sz_prev = ret_sz;
1855 } 1851 }
1856 } 1852 }
1857 sg_set_page(sg, p); 1853 sg_set_page(sg, p, (ret_sz > num) ? num : ret_sz, 0);
1858 sg->length = (ret_sz > num) ? num : ret_sz;
1859 1854
1860 SCSI_LOG_TIMEOUT(5, printk("sg_build_indirect: k=%d, num=%d, " 1855 SCSI_LOG_TIMEOUT(5, printk("sg_build_indirect: k=%d, num=%d, "
1861 "ret_sz=%d\n", k, num, ret_sz)); 1856 "ret_sz=%d\n", k, num, ret_sz));
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index ce69b9efc102..98dfd6ea209c 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -3797,13 +3797,11 @@ static void buf_to_sg(struct st_buffer *STbp, unsigned int length)
3797 sg = &(STbp->sg[0]); 3797 sg = &(STbp->sg[0]);
3798 frp = STbp->frp; 3798 frp = STbp->frp;
3799 for (i=count=0; count < length; i++) { 3799 for (i=count=0; count < length; i++) {
3800 sg_set_page(&sg[i], frp[i].page);
3801 if (length - count > frp[i].length) 3800 if (length - count > frp[i].length)
3802 sg[i].length = frp[i].length; 3801 sg_set_page(&sg[i], frp[i].page, frp[i].length, 0);
3803 else 3802 else
3804 sg[i].length = length - count; 3803 sg_set_page(&sg[i], frp[i].page, length - count, 0);
3805 count += sg[i].length; 3804 count += sg[i].length;
3806 sg[i].offset = 0;
3807 } 3805 }
3808 STbp->sg_segs = i; 3806 STbp->sg_segs = i;
3809 STbp->frp_sg_current = length; 3807 STbp->frp_sg_current = length;
@@ -4446,15 +4444,13 @@ static int sgl_map_user_pages(struct scatterlist *sgl, const unsigned int max_pa
4446 } 4444 }
4447 4445
4448 /* Populate the scatter/gather list */ 4446 /* Populate the scatter/gather list */
4449 sg_set_page(&sgl[0], pages[0]); 4447 sg_set_page(&sgl[0], pages[0], 0, uaddr & ~PAGE_MASK);
4450 sgl[0].offset = uaddr & ~PAGE_MASK;
4451 if (nr_pages > 1) { 4448 if (nr_pages > 1) {
4452 sgl[0].length = PAGE_SIZE - sgl[0].offset; 4449 sgl[0].length = PAGE_SIZE - sgl[0].offset;
4453 count -= sgl[0].length; 4450 count -= sgl[0].length;
4454 for (i=1; i < nr_pages ; i++) { 4451 for (i=1; i < nr_pages ; i++) {
4455 sg_set_page(&sgl[i], pages[i]);; 4452 sg_set_page(&sgl[i], pages[i],
4456 sgl[i].offset = 0; 4453 count < PAGE_SIZE ? count : PAGE_SIZE, 0);
4457 sgl[i].length = count < PAGE_SIZE ? count : PAGE_SIZE;
4458 count -= PAGE_SIZE; 4454 count -= PAGE_SIZE;
4459 } 4455 }
4460 } 4456 }
diff --git a/drivers/scsi/sun3x_esp.c b/drivers/scsi/sun3x_esp.c
index 80fb3f88af2e..1bc41907a038 100644
--- a/drivers/scsi/sun3x_esp.c
+++ b/drivers/scsi/sun3x_esp.c
@@ -332,8 +332,8 @@ static void dma_mmu_get_scsi_sgl (struct NCR_ESP *esp, Scsi_Cmnd *sp)
332 struct scatterlist *sg = sp->SCp.buffer; 332 struct scatterlist *sg = sp->SCp.buffer;
333 333
334 while (sz >= 0) { 334 while (sz >= 0) {
335 sg[sz].dma_address = dvma_map((unsigned long)page_address(sg[sz].page) + 335 sg[sz].dma_address = dvma_map((unsigned long)sg_virt(&sg[sz]),
336 sg[sz].offset, sg[sz].length); 336 sg[sz].length);
337 sz--; 337 sz--;
338 } 338 }
339 sp->SCp.ptr=(char *)((unsigned long)sp->SCp.buffer->dma_address); 339 sp->SCp.ptr=(char *)((unsigned long)sp->SCp.buffer->dma_address);
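sg_virt() here is the new shorthand for the deleted page_address()-plus-offset arithmetic; like the code it replaces, it is only meaningful when the page cannot be highmem. As a sketch:

	#include <linux/scatterlist.h>

	/* Equivalent of the open-coded form this hunk removes:
	 *	page_address(sg_page(sg)) + sg->offset */
	static void *sg_lowmem_address(struct scatterlist *sg)
	{
		return sg_virt(sg);
	}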
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 0a9882edf562..7a472b129997 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -282,10 +282,8 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
282 while (size > 0 && i < sg_size) { 282 while (size > 0 && i < sg_size) {
283 pg = virt_to_page(addr); 283 pg = virt_to_page(addr);
284 offset = offset_in_page(addr); 284 offset = offset_in_page(addr);
285 if (sg) { 285 if (sg)
286 sg_set_page(&sg[i], pg); 286 sg_set_page(&sg[i], pg, 0, offset);
287 sg[i].offset = offset;
288 }
289 remainder_of_page = PAGE_CACHE_SIZE - offset; 287 remainder_of_page = PAGE_CACHE_SIZE - offset;
290 if (size >= remainder_of_page) { 288 if (size >= remainder_of_page) {
291 if (sg) 289 if (sg)
@@ -716,12 +714,8 @@ ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
716 sg_init_table(&src_sg, 1); 714 sg_init_table(&src_sg, 1);
717 sg_init_table(&dst_sg, 1); 715 sg_init_table(&dst_sg, 1);
718 716
719 sg_set_page(&src_sg, src_page); 717 sg_set_page(&src_sg, src_page, size, src_offset);
720 src_sg.offset = src_offset; 718 sg_set_page(&dst_sg, dst_page, size, dst_offset);
721 src_sg.length = size;
722 sg_set_page(&dst_sg, dst_page);
723 dst_sg.offset = dst_offset;
724 dst_sg.length = size;
725 return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv); 719 return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
726} 720}
727 721
@@ -746,14 +740,11 @@ ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
746 struct scatterlist src_sg, dst_sg; 740 struct scatterlist src_sg, dst_sg;
747 741
748 sg_init_table(&src_sg, 1); 742 sg_init_table(&src_sg, 1);
743 sg_set_page(&src_sg, src_page, size, src_offset);
744
749 sg_init_table(&dst_sg, 1); 745 sg_init_table(&dst_sg, 1);
746 sg_set_page(&dst_sg, dst_page, size, dst_offset);
750 747
751 sg_set_page(&src_sg, src_page);
752 src_sg.offset = src_offset;
753 src_sg.length = size;
754 sg_set_page(&dst_sg, dst_page);
755 dst_sg.offset = dst_offset;
756 dst_sg.length = size;
757 return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv); 748 return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
758} 749}
759 750
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 1046cbefbfbf..eb31b73e7d69 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -403,9 +403,9 @@ mb_cache_entry_alloc(struct mb_cache *cache)
403{ 403{
404 struct mb_cache_entry *ce; 404 struct mb_cache_entry *ce;
405 405
406 atomic_inc(&cache->c_entry_count);
407 ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL); 406 ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL);
408 if (ce) { 407 if (ce) {
408 atomic_inc(&cache->c_entry_count);
409 INIT_LIST_HEAD(&ce->e_lru_list); 409 INIT_LIST_HEAD(&ce->e_lru_list);
410 INIT_LIST_HEAD(&ce->e_block_list); 410 INIT_LIST_HEAD(&ce->e_block_list);
411 ce->e_cache = cache; 411 ce->e_cache = cache;
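The mbcache hunk cures a counter leak: c_entry_count was incremented before an allocation that can fail, so every ENOMEM left the count permanently one too high. The restored rule in a generic sketch (names illustrative):

	#include <asm/atomic.h>
	#include <linux/slab.h>

	/* Wrong:  atomic_inc(&count); obj = kmem_cache_alloc(...);
	 *         a failed allocation leaks the increment.
	 * Right:  account only for objects that actually exist. */
	static void *counted_alloc(struct kmem_cache *cachep, atomic_t *count)
	{
		void *obj = kmem_cache_alloc(cachep, GFP_KERNEL);

		if (obj)
			atomic_inc(count);
		return obj;
	}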
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 680c429bfa22..4e57fcf85982 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -171,7 +171,8 @@ static ssize_t proc_sys_read(struct file *filp, char __user *buf,
171 struct dentry *dentry = filp->f_dentry; 171 struct dentry *dentry = filp->f_dentry;
172 struct ctl_table_header *head; 172 struct ctl_table_header *head;
173 struct ctl_table *table; 173 struct ctl_table *table;
174 ssize_t error, res; 174 ssize_t error;
175 size_t res;
175 176
176 table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); 177 table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head);
177 /* Has the sysctl entry disappeared on us? */ 178 /* Has the sysctl entry disappeared on us? */
@@ -209,7 +210,8 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
209 struct dentry *dentry = filp->f_dentry; 210 struct dentry *dentry = filp->f_dentry;
210 struct ctl_table_header *head; 211 struct ctl_table_header *head;
211 struct ctl_table *table; 212 struct ctl_table *table;
212 ssize_t error, res; 213 ssize_t error;
214 size_t res;
213 215
214 table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); 216 table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head);
215 /* Has the sysctl entry disappeared on us? */ 217 /* Has the sysctl entry disappeared on us? */
diff --git a/include/asm-avr32/dma-mapping.h b/include/asm-avr32/dma-mapping.h
index a7131630c057..57dc672bab8e 100644
--- a/include/asm-avr32/dma-mapping.h
+++ b/include/asm-avr32/dma-mapping.h
@@ -3,7 +3,7 @@
3 3
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/device.h> 5#include <linux/device.h>
6#include <asm/scatterlist.h> 6#include <linux/scatterlist.h>
7#include <asm/processor.h> 7#include <asm/processor.h>
8#include <asm/cacheflush.h> 8#include <asm/cacheflush.h>
9#include <asm/io.h> 9#include <asm/io.h>
diff --git a/include/asm-frv/scatterlist.h b/include/asm-frv/scatterlist.h
index 99ba76edc42a..2e7143b5a7ad 100644
--- a/include/asm-frv/scatterlist.h
+++ b/include/asm-frv/scatterlist.h
@@ -16,8 +16,7 @@
16 * 16 *
17 * can be rewritten as 17 * can be rewritten as
18 * 18 *
19 * sg_set_page(virt_to_page(some_ptr)); 19 * sg_set_buf(sg, some_ptr, length);
20 * sg->offset = (unsigned long) some_ptr & ~PAGE_MASK;
21 * 20 *
22 * and that's it. There's no excuse for not highmem enabling YOUR driver. /jens 21 * and that's it. There's no excuse for not highmem enabling YOUR driver. /jens
23 */ 22 */
diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h
index f948491eb56a..9c5092b6aa9f 100644
--- a/include/asm-x86/lguest_hcall.h
+++ b/include/asm-x86/lguest_hcall.h
@@ -18,12 +18,17 @@
18#define LHCALL_LOAD_TLS 16 18#define LHCALL_LOAD_TLS 16
19#define LHCALL_NOTIFY 17 19#define LHCALL_NOTIFY 17
20 20
21#define LGUEST_TRAP_ENTRY 0x1F
22
23#ifndef __ASSEMBLY__
24#include <asm/hw_irq.h>
25
21/*G:031 First, how does our Guest contact the Host to ask for privileged 26/*G:031 First, how does our Guest contact the Host to ask for privileged
22 * operations? There are two ways: the direct way is to make a "hypercall", 27 * operations? There are two ways: the direct way is to make a "hypercall",
23 * to make requests of the Host Itself. 28 * to make requests of the Host Itself.
24 * 29 *
25 * Our hypercall mechanism uses the highest unused trap code (traps 32 and 30 * Our hypercall mechanism uses the highest unused trap code (traps 32 and
26 * above are used by real hardware interrupts). Seventeen hypercalls are 31 * above are used by real hardware interrupts). Fifteen hypercalls are
27 * available: the hypercall number is put in the %eax register, and the 32 * available: the hypercall number is put in the %eax register, and the
28 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return 33 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
29 * value makes sense, it's returned in %eax. 34 * value makes sense, it's returned in %eax.
@@ -31,20 +36,15 @@
31 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 36 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
32 * Host, rather than returning failure. This reflects Winston Churchill's 37 * Host, rather than returning failure. This reflects Winston Churchill's
33 * definition of a gentleman: "someone who is only rude intentionally". */ 38 * definition of a gentleman: "someone who is only rude intentionally". */
34#define LGUEST_TRAP_ENTRY 0x1F
35
36#ifndef __ASSEMBLY__
37#include <asm/hw_irq.h>
38
39static inline unsigned long 39static inline unsigned long
40hcall(unsigned long call, 40hcall(unsigned long call,
41 unsigned long arg1, unsigned long arg2, unsigned long arg3) 41 unsigned long arg1, unsigned long arg2, unsigned long arg3)
42{ 42{
43 /* "int" is the Intel instruction to trigger a trap. */ 43 /* "int" is the Intel instruction to trigger a trap. */
44 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) 44 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
45 /* The call is in %eax (aka "a"), and can be replaced */ 45 /* The call in %eax (aka "a") might be overwritten */
46 : "=a"(call) 46 : "=a"(call)
47 /* The other arguments are in %eax, %edx, %ebx & %ecx */ 47 /* The arguments are in %eax, %edx, %ebx & %ecx */
48 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) 48 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
49 /* "memory" means this might write somewhere in memory. 49 /* "memory" means this might write somewhere in memory.
50 * This isn't true for all calls, but it's safe to tell 50 * This isn't true for all calls, but it's safe to tell
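As a hedged illustration of the convention just described, a made-up Guest-side wrapper around hcall(); LHCALL_NOTIFY is real (defined above), the wrapper name is not, and unused argument slots are simply passed as zero:

	static void example_notify(unsigned long addr)
	{
		/* %eax = LHCALL_NOTIFY, %edx = addr; "int $0x1f" traps to the Host. */
		hcall(LHCALL_NOTIFY, addr, 0, 0);
	}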
diff --git a/include/asm-xtensa/dma-mapping.h b/include/asm-xtensa/dma-mapping.h
index 8bd9d2c02a24..3c7d537dd15d 100644
--- a/include/asm-xtensa/dma-mapping.h
+++ b/include/asm-xtensa/dma-mapping.h
@@ -11,10 +11,10 @@
11#ifndef _XTENSA_DMA_MAPPING_H 11#ifndef _XTENSA_DMA_MAPPING_H
12#define _XTENSA_DMA_MAPPING_H 12#define _XTENSA_DMA_MAPPING_H
13 13
14#include <asm/scatterlist.h>
15#include <asm/cache.h> 14#include <asm/cache.h>
16#include <asm/io.h> 15#include <asm/io.h>
17#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/scatterlist.h>
18 18
19/* 19/*
20 * DMA-consistent mapping functions. 20 * DMA-consistent mapping functions.
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index c811c8b979ac..c68b67b86ef1 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -101,6 +101,12 @@ extern void __chk_io_ptr(const volatile void __iomem *);
101#undef __must_check 101#undef __must_check
102#define __must_check 102#define __must_check
103#endif 103#endif
104#ifndef CONFIG_ENABLE_WARN_DEPRECATED
105#undef __deprecated
106#undef __deprecated_for_modules
107#define __deprecated
108#define __deprecated_for_modules
109#endif
104 110
105/* 111/*
106 * Allow us to avoid 'defined but not used' warnings on functions and data, 112 * Allow us to avoid 'defined but not used' warnings on functions and data,
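A brief sketch of what the new knob controls (the function name here is hypothetical): with CONFIG_ENABLE_WARN_DEPRECATED=y the attribute survives and gcc warns at every call site; with it disabled, __deprecated expands to nothing and the build stays quiet.

	/* Marking an old interface at its declaration: */
	extern int old_api_do_thing(int arg) __deprecated;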
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 268c5a4a2bd4..33d6aaf94447 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -42,15 +42,15 @@ static inline void init_completion(struct completion *x)
42 init_waitqueue_head(&x->wait); 42 init_waitqueue_head(&x->wait);
43} 43}
44 44
45extern void FASTCALL(wait_for_completion(struct completion *)); 45extern void wait_for_completion(struct completion *);
46extern int FASTCALL(wait_for_completion_interruptible(struct completion *x)); 46extern int wait_for_completion_interruptible(struct completion *x);
47extern unsigned long FASTCALL(wait_for_completion_timeout(struct completion *x, 47extern unsigned long wait_for_completion_timeout(struct completion *x,
48 unsigned long timeout)); 48 unsigned long timeout);
49extern unsigned long FASTCALL(wait_for_completion_interruptible_timeout( 49extern unsigned long wait_for_completion_interruptible_timeout(
50 struct completion *x, unsigned long timeout)); 50 struct completion *x, unsigned long timeout);
51 51
52extern void FASTCALL(complete(struct completion *)); 52extern void complete(struct completion *);
53extern void FASTCALL(complete_all(struct completion *)); 53extern void complete_all(struct completion *);
54 54
55#define INIT_COMPLETION(x) ((x).done = 0) 55#define INIT_COMPLETION(x) ((x).done = 0)
56 56
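A minimal usage sketch of the now plain-C completion API (the names below are illustrative): one context blocks in wait_for_completion() until another calls complete().

	static DECLARE_COMPLETION(setup_done);

	static int waiter_thread(void *unused)
	{
		wait_for_completion(&setup_done);	/* sleeps until complete() */
		return 0;
	}

	static void finish_setup(void)
	{
		complete(&setup_done);			/* wakes one waiter */
	}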
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index 8beb29134626..175e63f4a8c0 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -12,8 +12,8 @@
12#define LG_CLOCK_MAX_DELTA ULONG_MAX 12#define LG_CLOCK_MAX_DELTA ULONG_MAX
13 13
 14/*G:032 The second method of communicating with the Host is via "struct 14/*G:032 The second method of communicating with the Host is via "struct
15 * lguest_data". The Guest's very first hypercall is to tell the Host where 15 * lguest_data". Once the Guest's initialization hypercall tells the Host where
16 * this is, and then the Guest and Host both publish information in it. :*/ 16 * this is, the Guest and Host both publish information in it. :*/
17struct lguest_data 17struct lguest_data
18{ 18{
19 /* 512 == enabled (same as eflags in normal hardware). The Guest 19 /* 512 == enabled (same as eflags in normal hardware). The Guest
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index 61e1e3e6b1cc..697104da91f1 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -1,17 +1,7 @@
1#ifndef _ASM_LGUEST_USER 1#ifndef _LINUX_LGUEST_LAUNCHER
2#define _ASM_LGUEST_USER 2#define _LINUX_LGUEST_LAUNCHER
3/* Everything the "lguest" userspace program needs to know. */ 3/* Everything the "lguest" userspace program needs to know. */
4#include <linux/types.h> 4#include <linux/types.h>
5/* They can register up to 32 arrays of lguest_dma. */
6#define LGUEST_MAX_DMA 32
7/* At most we can dma 16 lguest_dma in one op. */
8#define LGUEST_MAX_DMA_SECTIONS 16
9
10/* How many devices? Assume each one wants up to two dma arrays per device. */
11#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)
12
13/* Where the Host expects the Guest to SEND_DMA console output to. */
14#define LGUEST_CONSOLE_DMA_KEY 0
15 5
16/*D:010 6/*D:010
17 * Drivers 7 * Drivers
@@ -20,7 +10,11 @@
20 * real devices (think of the damage it could do!) we provide virtual devices. 10 * real devices (think of the damage it could do!) we provide virtual devices.
21 * We could emulate a PCI bus with various devices on it, but that is a fairly 11 * We could emulate a PCI bus with various devices on it, but that is a fairly
22 * complex burden for the Host and suboptimal for the Guest, so we have our own 12 * complex burden for the Host and suboptimal for the Guest, so we have our own
23 * "lguest" bus and simple drivers. 13 * simple lguest bus and we use "virtio" drivers. These drivers need a set of
14 * routines from us which will actually do the virtual I/O, but they handle all
15 * the net/block/console stuff themselves. This means that if we want to add
16 * a new device, we simply need to write a new virtio driver and create support
17 * for it in the Launcher: this code won't need to change.
24 * 18 *
25 * Devices are described by a simplified ID, a status byte, and some "config" 19 * Devices are described by a simplified ID, a status byte, and some "config"
26 * bytes which describe this device's configuration. This is placed by the 20 * bytes which describe this device's configuration. This is placed by the
@@ -51,9 +45,9 @@ struct lguest_vqconfig {
51/* Write command first word is a request. */ 45/* Write command first word is a request. */
52enum lguest_req 46enum lguest_req
53{ 47{
54 LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */ 48 LHREQ_INITIALIZE, /* + base, pfnlimit, pgdir, start */
55 LHREQ_GETDMA, /* No longer used */ 49 LHREQ_GETDMA, /* No longer used */
56 LHREQ_IRQ, /* + irq */ 50 LHREQ_IRQ, /* + irq */
57 LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ 51 LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
58}; 52};
59#endif /* _ASM_LGUEST_USER */ 53#endif /* _LINUX_LGUEST_LAUNCHER */
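For reference, a hedged sketch of how a Launcher would issue the revised LHREQ_INITIALIZE (argument order per the comment above: base, pfnlimit, pgdir, start); fd, guest_base, mem, pgdir and start are placeholders for values the real Launcher computes:

	unsigned long args[] = { LHREQ_INITIALIZE,
				 (unsigned long)guest_base,	/* base */
				 mem / getpagesize(),		/* pfnlimit */
				 pgdir, start };

	if (write(fd, args, sizeof(args)) < 0)
		err(1, "writing LHREQ_INITIALIZE to /dev/lguest");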
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 4e10a074ca56..e44aac8cf5ff 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1236,6 +1236,10 @@
1236#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE 0x0560 1236#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE 0x0560
1237#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE 0x056C 1237#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE 0x056C
1238#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE 0x0759 1238#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE 0x0759
1239#define PCI_DEVICE_ID_NVIDIA_NVENET_32 0x0760
1240#define PCI_DEVICE_ID_NVIDIA_NVENET_33 0x0761
1241#define PCI_DEVICE_ID_NVIDIA_NVENET_34 0x0762
1242#define PCI_DEVICE_ID_NVIDIA_NVENET_35 0x0763
1239 1243
1240#define PCI_VENDOR_ID_IMS 0x10e0 1244#define PCI_VENDOR_ID_IMS 0x10e0
1241#define PCI_DEVICE_ID_IMS_TT128 0x9128 1245#define PCI_DEVICE_ID_IMS_TT128 0x9128
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index df7ddcee7c4b..457123171389 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -1,6 +1,7 @@
1#ifndef _LINUX_SCATTERLIST_H 1#ifndef _LINUX_SCATTERLIST_H
2#define _LINUX_SCATTERLIST_H 2#define _LINUX_SCATTERLIST_H
3 3
4#include <asm/types.h>
4#include <asm/scatterlist.h> 5#include <asm/scatterlist.h>
5#include <linux/mm.h> 6#include <linux/mm.h>
6#include <linux/string.h> 7#include <linux/string.h>
@@ -26,18 +27,16 @@
26#define SG_MAGIC 0x87654321 27#define SG_MAGIC 0x87654321
27 28
28/** 29/**
29 * sg_set_page - Set sg entry to point at given page 30 * sg_assign_page - Assign a given page to an SG entry
30 * @sg: SG entry 31 * @sg: SG entry
31 * @page: The page 32 * @page: The page
32 * 33 *
33 * Description: 34 * Description:
34 * Use this function to set an sg entry pointing at a page, never assign 35 * Assign page to sg entry. Also see sg_set_page(), the most commonly used
35 * the page directly. We encode sg table information in the lower bits 36 * variant.
36 * of the page pointer. See sg_page() for looking up the page belonging
37 * to an sg entry.
38 * 37 *
39 **/ 38 **/
40static inline void sg_set_page(struct scatterlist *sg, struct page *page) 39static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
41{ 40{
42 unsigned long page_link = sg->page_link & 0x3; 41 unsigned long page_link = sg->page_link & 0x3;
43 42
@@ -52,6 +51,28 @@ static inline void sg_set_page(struct scatterlist *sg, struct page *page)
52 sg->page_link = page_link | (unsigned long) page; 51 sg->page_link = page_link | (unsigned long) page;
53} 52}
54 53
54/**
55 * sg_set_page - Set sg entry to point at given page
56 * @sg: SG entry
57 * @page: The page
58 * @len: Length of data
59 * @offset: Offset into page
60 *
61 * Description:
62 * Use this function to set an sg entry pointing at a page, never assign
63 * the page directly. We encode sg table information in the lower bits
64 * of the page pointer. See sg_page() for looking up the page belonging
65 * to an sg entry.
66 *
67 **/
68static inline void sg_set_page(struct scatterlist *sg, struct page *page,
69 unsigned int len, unsigned int offset)
70{
71 sg_assign_page(sg, page);
72 sg->offset = offset;
73 sg->length = len;
74}
75
55#define sg_page(sg) ((struct page *) ((sg)->page_link & ~0x3)) 76#define sg_page(sg) ((struct page *) ((sg)->page_link & ~0x3))
56 77
57/** 78/**
@@ -64,9 +85,7 @@ static inline void sg_set_page(struct scatterlist *sg, struct page *page)
64static inline void sg_set_buf(struct scatterlist *sg, const void *buf, 85static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
65 unsigned int buflen) 86 unsigned int buflen)
66{ 87{
67 sg_set_page(sg, virt_to_page(buf)); 88 sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
68 sg->offset = offset_in_page(buf);
69 sg->length = buflen;
70} 89}
71 90
72/* 91/*
@@ -237,7 +256,7 @@ static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
237 * on the sg page. 256 * on the sg page.
238 * 257 *
239 **/ 258 **/
240static inline unsigned long sg_phys(struct scatterlist *sg) 259static inline dma_addr_t sg_phys(struct scatterlist *sg)
241{ 260{
242 return page_to_phys(sg_page(sg)) + sg->offset; 261 return page_to_phys(sg_page(sg)) + sg->offset;
243} 262}
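Pulling the pieces together, a short sketch with placeholder variables (header, page, len, offset): sg_set_page() now carries the length and offset, sg_set_buf() remains the convenience form for lowmem buffers, and sg_assign_page() is only for callers that manage those fields themselves.

	struct scatterlist sg[2];

	sg_init_table(sg, 2);				/* clears and terminates the table */
	sg_set_buf(&sg[0], header, sizeof(*header));	/* kmalloc'ed buffer */
	sg_set_page(&sg[1], page, len, offset);		/* explicit page + extent */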
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 13df99fb2769..24e08d1d900d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -828,12 +828,17 @@ struct sched_class {
828 struct task_struct * (*pick_next_task) (struct rq *rq); 828 struct task_struct * (*pick_next_task) (struct rq *rq);
829 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 829 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
830 830
831#ifdef CONFIG_SMP
831 unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, 832 unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
832 struct rq *busiest, 833 struct rq *busiest, unsigned long max_load_move,
833 unsigned long max_nr_move, unsigned long max_load_move,
834 struct sched_domain *sd, enum cpu_idle_type idle, 834 struct sched_domain *sd, enum cpu_idle_type idle,
835 int *all_pinned, int *this_best_prio); 835 int *all_pinned, int *this_best_prio);
836 836
837 int (*move_one_task) (struct rq *this_rq, int this_cpu,
838 struct rq *busiest, struct sched_domain *sd,
839 enum cpu_idle_type idle);
840#endif
841
837 void (*set_curr_task) (struct rq *rq); 842 void (*set_curr_task) (struct rq *rq);
838 void (*task_tick) (struct rq *rq, struct task_struct *p); 843 void (*task_tick) (struct rq *rq, struct task_struct *p);
839 void (*task_new) (struct rq *rq, struct task_struct *p); 844 void (*task_new) (struct rq *rq, struct task_struct *p);
@@ -1196,7 +1201,7 @@ static inline int rt_prio(int prio)
1196 return 0; 1201 return 0;
1197} 1202}
1198 1203
1199static inline int rt_task(struct task_struct *p) 1204static inline int rt_task(const struct task_struct *p)
1200{ 1205{
1201 return rt_prio(p->prio); 1206 return rt_prio(p->prio);
1202} 1207}
@@ -1211,22 +1216,22 @@ static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp)
1211 tsk->signal->__pgrp = pgrp; 1216 tsk->signal->__pgrp = pgrp;
1212} 1217}
1213 1218
1214static inline struct pid *task_pid(struct task_struct *task) 1219static inline struct pid *task_pid(const struct task_struct *task)
1215{ 1220{
1216 return task->pids[PIDTYPE_PID].pid; 1221 return task->pids[PIDTYPE_PID].pid;
1217} 1222}
1218 1223
1219static inline struct pid *task_tgid(struct task_struct *task) 1224static inline struct pid *task_tgid(const struct task_struct *task)
1220{ 1225{
1221 return task->group_leader->pids[PIDTYPE_PID].pid; 1226 return task->group_leader->pids[PIDTYPE_PID].pid;
1222} 1227}
1223 1228
1224static inline struct pid *task_pgrp(struct task_struct *task) 1229static inline struct pid *task_pgrp(const struct task_struct *task)
1225{ 1230{
1226 return task->group_leader->pids[PIDTYPE_PGID].pid; 1231 return task->group_leader->pids[PIDTYPE_PGID].pid;
1227} 1232}
1228 1233
1229static inline struct pid *task_session(struct task_struct *task) 1234static inline struct pid *task_session(const struct task_struct *task)
1230{ 1235{
1231 return task->group_leader->pids[PIDTYPE_SID].pid; 1236 return task->group_leader->pids[PIDTYPE_SID].pid;
1232} 1237}
@@ -1255,7 +1260,7 @@ struct pid_namespace;
1255 * see also pid_nr() etc in include/linux/pid.h 1260 * see also pid_nr() etc in include/linux/pid.h
1256 */ 1261 */
1257 1262
1258static inline pid_t task_pid_nr(struct task_struct *tsk) 1263static inline pid_t task_pid_nr(const struct task_struct *tsk)
1259{ 1264{
1260 return tsk->pid; 1265 return tsk->pid;
1261} 1266}
@@ -1268,7 +1273,7 @@ static inline pid_t task_pid_vnr(struct task_struct *tsk)
1268} 1273}
1269 1274
1270 1275
1271static inline pid_t task_tgid_nr(struct task_struct *tsk) 1276static inline pid_t task_tgid_nr(const struct task_struct *tsk)
1272{ 1277{
1273 return tsk->tgid; 1278 return tsk->tgid;
1274} 1279}
@@ -1281,7 +1286,7 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk)
1281} 1286}
1282 1287
1283 1288
1284static inline pid_t task_pgrp_nr(struct task_struct *tsk) 1289static inline pid_t task_pgrp_nr(const struct task_struct *tsk)
1285{ 1290{
1286 return tsk->signal->__pgrp; 1291 return tsk->signal->__pgrp;
1287} 1292}
@@ -1294,7 +1299,7 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
1294} 1299}
1295 1300
1296 1301
1297static inline pid_t task_session_nr(struct task_struct *tsk) 1302static inline pid_t task_session_nr(const struct task_struct *tsk)
1298{ 1303{
1299 return tsk->signal->__session; 1304 return tsk->signal->__session;
1300} 1305}
@@ -1321,7 +1326,7 @@ static inline pid_t task_ppid_nr_ns(struct task_struct *tsk,
1321 * If pid_alive fails, then pointers within the task structure 1326 * If pid_alive fails, then pointers within the task structure
1322 * can be stale and must not be dereferenced. 1327 * can be stale and must not be dereferenced.
1323 */ 1328 */
1324static inline int pid_alive(struct task_struct *p) 1329static inline int pid_alive(const struct task_struct *p)
1325{ 1330{
1326 return p->pids[PIDTYPE_PID].pid != NULL; 1331 return p->pids[PIDTYPE_PID].pid != NULL;
1327} 1332}
@@ -1332,7 +1337,7 @@ static inline int pid_alive(struct task_struct *p)
1332 * 1337 *
1333 * Check if a task structure is the first user space task the kernel created. 1338 * Check if a task structure is the first user space task the kernel created.
1334 */ 1339 */
1335static inline int is_global_init(struct task_struct *tsk) 1340static inline int is_global_init(const struct task_struct *tsk)
1336{ 1341{
1337 return tsk->pid == 1; 1342 return tsk->pid == 1;
1338} 1343}
@@ -1469,7 +1474,7 @@ extern int rt_mutex_getprio(struct task_struct *p);
1469extern void rt_mutex_setprio(struct task_struct *p, int prio); 1474extern void rt_mutex_setprio(struct task_struct *p, int prio);
1470extern void rt_mutex_adjust_pi(struct task_struct *p); 1475extern void rt_mutex_adjust_pi(struct task_struct *p);
1471#else 1476#else
1472static inline int rt_mutex_getprio(struct task_struct *p) 1477static inline int rt_mutex_getprio(const struct task_struct *p)
1473{ 1478{
1474 return p->normal_prio; 1479 return p->normal_prio;
1475} 1480}
@@ -1721,7 +1726,7 @@ extern void wait_task_inactive(struct task_struct * p);
1721 * all we care about is that we have a task with the appropriate 1726 * all we care about is that we have a task with the appropriate
1722 * pid, we don't actually care if we have the right task. 1727 * pid, we don't actually care if we have the right task.
1723 */ 1728 */
1724static inline int has_group_leader_pid(struct task_struct *p) 1729static inline int has_group_leader_pid(const struct task_struct *p)
1725{ 1730{
1726 return p->pid == p->tgid; 1731 return p->pid == p->tgid;
1727} 1732}
@@ -1738,7 +1743,7 @@ static inline struct task_struct *next_thread(const struct task_struct *p)
1738 struct task_struct, thread_group); 1743 struct task_struct, thread_group);
1739} 1744}
1740 1745
1741static inline int thread_group_empty(struct task_struct *p) 1746static inline int thread_group_empty(const struct task_struct *p)
1742{ 1747{
1743 return list_empty(&p->thread_group); 1748 return list_empty(&p->thread_group);
1744} 1749}
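One hedged example of what the const-qualification buys; the helper below is hypothetical, but it compiles cleanly against a const task pointer because the accessors it calls no longer demand a mutable one:

	static inline int task_is_session_leader(const struct task_struct *p)
	{
		return task_session_nr(p) == task_pid_nr(p);
	}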
diff --git a/init/Kconfig b/init/Kconfig
index b7dffa837926..8b88d0bedcbd 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -322,7 +322,6 @@ config CPUSETS
322config FAIR_GROUP_SCHED 322config FAIR_GROUP_SCHED
323 bool "Fair group CPU scheduler" 323 bool "Fair group CPU scheduler"
324 default y 324 default y
325 depends on EXPERIMENTAL
326 help 325 help
 327 This feature lets the CPU scheduler recognize task groups and control CPU 326 This feature lets the CPU scheduler recognize task groups and control CPU
328 bandwidth allocation to such task groups. 327 bandwidth allocation to such task groups.
diff --git a/kernel/profile.c b/kernel/profile.c
index 631b75c25d7e..5e95330e5120 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -60,6 +60,7 @@ static int __init profile_setup(char * str)
60 int par; 60 int par;
61 61
62 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 62 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
63#ifdef CONFIG_SCHEDSTATS
63 prof_on = SLEEP_PROFILING; 64 prof_on = SLEEP_PROFILING;
64 if (str[strlen(sleepstr)] == ',') 65 if (str[strlen(sleepstr)] == ',')
65 str += strlen(sleepstr) + 1; 66 str += strlen(sleepstr) + 1;
@@ -68,6 +69,10 @@ static int __init profile_setup(char * str)
68 printk(KERN_INFO 69 printk(KERN_INFO
69 "kernel sleep profiling enabled (shift: %ld)\n", 70 "kernel sleep profiling enabled (shift: %ld)\n",
70 prof_shift); 71 prof_shift);
72#else
73 printk(KERN_WARNING
74 "kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
75#endif /* CONFIG_SCHEDSTATS */
71 } else if (!strncmp(str, schedstr, strlen(schedstr))) { 76 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
72 prof_on = SCHED_PROFILING; 77 prof_on = SCHED_PROFILING;
73 if (str[strlen(schedstr)] == ',') 78 if (str[strlen(schedstr)] == ',')
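For reference, sleep profiling is requested on the boot command line, e.g. (shift value illustrative):

	profile=sleep,8

With CONFIG_SCHEDSTATS disabled, the same request now prints the warning above instead of silently arming a profiler that can never collect data.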
diff --git a/kernel/sched.c b/kernel/sched.c
index 2810e562a991..b4fbbc440453 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -66,6 +66,7 @@
66#include <linux/pagemap.h> 66#include <linux/pagemap.h>
67 67
68#include <asm/tlb.h> 68#include <asm/tlb.h>
69#include <asm/irq_regs.h>
69 70
70/* 71/*
71 * Scheduler clock - returns current time in nanosec units. 72 * Scheduler clock - returns current time in nanosec units.
@@ -837,11 +838,18 @@ struct rq_iterator {
837 struct task_struct *(*next)(void *); 838 struct task_struct *(*next)(void *);
838}; 839};
839 840
840static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 841#ifdef CONFIG_SMP
841 unsigned long max_nr_move, unsigned long max_load_move, 842static unsigned long
842 struct sched_domain *sd, enum cpu_idle_type idle, 843balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
843 int *all_pinned, unsigned long *load_moved, 844 unsigned long max_load_move, struct sched_domain *sd,
844 int *this_best_prio, struct rq_iterator *iterator); 845 enum cpu_idle_type idle, int *all_pinned,
846 int *this_best_prio, struct rq_iterator *iterator);
847
848static int
849iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
850 struct sched_domain *sd, enum cpu_idle_type idle,
851 struct rq_iterator *iterator);
852#endif
845 853
846#include "sched_stats.h" 854#include "sched_stats.h"
847#include "sched_idletask.c" 855#include "sched_idletask.c"
@@ -2223,17 +2231,17 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2223 return 1; 2231 return 1;
2224} 2232}
2225 2233
2226static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2234static unsigned long
2227 unsigned long max_nr_move, unsigned long max_load_move, 2235balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2228 struct sched_domain *sd, enum cpu_idle_type idle, 2236 unsigned long max_load_move, struct sched_domain *sd,
2229 int *all_pinned, unsigned long *load_moved, 2237 enum cpu_idle_type idle, int *all_pinned,
2230 int *this_best_prio, struct rq_iterator *iterator) 2238 int *this_best_prio, struct rq_iterator *iterator)
2231{ 2239{
2232 int pulled = 0, pinned = 0, skip_for_load; 2240 int pulled = 0, pinned = 0, skip_for_load;
2233 struct task_struct *p; 2241 struct task_struct *p;
2234 long rem_load_move = max_load_move; 2242 long rem_load_move = max_load_move;
2235 2243
2236 if (max_nr_move == 0 || max_load_move == 0) 2244 if (max_load_move == 0)
2237 goto out; 2245 goto out;
2238 2246
2239 pinned = 1; 2247 pinned = 1;
@@ -2266,7 +2274,7 @@ next:
2266 * We only want to steal up to the prescribed number of tasks 2274 * We only want to steal up to the prescribed number of tasks
2267 * and the prescribed amount of weighted load. 2275 * and the prescribed amount of weighted load.
2268 */ 2276 */
2269 if (pulled < max_nr_move && rem_load_move > 0) { 2277 if (rem_load_move > 0) {
2270 if (p->prio < *this_best_prio) 2278 if (p->prio < *this_best_prio)
2271 *this_best_prio = p->prio; 2279 *this_best_prio = p->prio;
2272 p = iterator->next(iterator->arg); 2280 p = iterator->next(iterator->arg);
@@ -2274,7 +2282,7 @@ next:
2274 } 2282 }
2275out: 2283out:
2276 /* 2284 /*
2277 * Right now, this is the only place pull_task() is called, 2285 * Right now, this is one of only two places pull_task() is called,
2278 * so we can safely collect pull_task() stats here rather than 2286 * so we can safely collect pull_task() stats here rather than
2279 * inside pull_task(). 2287 * inside pull_task().
2280 */ 2288 */
@@ -2282,8 +2290,8 @@ out:
2282 2290
2283 if (all_pinned) 2291 if (all_pinned)
2284 *all_pinned = pinned; 2292 *all_pinned = pinned;
2285 *load_moved = max_load_move - rem_load_move; 2293
2286 return pulled; 2294 return max_load_move - rem_load_move;
2287} 2295}
2288 2296
2289/* 2297/*
@@ -2305,7 +2313,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2305 do { 2313 do {
2306 total_load_moved += 2314 total_load_moved +=
2307 class->load_balance(this_rq, this_cpu, busiest, 2315 class->load_balance(this_rq, this_cpu, busiest,
2308 ULONG_MAX, max_load_move - total_load_moved, 2316 max_load_move - total_load_moved,
2309 sd, idle, all_pinned, &this_best_prio); 2317 sd, idle, all_pinned, &this_best_prio);
2310 class = class->next; 2318 class = class->next;
2311 } while (class && max_load_move > total_load_moved); 2319 } while (class && max_load_move > total_load_moved);
@@ -2313,6 +2321,32 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2313 return total_load_moved > 0; 2321 return total_load_moved > 0;
2314} 2322}
2315 2323
2324static int
2325iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2326 struct sched_domain *sd, enum cpu_idle_type idle,
2327 struct rq_iterator *iterator)
2328{
2329 struct task_struct *p = iterator->start(iterator->arg);
2330 int pinned = 0;
2331
2332 while (p) {
2333 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2334 pull_task(busiest, p, this_rq, this_cpu);
2335 /*
2336 * Right now, this is only the second place pull_task()
2337 * is called, so we can safely collect pull_task()
2338 * stats here rather than inside pull_task().
2339 */
2340 schedstat_inc(sd, lb_gained[idle]);
2341
2342 return 1;
2343 }
2344 p = iterator->next(iterator->arg);
2345 }
2346
2347 return 0;
2348}
2349
2316/* 2350/*
2317 * move_one_task tries to move exactly one task from busiest to this_rq, as 2351 * move_one_task tries to move exactly one task from busiest to this_rq, as
2318 * part of active balancing operations within "domain". 2352 * part of active balancing operations within "domain".
@@ -2324,12 +2358,9 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2324 struct sched_domain *sd, enum cpu_idle_type idle) 2358 struct sched_domain *sd, enum cpu_idle_type idle)
2325{ 2359{
2326 const struct sched_class *class; 2360 const struct sched_class *class;
2327 int this_best_prio = MAX_PRIO;
2328 2361
2329 for (class = sched_class_highest; class; class = class->next) 2362 for (class = sched_class_highest; class; class = class->next)
2330 if (class->load_balance(this_rq, this_cpu, busiest, 2363 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
2331 1, ULONG_MAX, sd, idle, NULL,
2332 &this_best_prio))
2333 return 1; 2364 return 1;
2334 2365
2335 return 0; 2366 return 0;
@@ -3266,18 +3297,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
3266{ 3297{
3267} 3298}
3268 3299
3269/* Avoid "used but not defined" warning on UP */
3270static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3271 unsigned long max_nr_move, unsigned long max_load_move,
3272 struct sched_domain *sd, enum cpu_idle_type idle,
3273 int *all_pinned, unsigned long *load_moved,
3274 int *this_best_prio, struct rq_iterator *iterator)
3275{
3276 *load_moved = 0;
3277
3278 return 0;
3279}
3280
3281#endif 3300#endif
3282 3301
3283DEFINE_PER_CPU(struct kernel_stat, kstat); 3302DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3507,12 +3526,19 @@ EXPORT_SYMBOL(sub_preempt_count);
3507 */ 3526 */
3508static noinline void __schedule_bug(struct task_struct *prev) 3527static noinline void __schedule_bug(struct task_struct *prev)
3509{ 3528{
3510 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", 3529 struct pt_regs *regs = get_irq_regs();
3511 prev->comm, preempt_count(), task_pid_nr(prev)); 3530
3531 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3532 prev->comm, prev->pid, preempt_count());
3533
3512 debug_show_held_locks(prev); 3534 debug_show_held_locks(prev);
3513 if (irqs_disabled()) 3535 if (irqs_disabled())
3514 print_irqtrace_events(prev); 3536 print_irqtrace_events(prev);
3515 dump_stack(); 3537
3538 if (regs)
3539 show_regs(regs);
3540 else
3541 dump_stack();
3516} 3542}
3517 3543
3518/* 3544/*
@@ -3820,7 +3846,7 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3820} 3846}
3821EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3847EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3822 3848
3823void fastcall complete(struct completion *x) 3849void complete(struct completion *x)
3824{ 3850{
3825 unsigned long flags; 3851 unsigned long flags;
3826 3852
@@ -3832,7 +3858,7 @@ void fastcall complete(struct completion *x)
3832} 3858}
3833EXPORT_SYMBOL(complete); 3859EXPORT_SYMBOL(complete);
3834 3860
3835void fastcall complete_all(struct completion *x) 3861void complete_all(struct completion *x)
3836{ 3862{
3837 unsigned long flags; 3863 unsigned long flags;
3838 3864
@@ -3884,13 +3910,13 @@ wait_for_common(struct completion *x, long timeout, int state)
3884 return timeout; 3910 return timeout;
3885} 3911}
3886 3912
3887void fastcall __sched wait_for_completion(struct completion *x) 3913void __sched wait_for_completion(struct completion *x)
3888{ 3914{
3889 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 3915 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3890} 3916}
3891EXPORT_SYMBOL(wait_for_completion); 3917EXPORT_SYMBOL(wait_for_completion);
3892 3918
3893unsigned long fastcall __sched 3919unsigned long __sched
3894wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3920wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3895{ 3921{
3896 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 3922 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
@@ -3906,7 +3932,7 @@ int __sched wait_for_completion_interruptible(struct completion *x)
3906} 3932}
3907EXPORT_SYMBOL(wait_for_completion_interruptible); 3933EXPORT_SYMBOL(wait_for_completion_interruptible);
3908 3934
3909unsigned long fastcall __sched 3935unsigned long __sched
3910wait_for_completion_interruptible_timeout(struct completion *x, 3936wait_for_completion_interruptible_timeout(struct completion *x,
3911 unsigned long timeout) 3937 unsigned long timeout)
3912{ 3938{
@@ -5461,11 +5487,12 @@ static void register_sched_domain_sysctl(void)
5461 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5487 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5462 char buf[32]; 5488 char buf[32];
5463 5489
5490 WARN_ON(sd_ctl_dir[0].child);
5491 sd_ctl_dir[0].child = entry;
5492
5464 if (entry == NULL) 5493 if (entry == NULL)
5465 return; 5494 return;
5466 5495
5467 sd_ctl_dir[0].child = entry;
5468
5469 for_each_online_cpu(i) { 5496 for_each_online_cpu(i) {
5470 snprintf(buf, 32, "cpu%d", i); 5497 snprintf(buf, 32, "cpu%d", i);
5471 entry->procname = kstrdup(buf, GFP_KERNEL); 5498 entry->procname = kstrdup(buf, GFP_KERNEL);
@@ -5473,14 +5500,19 @@ static void register_sched_domain_sysctl(void)
5473 entry->child = sd_alloc_ctl_cpu_table(i); 5500 entry->child = sd_alloc_ctl_cpu_table(i);
5474 entry++; 5501 entry++;
5475 } 5502 }
5503
5504 WARN_ON(sd_sysctl_header);
5476 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5505 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5477} 5506}
5478 5507
5508/* may be called multiple times per register */
5479static void unregister_sched_domain_sysctl(void) 5509static void unregister_sched_domain_sysctl(void)
5480{ 5510{
5481 unregister_sysctl_table(sd_sysctl_header); 5511 if (sd_sysctl_header)
5512 unregister_sysctl_table(sd_sysctl_header);
5482 sd_sysctl_header = NULL; 5513 sd_sysctl_header = NULL;
5483 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5514 if (sd_ctl_dir[0].child)
5515 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5484} 5516}
5485#else 5517#else
5486static void register_sched_domain_sysctl(void) 5518static void register_sched_domain_sysctl(void)
@@ -5611,101 +5643,101 @@ int nr_cpu_ids __read_mostly = NR_CPUS;
5611EXPORT_SYMBOL(nr_cpu_ids); 5643EXPORT_SYMBOL(nr_cpu_ids);
5612 5644
5613#ifdef CONFIG_SCHED_DEBUG 5645#ifdef CONFIG_SCHED_DEBUG
5614static void sched_domain_debug(struct sched_domain *sd, int cpu) 5646
5647static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
5615{ 5648{
5616 int level = 0; 5649 struct sched_group *group = sd->groups;
5650 cpumask_t groupmask;
5651 char str[NR_CPUS];
5617 5652
5618 if (!sd) { 5653 cpumask_scnprintf(str, NR_CPUS, sd->span);
5619 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5654 cpus_clear(groupmask);
5620 return; 5655
5656 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5657
5658 if (!(sd->flags & SD_LOAD_BALANCE)) {
5659 printk("does not load-balance\n");
5660 if (sd->parent)
5661 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5662 " has parent");
5663 return -1;
5621 } 5664 }
5622 5665
5623 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5666 printk(KERN_CONT "span %s\n", str);
5667
5668 if (!cpu_isset(cpu, sd->span)) {
5669 printk(KERN_ERR "ERROR: domain->span does not contain "
5670 "CPU%d\n", cpu);
5671 }
5672 if (!cpu_isset(cpu, group->cpumask)) {
5673 printk(KERN_ERR "ERROR: domain->groups does not contain"
5674 " CPU%d\n", cpu);
5675 }
5624 5676
5677 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5625 do { 5678 do {
5626 int i; 5679 if (!group) {
5627 char str[NR_CPUS]; 5680 printk("\n");
5628 struct sched_group *group = sd->groups; 5681 printk(KERN_ERR "ERROR: group is NULL\n");
5629 cpumask_t groupmask;
5630
5631 cpumask_scnprintf(str, NR_CPUS, sd->span);
5632 cpus_clear(groupmask);
5633
5634 printk(KERN_DEBUG);
5635 for (i = 0; i < level + 1; i++)
5636 printk(" ");
5637 printk("domain %d: ", level);
5638
5639 if (!(sd->flags & SD_LOAD_BALANCE)) {
5640 printk("does not load-balance\n");
5641 if (sd->parent)
5642 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5643 " has parent");
5644 break; 5682 break;
5645 } 5683 }
5646 5684
5647 printk("span %s\n", str); 5685 if (!group->__cpu_power) {
5686 printk(KERN_CONT "\n");
5687 printk(KERN_ERR "ERROR: domain->cpu_power not "
5688 "set\n");
5689 break;
5690 }
5648 5691
5649 if (!cpu_isset(cpu, sd->span)) 5692 if (!cpus_weight(group->cpumask)) {
5650 printk(KERN_ERR "ERROR: domain->span does not contain " 5693 printk(KERN_CONT "\n");
5651 "CPU%d\n", cpu); 5694 printk(KERN_ERR "ERROR: empty group\n");
5652 if (!cpu_isset(cpu, group->cpumask)) 5695 break;
5653 printk(KERN_ERR "ERROR: domain->groups does not contain" 5696 }
5654 " CPU%d\n", cpu);
5655 5697
5656 printk(KERN_DEBUG); 5698 if (cpus_intersects(groupmask, group->cpumask)) {
5657 for (i = 0; i < level + 2; i++) 5699 printk(KERN_CONT "\n");
5658 printk(" "); 5700 printk(KERN_ERR "ERROR: repeated CPUs\n");
5659 printk("groups:"); 5701 break;
5660 do { 5702 }
5661 if (!group) {
5662 printk("\n");
5663 printk(KERN_ERR "ERROR: group is NULL\n");
5664 break;
5665 }
5666 5703
5667 if (!group->__cpu_power) { 5704 cpus_or(groupmask, groupmask, group->cpumask);
5668 printk(KERN_CONT "\n");
5669 printk(KERN_ERR "ERROR: domain->cpu_power not "
5670 "set\n");
5671 break;
5672 }
5673 5705
5674 if (!cpus_weight(group->cpumask)) { 5706 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5675 printk(KERN_CONT "\n"); 5707 printk(KERN_CONT " %s", str);
5676 printk(KERN_ERR "ERROR: empty group\n");
5677 break;
5678 }
5679 5708
5680 if (cpus_intersects(groupmask, group->cpumask)) { 5709 group = group->next;
5681 printk(KERN_CONT "\n"); 5710 } while (group != sd->groups);
5682 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5711 printk(KERN_CONT "\n");
5683 break;
5684 }
5685 5712
5686 cpus_or(groupmask, groupmask, group->cpumask); 5713 if (!cpus_equal(sd->span, groupmask))
5714 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5687 5715
5688 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 5716 if (sd->parent && !cpus_subset(groupmask, sd->parent->span))
5689 printk(KERN_CONT " %s", str); 5717 printk(KERN_ERR "ERROR: parent span is not a superset "
5718 "of domain->span\n");
5719 return 0;
5720}
5690 5721
5691 group = group->next; 5722static void sched_domain_debug(struct sched_domain *sd, int cpu)
5692 } while (group != sd->groups); 5723{
5693 printk(KERN_CONT "\n"); 5724 int level = 0;
5694 5725
5695 if (!cpus_equal(sd->span, groupmask)) 5726 if (!sd) {
5696 printk(KERN_ERR "ERROR: groups don't span " 5727 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5697 "domain->span\n"); 5728 return;
5729 }
5730
5731 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5698 5732
5733 for (;;) {
5734 if (sched_domain_debug_one(sd, cpu, level))
5735 break;
5699 level++; 5736 level++;
5700 sd = sd->parent; 5737 sd = sd->parent;
5701 if (!sd) 5738 if (!sd)
5702 continue; 5739 break;
5703 5740 }
5704 if (!cpus_subset(groupmask, sd->span))
5705 printk(KERN_ERR "ERROR: parent span is not a superset "
5706 "of domain->span\n");
5707
5708 } while (sd);
5709} 5741}
5710#else 5742#else
5711# define sched_domain_debug(sd, cpu) do { } while (0) 5743# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6424,13 +6456,17 @@ static cpumask_t fallback_doms;
6424 */ 6456 */
6425static int arch_init_sched_domains(const cpumask_t *cpu_map) 6457static int arch_init_sched_domains(const cpumask_t *cpu_map)
6426{ 6458{
6459 int err;
6460
6427 ndoms_cur = 1; 6461 ndoms_cur = 1;
6428 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 6462 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6429 if (!doms_cur) 6463 if (!doms_cur)
6430 doms_cur = &fallback_doms; 6464 doms_cur = &fallback_doms;
6431 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 6465 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
6466 err = build_sched_domains(doms_cur);
6432 register_sched_domain_sysctl(); 6467 register_sched_domain_sysctl();
6433 return build_sched_domains(doms_cur); 6468
6469 return err;
6434} 6470}
6435 6471
6436static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6472static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6479,6 +6515,9 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6479{ 6515{
6480 int i, j; 6516 int i, j;
6481 6517
6518 /* always unregister in case we don't destroy any domains */
6519 unregister_sched_domain_sysctl();
6520
6482 if (doms_new == NULL) { 6521 if (doms_new == NULL) {
6483 ndoms_new = 1; 6522 ndoms_new = 1;
6484 doms_new = &fallback_doms; 6523 doms_new = &fallback_doms;
@@ -6514,6 +6553,8 @@ match2:
6514 kfree(doms_cur); 6553 kfree(doms_cur);
6515 doms_cur = doms_new; 6554 doms_cur = doms_new;
6516 ndoms_cur = ndoms_new; 6555 ndoms_cur = ndoms_new;
6556
6557 register_sched_domain_sysctl();
6517} 6558}
6518 6559
6519#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6560#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -7101,25 +7142,25 @@ unsigned long sched_group_shares(struct task_group *tg)
7101#ifdef CONFIG_FAIR_CGROUP_SCHED 7142#ifdef CONFIG_FAIR_CGROUP_SCHED
7102 7143
 7103/* return the corresponding task_group object of a cgroup */ 7144/* return the corresponding task_group object of a cgroup */
7104static inline struct task_group *cgroup_tg(struct cgroup *cont) 7145static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7105{ 7146{
7106 return container_of(cgroup_subsys_state(cont, cpu_cgroup_subsys_id), 7147 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7107 struct task_group, css); 7148 struct task_group, css);
7108} 7149}
7109 7150
7110static struct cgroup_subsys_state * 7151static struct cgroup_subsys_state *
7111cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 7152cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7112{ 7153{
7113 struct task_group *tg; 7154 struct task_group *tg;
7114 7155
7115 if (!cont->parent) { 7156 if (!cgrp->parent) {
7116 /* This is early initialization for the top cgroup */ 7157 /* This is early initialization for the top cgroup */
7117 init_task_group.css.cgroup = cont; 7158 init_task_group.css.cgroup = cgrp;
7118 return &init_task_group.css; 7159 return &init_task_group.css;
7119 } 7160 }
7120 7161
7121 /* we support only 1-level deep hierarchical scheduler atm */ 7162 /* we support only 1-level deep hierarchical scheduler atm */
7122 if (cont->parent->parent) 7163 if (cgrp->parent->parent)
7123 return ERR_PTR(-EINVAL); 7164 return ERR_PTR(-EINVAL);
7124 7165
7125 tg = sched_create_group(); 7166 tg = sched_create_group();
@@ -7127,21 +7168,21 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
7127 return ERR_PTR(-ENOMEM); 7168 return ERR_PTR(-ENOMEM);
7128 7169
7129 /* Bind the cgroup to task_group object we just created */ 7170 /* Bind the cgroup to task_group object we just created */
7130 tg->css.cgroup = cont; 7171 tg->css.cgroup = cgrp;
7131 7172
7132 return &tg->css; 7173 return &tg->css;
7133} 7174}
7134 7175
7135static void cpu_cgroup_destroy(struct cgroup_subsys *ss, 7176static void cpu_cgroup_destroy(struct cgroup_subsys *ss,
7136 struct cgroup *cont) 7177 struct cgroup *cgrp)
7137{ 7178{
7138 struct task_group *tg = cgroup_tg(cont); 7179 struct task_group *tg = cgroup_tg(cgrp);
7139 7180
7140 sched_destroy_group(tg); 7181 sched_destroy_group(tg);
7141} 7182}
7142 7183
7143static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, 7184static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
7144 struct cgroup *cont, struct task_struct *tsk) 7185 struct cgroup *cgrp, struct task_struct *tsk)
7145{ 7186{
7146 /* We don't support RT-tasks being in separate groups */ 7187 /* We don't support RT-tasks being in separate groups */
7147 if (tsk->sched_class != &fair_sched_class) 7188 if (tsk->sched_class != &fair_sched_class)
@@ -7151,38 +7192,21 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
7151} 7192}
7152 7193
7153static void 7194static void
7154cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cont, 7195cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7155 struct cgroup *old_cont, struct task_struct *tsk) 7196 struct cgroup *old_cont, struct task_struct *tsk)
7156{ 7197{
7157 sched_move_task(tsk); 7198 sched_move_task(tsk);
7158} 7199}
7159 7200
7160static ssize_t cpu_shares_write(struct cgroup *cont, struct cftype *cftype, 7201static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7161 struct file *file, const char __user *userbuf, 7202 u64 shareval)
7162 size_t nbytes, loff_t *ppos)
7163{ 7203{
7164 unsigned long shareval; 7204 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
7165 struct task_group *tg = cgroup_tg(cont);
7166 char buffer[2*sizeof(unsigned long) + 1];
7167 int rc;
7168
7169 if (nbytes > 2*sizeof(unsigned long)) /* safety check */
7170 return -E2BIG;
7171
7172 if (copy_from_user(buffer, userbuf, nbytes))
7173 return -EFAULT;
7174
7175 buffer[nbytes] = 0; /* nul-terminate */
7176 shareval = simple_strtoul(buffer, NULL, 10);
7177
7178 rc = sched_group_set_shares(tg, shareval);
7179
7180 return (rc < 0 ? rc : nbytes);
7181} 7205}
7182 7206
7183static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft) 7207static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7184{ 7208{
7185 struct task_group *tg = cgroup_tg(cont); 7209 struct task_group *tg = cgroup_tg(cgrp);
7186 7210
7187 return (u64) tg->shares; 7211 return (u64) tg->shares;
7188} 7212}
@@ -7190,7 +7214,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft)
7190static struct cftype cpu_shares = { 7214static struct cftype cpu_shares = {
7191 .name = "shares", 7215 .name = "shares",
7192 .read_uint = cpu_shares_read_uint, 7216 .read_uint = cpu_shares_read_uint,
7193 .write = cpu_shares_write, 7217 .write_uint = cpu_shares_write_uint,
7194}; 7218};
7195 7219
7196static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7220static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 166ed6db600b..9971831b560e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -876,6 +876,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
876 } 876 }
877} 877}
878 878
879#ifdef CONFIG_SMP
879/************************************************** 880/**************************************************
880 * Fair scheduling class load-balancing methods: 881 * Fair scheduling class load-balancing methods:
881 */ 882 */
@@ -936,12 +937,11 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
936 937
937static unsigned long 938static unsigned long
938load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 939load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
939 unsigned long max_nr_move, unsigned long max_load_move, 940 unsigned long max_load_move,
940 struct sched_domain *sd, enum cpu_idle_type idle, 941 struct sched_domain *sd, enum cpu_idle_type idle,
941 int *all_pinned, int *this_best_prio) 942 int *all_pinned, int *this_best_prio)
942{ 943{
943 struct cfs_rq *busy_cfs_rq; 944 struct cfs_rq *busy_cfs_rq;
944 unsigned long load_moved, total_nr_moved = 0, nr_moved;
945 long rem_load_move = max_load_move; 945 long rem_load_move = max_load_move;
946 struct rq_iterator cfs_rq_iterator; 946 struct rq_iterator cfs_rq_iterator;
947 947
@@ -969,25 +969,48 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
969#else 969#else
970# define maxload rem_load_move 970# define maxload rem_load_move
971#endif 971#endif
972 /* pass busy_cfs_rq argument into 972 /*
973 * pass busy_cfs_rq argument into
973 * load_balance_[start|next]_fair iterators 974 * load_balance_[start|next]_fair iterators
974 */ 975 */
975 cfs_rq_iterator.arg = busy_cfs_rq; 976 cfs_rq_iterator.arg = busy_cfs_rq;
976 nr_moved = balance_tasks(this_rq, this_cpu, busiest, 977 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
977 max_nr_move, maxload, sd, idle, all_pinned, 978 maxload, sd, idle, all_pinned,
978 &load_moved, this_best_prio, &cfs_rq_iterator); 979 this_best_prio,
979 980 &cfs_rq_iterator);
980 total_nr_moved += nr_moved;
981 max_nr_move -= nr_moved;
982 rem_load_move -= load_moved;
983 981
984 if (max_nr_move <= 0 || rem_load_move <= 0) 982 if (rem_load_move <= 0)
985 break; 983 break;
986 } 984 }
987 985
988 return max_load_move - rem_load_move; 986 return max_load_move - rem_load_move;
989} 987}
990 988
989static int
990move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
991 struct sched_domain *sd, enum cpu_idle_type idle)
992{
993 struct cfs_rq *busy_cfs_rq;
994 struct rq_iterator cfs_rq_iterator;
995
996 cfs_rq_iterator.start = load_balance_start_fair;
997 cfs_rq_iterator.next = load_balance_next_fair;
998
999 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
1000 /*
1001 * pass busy_cfs_rq argument into
1002 * load_balance_[start|next]_fair iterators
1003 */
1004 cfs_rq_iterator.arg = busy_cfs_rq;
1005 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
1006 &cfs_rq_iterator))
1007 return 1;
1008 }
1009
1010 return 0;
1011}
1012#endif
1013
991/* 1014/*
992 * scheduler tick hitting a task of our scheduling class: 1015 * scheduler tick hitting a task of our scheduling class:
993 */ 1016 */
@@ -1063,7 +1086,10 @@ static const struct sched_class fair_sched_class = {
1063 .pick_next_task = pick_next_task_fair, 1086 .pick_next_task = pick_next_task_fair,
1064 .put_prev_task = put_prev_task_fair, 1087 .put_prev_task = put_prev_task_fair,
1065 1088
1089#ifdef CONFIG_SMP
1066 .load_balance = load_balance_fair, 1090 .load_balance = load_balance_fair,
1091 .move_one_task = move_one_task_fair,
1092#endif
1067 1093
1068 .set_curr_task = set_curr_task_fair, 1094 .set_curr_task = set_curr_task_fair,
1069 .task_tick = task_tick_fair, 1095 .task_tick = task_tick_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 6e2ead41516e..bf9c25c15b8b 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -37,15 +37,24 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
37{ 37{
38} 38}
39 39
40#ifdef CONFIG_SMP
40static unsigned long 41static unsigned long
41load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, 42load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
42 unsigned long max_nr_move, unsigned long max_load_move, 43 unsigned long max_load_move,
43 struct sched_domain *sd, enum cpu_idle_type idle, 44 struct sched_domain *sd, enum cpu_idle_type idle,
44 int *all_pinned, int *this_best_prio) 45 int *all_pinned, int *this_best_prio)
45{ 46{
46 return 0; 47 return 0;
47} 48}
48 49
50static int
51move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
52 struct sched_domain *sd, enum cpu_idle_type idle)
53{
54 return 0;
55}
56#endif
57
49static void task_tick_idle(struct rq *rq, struct task_struct *curr) 58static void task_tick_idle(struct rq *rq, struct task_struct *curr)
50{ 59{
51} 60}
@@ -69,7 +78,10 @@ const struct sched_class idle_sched_class = {
69 .pick_next_task = pick_next_task_idle, 78 .pick_next_task = pick_next_task_idle,
70 .put_prev_task = put_prev_task_idle, 79 .put_prev_task = put_prev_task_idle,
71 80
81#ifdef CONFIG_SMP
72 .load_balance = load_balance_idle, 82 .load_balance = load_balance_idle,
83 .move_one_task = move_one_task_idle,
84#endif
73 85
74 .set_curr_task = set_curr_task_idle, 86 .set_curr_task = set_curr_task_idle,
75 .task_tick = task_tick_idle, 87 .task_tick = task_tick_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d0097a0634e5..8abd752a0ebd 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -98,6 +98,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
98 p->se.exec_start = 0; 98 p->se.exec_start = 0;
99} 99}
100 100
101#ifdef CONFIG_SMP
101/* 102/*
102 * Load-balancing iterator. Note: while the runqueue stays locked 103 * Load-balancing iterator. Note: while the runqueue stays locked
103 * during the whole iteration, the current task might be 104 * during the whole iteration, the current task might be
@@ -172,13 +173,11 @@ static struct task_struct *load_balance_next_rt(void *arg)
172 173
173static unsigned long 174static unsigned long
174load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, 175load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
175 unsigned long max_nr_move, unsigned long max_load_move, 176 unsigned long max_load_move,
176 struct sched_domain *sd, enum cpu_idle_type idle, 177 struct sched_domain *sd, enum cpu_idle_type idle,
177 int *all_pinned, int *this_best_prio) 178 int *all_pinned, int *this_best_prio)
178{ 179{
179 int nr_moved;
180 struct rq_iterator rt_rq_iterator; 180 struct rq_iterator rt_rq_iterator;
181 unsigned long load_moved;
182 181
183 rt_rq_iterator.start = load_balance_start_rt; 182 rt_rq_iterator.start = load_balance_start_rt;
184 rt_rq_iterator.next = load_balance_next_rt; 183 rt_rq_iterator.next = load_balance_next_rt;
@@ -187,12 +186,24 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
187 */ 186 */
188 rt_rq_iterator.arg = busiest; 187 rt_rq_iterator.arg = busiest;
189 188
190 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, 189 return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
191 max_load_move, sd, idle, all_pinned, &load_moved, 190 idle, all_pinned, this_best_prio, &rt_rq_iterator);
192 this_best_prio, &rt_rq_iterator); 191}
192
193static int
194move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
195 struct sched_domain *sd, enum cpu_idle_type idle)
196{
197 struct rq_iterator rt_rq_iterator;
198
199 rt_rq_iterator.start = load_balance_start_rt;
200 rt_rq_iterator.next = load_balance_next_rt;
201 rt_rq_iterator.arg = busiest;
193 202
194 return load_moved; 203 return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
204 &rt_rq_iterator);
195} 205}
206#endif
196 207
197static void task_tick_rt(struct rq *rq, struct task_struct *p) 208static void task_tick_rt(struct rq *rq, struct task_struct *p)
198{ 209{
@@ -236,7 +247,10 @@ const struct sched_class rt_sched_class = {
236 .pick_next_task = pick_next_task_rt, 247 .pick_next_task = pick_next_task_rt,
237 .put_prev_task = put_prev_task_rt, 248 .put_prev_task = put_prev_task_rt,
238 249
250#ifdef CONFIG_SMP
239 .load_balance = load_balance_rt, 251 .load_balance = load_balance_rt,
252 .move_one_task = move_one_task_rt,
253#endif
240 254
241 .set_curr_task = set_curr_task_rt, 255 .set_curr_task = set_curr_task_rt,
242 .task_tick = task_tick_rt, 256 .task_tick = task_tick_rt,
diff --git a/kernel/user.c b/kernel/user.c
index e91331c457e2..0f3aa0234107 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -129,7 +129,7 @@ static inline void uids_mutex_unlock(void)
129} 129}
130 130
131/* return cpu shares held by the user */ 131/* return cpu shares held by the user */
132ssize_t cpu_shares_show(struct kset *kset, char *buffer) 132static ssize_t cpu_shares_show(struct kset *kset, char *buffer)
133{ 133{
134 struct user_struct *up = container_of(kset, struct user_struct, kset); 134 struct user_struct *up = container_of(kset, struct user_struct, kset);
135 135
@@ -137,7 +137,8 @@ ssize_t cpu_shares_show(struct kset *kset, char *buffer)
137} 137}
138 138
139/* modify cpu shares held by the user */ 139/* modify cpu shares held by the user */
140ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size) 140static ssize_t cpu_shares_store(struct kset *kset, const char *buffer,
141 size_t size)
141{ 142{
142 struct user_struct *up = container_of(kset, struct user_struct, kset); 143 struct user_struct *up = container_of(kset, struct user_struct, kset);
143 unsigned long shares; 144 unsigned long shares;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1faa5087dc86..1e5f207b9074 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -9,6 +9,14 @@ config PRINTK_TIME
9 operations. This is useful for identifying long delays 9 operations. This is useful for identifying long delays
10 in kernel startup. 10 in kernel startup.
11 11
12config ENABLE_WARN_DEPRECATED
13 bool "Enable __deprecated logic"
14 default y
15 help
16 Enable the __deprecated logic in the kernel build.
17 Disable this to suppress the "warning: 'foo' is deprecated
18 (declared at kernel/power/somefile.c:1234)" messages.
19
12config ENABLE_MUST_CHECK 20config ENABLE_MUST_CHECK
13 bool "Enable __must_check logic" 21 bool "Enable __must_check logic"
14 default y 22 default y
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4e2c84fcf276..7b7c6c44c2da 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2045,9 +2045,7 @@ skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
2045 if (copy > 0) { 2045 if (copy > 0) {
2046 if (copy > len) 2046 if (copy > len)
2047 copy = len; 2047 copy = len;
2048 sg_set_page(&sg[elt], virt_to_page(skb->data + offset)); 2048 sg_set_buf(sg, skb->data + offset, copy);
2049 sg[elt].offset = (unsigned long)(skb->data + offset) % PAGE_SIZE;
2050 sg[elt].length = copy;
2051 elt++; 2049 elt++;
2052 if ((len -= copy) == 0) 2050 if ((len -= copy) == 0)
2053 return elt; 2051 return elt;
@@ -2065,9 +2063,8 @@ skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
 
 		if (copy > len)
 			copy = len;
-		sg_set_page(&sg[elt], frag->page);
-		sg[elt].offset = frag->page_offset+offset-start;
-		sg[elt].length = copy;
+		sg_set_page(&sg[elt], frag->page, copy,
+			    frag->page_offset+offset-start);
 		elt++;
 		if (!(len -= copy))
 			return elt;
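
Both skb_to_sgvec() branches now lean on the <linux/scatterlist.h> helpers: sg_set_buf() for the linear skb area and the four-argument sg_set_page() for paged fragments. Their shape is roughly:

	static inline void sg_set_page(struct scatterlist *sg, struct page *page,
				       unsigned int len, unsigned int offset)
	{
		sg_assign_page(sg, page);	/* preserves the low chain/end bits */
		sg->offset = offset;
		sg->length = len;
	}

	static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
				      unsigned int buflen)
	{
		/* A kernel-virtual buffer is just a page plus an offset. */
		sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
	}

Filling length and offset through one call, instead of poking the struct fields directly, is what lets the entry's marker bits stay consistent.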
diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c
index 811777682e2b..4cce3534e408 100644
--- a/net/ieee80211/ieee80211_crypt_tkip.c
+++ b/net/ieee80211/ieee80211_crypt_tkip.c
@@ -25,7 +25,7 @@
 #include <net/ieee80211.h>
 
 #include <linux/crypto.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/crc32.h>
 
 MODULE_AUTHOR("Jouni Malinen");
@@ -537,13 +537,8 @@ static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr,
 		return -1;
 	}
 	sg_init_table(sg, 2);
-	sg_set_page(&sg[0], virt_to_page(hdr));
-	sg[0].offset = offset_in_page(hdr);
-	sg[0].length = 16;
-
-	sg_set_page(&sg[1], virt_to_page(data));
-	sg[1].offset = offset_in_page(data);
-	sg[1].length = data_len;
+	sg_set_buf(&sg[0], hdr, 16);
+	sg_set_buf(&sg[1], data, data_len);
 
 	if (crypto_hash_setkey(tfm_michael, key, 8))
 		return -1;
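
The sg_init_table() call above is what makes the shortened setup correct: it zeroes the array and marks the final entry as the end of the table before any buffer is assigned. Roughly:

	static inline void sg_init_table(struct scatterlist *sgl,
					 unsigned int nents)
	{
		memset(sgl, 0, sizeof(*sgl) * nents);	/* clear stale state */
		sg_mark_end(&sgl[nents - 1]);		/* terminate the table */
	}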
diff --git a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c
index 9693429489ed..866fc04c44f9 100644
--- a/net/ieee80211/ieee80211_crypt_wep.c
+++ b/net/ieee80211/ieee80211_crypt_wep.c
@@ -22,7 +22,7 @@
 #include <net/ieee80211.h>
 
 #include <linux/crypto.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/crc32.h>
 
 MODULE_AUTHOR("Jouni Malinen");
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index cbd64b216cce..621113a109b2 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -727,9 +727,7 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
 	/* set up scatter list */
 	end = skb_tail_pointer(skb);
 	sg_init_table(&sg, 1);
-	sg_set_page(&sg, virt_to_page(auth));
-	sg.offset = (unsigned long)(auth) % PAGE_SIZE;
-	sg.length = end - (unsigned char *)auth;
+	sg_set_buf(&sg, auth, end - (unsigned char *)auth);
 
 	desc.tfm = asoc->ep->auth_hmacs[hmac_id];
 	desc.flags = 0;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 658476c4d587..c055212875f6 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1514,9 +1514,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
 
 	/* Sign the message. */
 	sg_init_table(&sg, 1);
-	sg_set_page(&sg, virt_to_page(&cookie->c));
-	sg.offset = (unsigned long)(&cookie->c) % PAGE_SIZE;
-	sg.length = bodysize;
+	sg_set_buf(&sg, &cookie->c, bodysize);
 	keylen = SCTP_SECRET_SIZE;
 	key = (char *)ep->secret_key[ep->current_key];
 	desc.tfm = sctp_sk(ep->base.sk)->hmac;
@@ -1587,9 +1585,7 @@ struct sctp_association *sctp_unpack_cookie(
 	/* Check the signature. */
 	keylen = SCTP_SECRET_SIZE;
 	sg_init_table(&sg, 1);
-	sg_set_page(&sg, virt_to_page(bear_cookie));
-	sg.offset = (unsigned long)(bear_cookie) % PAGE_SIZE;
-	sg.length = bodysize;
+	sg_set_buf(&sg, bear_cookie, bodysize);
 	key = (char *)ep->secret_key[ep->current_key];
 	desc.tfm = sctp_sk(ep->base.sk)->hmac;
 	desc.flags = 0;
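
Each sctp conversion above builds a one-entry table with sg_init_table(&sg, 1) followed by sg_set_buf(). The scatterlist API also provides sg_init_one(), which folds the two calls together:

	static inline void sg_init_one(struct scatterlist *sg, const void *buf,
				       unsigned int buflen)
	{
		sg_init_table(sg, 1);	/* zero and terminate the single entry */
		sg_set_buf(sg, buf, buflen);
	}

So, for example, the auth.c hunk could equivalently read sg_init_one(&sg, auth, end - (unsigned char *)auth).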
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 32be431affcf..24711be4b2dc 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -199,7 +199,7 @@ encryptor(struct scatterlist *sg, void *data)
 	} else {
 		in_page = sg_page(sg);
 	}
-	sg_set_page(&desc->infrags[desc->fragno], in_page);
+	sg_assign_page(&desc->infrags[desc->fragno], in_page);
 	desc->fragno++;
 	desc->fraglen += sg->length;
 	desc->pos += sg->length;
@@ -215,11 +215,10 @@ encryptor(struct scatterlist *sg, void *data)
 	if (ret)
 		return ret;
 	if (fraglen) {
-		sg_set_page(&desc->outfrags[0], sg_page(sg));
-		desc->outfrags[0].offset = sg->offset + sg->length - fraglen;
-		desc->outfrags[0].length = fraglen;
+		sg_set_page(&desc->outfrags[0], sg_page(sg), fraglen,
+			    sg->offset + sg->length - fraglen);
 		desc->infrags[0] = desc->outfrags[0];
-		sg_set_page(&desc->infrags[0], in_page);
+		sg_assign_page(&desc->infrags[0], in_page);
 		desc->fragno = 1;
 		desc->fraglen = fraglen;
 	} else {
@@ -287,9 +286,8 @@ decryptor(struct scatterlist *sg, void *data)
 	if (ret)
 		return ret;
 	if (fraglen) {
-		sg_set_page(&desc->frags[0], sg_page(sg));
-		desc->frags[0].offset = sg->offset + sg->length - fraglen;
-		desc->frags[0].length = fraglen;
+		sg_set_page(&desc->frags[0], sg_page(sg), fraglen,
+			    sg->offset + sg->length - fraglen);
 		desc->fragno = 1;
 		desc->fraglen = fraglen;
 	} else {
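
The encryptor/decryptor hunks distinguish two cases: sg_set_page() where length and offset change as well, and sg_assign_page() where only the backing page is swapped while the entry's offset, length, and low marker bits stay intact. The helper is roughly:

	static inline void sg_assign_page(struct scatterlist *sg,
					  struct page *page)
	{
		unsigned long page_link = sg->page_link & 0x3; /* chain/end bits */

		/* Replace the page pointer, keep the marker bits. */
		sg->page_link = page_link | (unsigned long) page;
	}

That distinction matters here because infrags[0] is first copied wholesale from outfrags[0] and then only its page is redirected at in_page.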
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 3d1f7cdf9dd0..f38dac30481b 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1059,9 +1059,7 @@ xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
 	do {
 		if (thislen > page_len)
 			thislen = page_len;
-		sg_set_page(sg, buf->pages[i]);
-		sg->offset = page_offset;
-		sg->length = thislen;
+		sg_set_page(sg, buf->pages[i], thislen, page_offset);
 		ret = actor(sg, data);
 		if (ret)
 			goto out;
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 313d4bed3aa9..fa45989a716a 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -553,9 +553,7 @@ int skb_icv_walk(const struct sk_buff *skb, struct hash_desc *desc,
 		if (copy > len)
 			copy = len;
 
-		sg_set_page(&sg, virt_to_page(skb->data + offset));
-		sg.offset = (unsigned long)(skb->data + offset) % PAGE_SIZE;
-		sg.length = copy;
+		sg_set_buf(&sg, skb->data + offset, copy);
 
 		err = icv_update(desc, &sg, copy);
 		if (unlikely(err))
@@ -578,9 +576,8 @@ int skb_icv_walk(const struct sk_buff *skb, struct hash_desc *desc,
 		if (copy > len)
 			copy = len;
 
-		sg_set_page(&sg, frag->page);
-		sg.offset = frag->page_offset + offset-start;
-		sg.length = copy;
+		sg_set_page(&sg, frag->page, copy,
+			    frag->page_offset + offset-start);
 
 		err = icv_update(desc, &sg, copy);
 		if (unlikely(err))
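
skb_icv_walk() follows the same shape as skb_to_sgvec() earlier in this series: linear skb bytes map with sg_set_buf(), paged fragments with the four-argument sg_set_page(). A hypothetical condensed walker showing that shared pattern (frag-list recursion and error paths elided; not a function from this patch):

	/* Illustration only: fill sg[] from an skb, linear area first,
	 * then one entry per overlapping paged fragment. */
	static int fill_skb_sg(const struct sk_buff *skb, struct scatterlist *sg,
			       int offset, int len)
	{
		int i, elt = 0, start = skb_headlen(skb);
		int copy = start - offset;

		if (copy > 0) {			/* linear area */
			if (copy > len)
				copy = len;
			sg_set_buf(&sg[elt++], skb->data + offset, copy);
			len -= copy;
			offset += copy;
		}
		for (i = 0; len > 0 && i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			int end = start + frag->size;

			if (offset < end) {	/* fragment overlaps the range */
				copy = end - offset;
				if (copy > len)
					copy = len;
				sg_set_page(&sg[elt++], frag->page, copy,
					    frag->page_offset + offset - start);
				len -= copy;
				offset += copy;
			}
			start = end;
		}
		return elt;	/* number of entries filled */
	}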