x86, SGI UV: TLB shootdown using broadcast assist unit

TLB shootdown for SGI UV.

Depends on patch (in tip/x86/irq): x86-update-macros-used-by-uv-platform.patch (Jack Steiner, May 29).

This patch provides the ability to flush TLBs in cpus that are not on the local node. The hardware mechanism for distributing the flush messages is the UV's "broadcast assist unit". The hook to intercept TLB shootdown requests is a 2-line change to native_flush_tlb_others() (arch/x86/kernel/tlb_64.c).

This code has been tested on a hardware simulator. The real hardware is not yet available.

The shootdown statistics are provided through /proc/sgi_uv/ptc_statistics. The use of /sys was considered, but would have required the use of many /sys files. Debugfs was also considered, but these statistics should be available on an ongoing basis, not just for debugging.

Issues to be fixed later:
- The IRQ for the messaging interrupt is currently hardcoded as 200 (see UV_BAU_MESSAGE). It should be dynamically assigned in the future.
- The use of appropriate udelay()'s is untested, as they are a problem in the simulator.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Cliff Wickman <cpw@sgi.com> 2008-06-02 09:56:14 -0400
committer: Ingo Molnar <mingo@elte.hu> 2008-07-08 06:23:22 -0400
commit: 1812924bb1823950c1dc95c478b71b037057356e (patch)
tree: 74ecf29e332a320d7850008ca4f8607dace88de6 /arch/x86/kernel/tlb_uv.c
parent: d98b940ab29a245de84a1c138b866dcc29217601 (diff)
1 files changed, 736 insertions, 0 deletions
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
new file mode 100644
index 000000000000..28e7c68d9d78
--- /dev/null
+++ b/arch/x86/kernel/tlb_uv.c
@@ -0,0 +1,736 @@
+/*
+ *      SGI UltraViolet TLB flush routines.
+ *
+ *      (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
+ *
+ *      This code is released under the GNU General Public License version 2 or
+ *      later.
+ */
+#include <linux/mc146818rtc.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel.h>
+#include <asm/mach-bigsmp/mach_apic.h>
+#include <asm/mmu_context.h>
+#include <asm/idle.h>
+#include <asm/genapic.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_bau.h>
+struct bau_control **uv_bau_table_bases; /* bau_control pointer table, allocated and filled by uv_bau_init() */
+static int uv_bau_retry_limit; /* destination examinations allowed before uv_flush_tlb_others() gives up */
+static int uv_nshift;           /* position of pnode (which is nasid>>1) */
+static unsigned long uv_mmask; /* (1 << n_val) - 1, computed in uv_bau_init() */
+char *status_table[] = { /* NOTE(review): presumably names of the descriptor status codes; unused in the visible code */
+        "IDLE",
+        "ACTIVE",
+        "DESTINATION TIMEOUT",
+        "SOURCE TIMEOUT"
+};
+DEFINE_PER_CPU(struct ptc_stats, ptcstats); /* per-cpu shootdown counters reported by uv_ptc_seq_show() */
+DEFINE_PER_CPU(struct bau_control, bau_control); /* per-cpu queue/descriptor pointers, set up in uv_bau_init() */
+/*
+ * Free a software acknowledge hardware resource by clearing its Pending
+ * bit. This will return a reply to the sender.
+ * If the message has timed out, a reply has already been sent by the
+ * hardware but the resource has not been released. In that case our
+ * clear of the Timeout bit (as well) will free the resource. No reply will
+ * be sent (the hardware will only do one reply per message).
+ */
+static void
+uv_reply_to_message(int resource,
+                    struct bau_payload_queue_entry *msg,
+                    struct bau_msg_status *msp)
+{
+        int ack_bits;
+        /* the bit at 'resource' plus the one UV_SW_ACK_NPENDING above it */
+        ack_bits = 1 << resource;
+        ack_bits |= 1 << (resource + UV_SW_ACK_NPENDING);
+        /* mark the queue entry as answered before writing the register */
+        msg->replied_to = 1;
+        msg->sw_ack_vector = 0;
+        /* msp is optional; when given, forget which cpus saw the message */
+        if (msp != NULL)
+                msp->seen_by.bits = 0;
+        uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+                           ack_bits);
+}
+/*
+ * Do all the things a cpu should do for a TLB shootdown message.
+ * Other cpu's may come here at the same time for this message.
+ *
+ * msg:         payload queue entry being processed
+ * msg_slot:    index of msg in the payload queue; selects the matching
+ *              bau_msg_status in this cpu's bau_control.msg_statuses
+ * sw_ack_slot: software-acknowledge resource number handed on to
+ *              uv_reply_to_message()
+ *
+ * The cpu records itself in the seen_by mask, flushes either its whole
+ * TLB or the single address carried by the message, and increments
+ * acknowledge_count.  A cpu that then finds acknowledge_count equal to
+ * number_of_cpus sends the reply.
+ */
+static void
+uv_bau_process_message(struct bau_payload_queue_entry *msg,
+                       int msg_slot, int sw_ack_slot)
+{
+        int cpu;
+        unsigned long this_cpu_mask;
+        struct bau_msg_status *msp;
+        msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; /* status record paired with this queue slot */
+        cpu = uv_blade_processor_id();
+        msg->number_of_cpus =
+            uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); /* acks expected: online cpus on this blade */
+        this_cpu_mask = (unsigned long)1 << cpu;
+        if (msp->seen_by.bits & this_cpu_mask)
+                return; /* this cpu has already handled the message */
+        atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
+        if (msg->replied_to == 1)
+                return; /* reply already sent; nothing left to do */
+        if (msg->address == TLB_FLUSH_ALL) {
+                local_flush_tlb();
+                __get_cpu_var(ptcstats).alltlb++;
+        } else {
+                __flush_tlb_one(msg->address);
+                __get_cpu_var(ptcstats).onetlb++;
+        }
+        __get_cpu_var(ptcstats).requestee++;
+        atomic_inc_short(&msg->acknowledge_count);
+        if (msg->number_of_cpus == msg->acknowledge_count)
+                uv_reply_to_message(sw_ack_slot, msg, msp); /* every expected cpu has acknowledged */
+        return;
+}
+/*
+ * Examine the payload queue on all the distribution nodes to see
+ * which messages have not been seen, and which cpu(s) have not seen them.
+ *
+ * Returns the number of cpu's that have not responded.
+ */
+static int
+uv_examine_destinations(struct bau_target_nodemask *distribution)
+{
+        int this_cpu = smp_processor_id();
+        int nbits = sizeof(struct bau_target_nodemask) * BITSPERBYTE;
+        int node;
+        int slot;
+        int c;
+        int unseen = 0;
+        struct bau_control *tables;
+        struct bau_payload_queue_entry *msg;
+        struct bau_msg_status *status;
+        for (node = 0; node < nbits; node++) {
+                /* only look at destinations that were actually targeted */
+                if (!bau_node_isset(node, distribution))
+                        continue;
+                tables = uv_bau_table_bases[node];
+                msg = tables->va_queue_first;
+                for (slot = 0; slot < DESTINATION_PAYLOAD_QUEUE_SIZE;
+                     slot++, msg++) {
+                        /* skip entries from other senders or already
+                           replied to */
+                        if (msg->sending_cpu != this_cpu || msg->replied_to)
+                                continue;
+                        status = tables->msg_statuses + slot;
+                        printk(KERN_DEBUG
+                        "blade %d: address:%#lx %d of %d, not cpu(s): ",
+                               node, msg->address, msg->acknowledge_count,
+                               msg->number_of_cpus);
+                        /* list and count every cpu missing from seen_by */
+                        for (c = 0; c < msg->number_of_cpus; c++) {
+                                if (status->seen_by.bits & ((long)1 << c))
+                                        continue;
+                                unseen++;
+                                printk("%d ", c);
+                        }
+                        printk("\n");
+                }
+        }
+        return unseen;
+}
+/**
+ * uv_flush_tlb_others - globally purge translation cache of a virtual
+ * address or all TLB's
+ * @cpumaskp: mask of all cpu's in which the address is to be removed
+ * @mm: mm_struct containing virtual address range (not referenced here)
+ * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
+ *
+ * This is the entry point for initiating any UV global TLB shootdown.
+ *
+ * Purges the translation caches of all specified processors of the given
+ * virtual address, or purges all TLB's on specified processors.
+ *
+ * The caller has derived the cpumaskp from the mm_struct and has subtracted
+ * the local cpu from the mask.  This function is called only if there
+ * are bits set in the mask. (e.g. flush_tlb_page())
+ *
+ * The cpumaskp is converted into a nodemask of the nodes containing
+ * the cpus.
+ *
+ * On success the cpus on remote blades are cleared from @cpumaskp; cpus on
+ * the sender's own blade are never cleared here.
+ *
+ * Returns 1 if @cpumaskp is empty on exit (nothing left for the caller to
+ * do), 0 if the caller must still do an IPI-style shootdown of the cpus
+ * remaining in the mask.
+ */
+int
+uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, unsigned long va)
+{
+        int i;
+        int blade;
+        int cpu;
+        int bit;
+        int right_shift;
+        int this_blade;
+        int exams = 0;
+        int tries = 0;
+        long source_timeouts = 0;
+        long destination_timeouts = 0;
+        unsigned long index;
+        unsigned long mmr_offset;
+        unsigned long descriptor_status;
+        struct bau_activation_descriptor *bau_desc;
+        ktime_t time1, time2;
+        cpu = uv_blade_processor_id();
+        this_blade = uv_numa_blade_id();
+        /* pick this cpu's descriptor out of the blade's descriptor array */
+        bau_desc = __get_cpu_var(bau_control).descriptor_base;
+        bau_desc += (UV_ITEMS_PER_DESCRIPTOR * cpu);
+        bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+        /* build the distribution mask; i counts the remote cpus targeted */
+        i = 0;
+        for_each_cpu_mask(bit, *cpumaskp) {
+                blade = uv_cpu_to_blade_id(bit);
+                if (blade > (UV_DISTRIBUTION_SIZE - 1))
+                        BUG();
+                if (blade == this_blade)
+                        continue;
+                bau_node_set(blade, &bau_desc->distribution);
+                /* leave the bits for the remote cpu's in the mask until
+                   success; on failure we fall back to the IPI method */
+                i++;
+        }
+        if (i == 0)
+                goto none_to_flush;
+        __get_cpu_var(ptcstats).requestor++;
+        __get_cpu_var(ptcstats).ntargeted += i;
+        bau_desc->payload.address = va;
+        bau_desc->payload.sending_cpu = smp_processor_id();
+        /* this cpu's status field lives in one of two status registers */
+        if (cpu < UV_CPUS_PER_ACT_STATUS) {
+                mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+                right_shift = cpu * UV_ACT_STATUS_SIZE;
+        } else {
+                mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+                right_shift =
+                    ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
+        }
+        time1 = ktime_get();
+retry:
+        tries++;
+        /* activate (push) this cpu's descriptor */
+        index = ((unsigned long)
+                 1 << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | cpu;
+        uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+        /* poll the status field until the descriptor goes idle */
+        while ((descriptor_status = (((unsigned long)
+                                      uv_read_local_mmr(mmr_offset) >>
+                                      right_shift) & UV_ACT_STATUS_MASK)) !=
+               DESC_STATUS_IDLE) {
+                if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+                        source_timeouts++;
+                        if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
+                                source_timeouts = 0;
+                        __get_cpu_var(ptcstats).s_retry++;
+                        goto retry;
+                }
+                /* spin here looking for progress at the destinations */
+                if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
+                        destination_timeouts++;
+                        if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
+                                /* returns # of cpus not responding */
+                                if (uv_examine_destinations
+                                    (&bau_desc->distribution) == 0) {
+                                        __get_cpu_var(ptcstats).d_retry++;
+                                        goto retry;
+                                }
+                                exams++;
+                                if (exams >= uv_bau_retry_limit) {
+                                        /* a single printk, so the function
+                                           name and the text are not run
+                                           together in the log */
+                                        printk(KERN_DEBUG
+                                "uv_flush_tlb_others: giving up on cpu %d\n",
+                                               smp_processor_id());
+                                        goto unsuccessful;
+                                }
+                                /* delays can hang up the simulator
+                                   udelay(1000);
+                                 */
+                                destination_timeouts = 0;
+                        }
+                }
+        }
+        if (tries > 1)
+                __get_cpu_var(ptcstats).retriesok++;
+        /* on success, clear the remote cpu's from the mask so we don't
+           use the IPI method of shootdown on them */
+        for_each_cpu_mask(bit, *cpumaskp) {
+                blade = uv_cpu_to_blade_id(bit);
+                if (blade == this_blade)
+                        continue;
+                cpu_clear(bit, *cpumaskp);
+        }
+unsuccessful:
+        /* account the time spent, whether or not the flush succeeded */
+        time2 = ktime_get();
+        __get_cpu_var(ptcstats).sflush_ns += (time2.tv64 - time1.tv64);
+none_to_flush:
+        if (cpus_empty(*cpumaskp))
+                return 1;
+        /* Cause the caller to do an IPI-style TLB shootdown on
+           the cpu's still in the mask */
+        __get_cpu_var(ptcstats).ptc_i++;
+        return 0;
+}
+/*
+ * The BAU message interrupt comes here. (registered by set_intr_gate)
+ * See entry_64.S
+ *
+ * We received a broadcast assist message.
+ *
+ * Interrupts may have been disabled; this interrupt could represent
+ * the receipt of several messages.
+ *
+ * All cores/threads on this node get this interrupt.
+ * The last one to see it does the s/w ack.
+ * (the resource will not be freed until noninterruptable cpus see this
+ *  interrupt; hardware will timeout the s/w ack and reply ERROR)
+ */
+void
+uv_bau_message_interrupt(struct pt_regs *regs)
+{
+        struct pt_regs *saved_regs = set_irq_regs(regs);
+        struct bau_control *bcp;
+        struct ptc_stats *stats;
+        struct bau_payload_queue_entry *queue_base;
+        struct bau_payload_queue_entry *cur;
+        ktime_t start, end;
+        int handled = 0;
+        unsigned long local_pnode;
+        ack_APIC_irq();
+        exit_idle();
+        irq_enter();
+        start = ktime_get();
+        local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
+        bcp = &__get_cpu_var(bau_control);
+        stats = &__get_cpu_var(ptcstats);
+        queue_base = bcp->va_queue_first;
+        cur = bcp->bau_msg_head;
+        /* consume queue entries until one with no ack bits set is found */
+        while (cur->sw_ack_vector) {
+                int ack_vec = cur->sw_ack_vector;
+                handled++;
+                /* slot = index in the queue; resource = lowest set bit */
+                uv_bau_process_message(cur, cur - queue_base,
+                                       ffs(ack_vec) - 1);
+                /* advance, wrapping from the last entry to the first */
+                if (cur + 1 > bcp->va_queue_last)
+                        cur = bcp->va_queue_first;
+                else
+                        cur = cur + 1;
+                bcp->bau_msg_head = cur;
+        }
+        if (handled == 0)
+                stats->nomsg++;
+        else if (handled > 1)
+                stats->multmsg++;
+        end = ktime_get();
+        stats->dflush_ns += (end.tv64 - start.tv64);
+        irq_exit();
+        set_irq_regs(saved_regs);
+}
+static void
+uv_enable_timeouts(void)
+{
+        int node;
+        int blade;
+        int prev_blade = -1;
+        int pnode;
+        int first_cpu = 0;
+        unsigned long apicid;
+        /*
+         * Walk the online nodes, skipping a node whose blade is the same
+         * as the previous node's (better if we had each_online_blade).
+         * NOTE(review): apicid and pnode are computed but nothing is
+         * written to the hardware yet; this looks like a placeholder.
+         * NOTE(review): uv_blade_nr_possible_cpus() is given the node id
+         * here - confirm whether the blade id was intended.
+         */
+        for_each_online_node(node) {
+                blade = uv_node_to_blade_id(node);
+                if (blade != prev_blade) {
+                        prev_blade = blade;
+                        apicid = per_cpu(x86_cpu_to_apicid, first_cpu);
+                        pnode = uv_blade_to_pnode(blade);
+                        first_cpu += uv_blade_nr_possible_cpus(node);
+                }
+        }
+}
+static void *
+uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
+{
+        /* the iterator cookie is the position itself, read as a cpu number;
+           NULL ends the sequence once the position reaches the cpu count */
+        return (*offset < num_possible_cpus()) ? offset : NULL;
+}
+static void *
+uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
+{
+        /* step to the next cpu number; stop at the possible-cpu count */
+        ++*offset;
+        if (*offset >= num_possible_cpus())
+                return NULL;
+        return offset;
+}
+static void uv_ptc_seq_stop(struct seq_file *file, void *data)
+{
+        /* start/next take no locks and allocate nothing: nothing to undo */
+}
+/*
+ * Display the statistics thru /proc
+ * data points to the cpu number
+ */
+static int
+uv_ptc_seq_show(struct seq_file *file, void *data)
+{
+        struct ptc_stats *stat;
+        int pnode;
+        int cpu = *(loff_t *)data;
+        /* the column header is printed once, ahead of cpu 0's record */
+        if (cpu == 0) {
+                seq_printf(file,
+                "# cpu requestor requestee one all sretry dretry ptc_i ");
+                seq_printf(file,
+                "sw_ack sflush_us dflush_us sok dnomsg dmult starget\n");
+        }
+        /* no record for an out-of-range or offline cpu */
+        if (cpu >= num_possible_cpus() || !cpu_online(cpu))
+                return 0;
+        stat = &per_cpu(ptcstats, cpu);
+        pnode = uv_blade_to_pnode(uv_cpu_to_blade_id(cpu));
+        seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
+                   cpu, stat->requestor,
+                   stat->requestee, stat->onetlb, stat->alltlb,
+                   stat->s_retry, stat->d_retry, stat->ptc_i);
+        /* sw_ack is read live from the cpu's blade; the *_ns counters are
+           shown in microseconds */
+        seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
+                   uv_read_global_mmr64(pnode,
+                                UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
+                   stat->sflush_ns / 1000, stat->dflush_ns / 1000,
+                   stat->retriesok, stat->nomsg,
+                   stat->multmsg, stat->ntargeted);
+        return 0;
+}
+/*
+ * Write handler for the statistics proc file.  The text written is parsed
+ * as a decimal number:
+ *  0: display meaning of the statistics (via printk)
+ * >0: retry limit
+ *
+ * The last byte written is overwritten with a NUL (normally the trailing
+ * newline), so at most sizeof(optstr) bytes are accepted.
+ *
+ * Returns count on success, -EINVAL for an empty, oversized or non-numeric
+ * write, -EFAULT if the user buffer cannot be read.
+ */
+static ssize_t
+uv_ptc_proc_write(struct file *file, const char __user *user,
+                  size_t count, loff_t *data)
+{
+        unsigned long newmode;  /* strict_strtoul() stores an unsigned long */
+        char optstr[64];
+        /* reject writes that would underflow or overrun optstr[] */
+        if (count == 0 || count > sizeof(optstr))
+                return -EINVAL;
+        if (copy_from_user(optstr, user, count))
+                return -EFAULT;
+        optstr[count - 1] = '\0';
+        if (strict_strtoul(optstr, 10, &newmode) < 0) {
+                printk(KERN_DEBUG "%s is invalid\n", optstr);
+                return -EINVAL;
+        }
+        if (newmode == 0) {
+                printk(KERN_DEBUG "# cpu:      cpu number\n");
+                printk(KERN_DEBUG
+                "requestor:  times this cpu was the flush requestor\n");
+                printk(KERN_DEBUG
+                "requestee:  times this cpu was requested to flush its TLBs\n");
+                printk(KERN_DEBUG
+                "one:        times requested to flush a single address\n");
+                printk(KERN_DEBUG
+                "all:        times requested to flush all TLB's\n");
+                printk(KERN_DEBUG
+                "sretry:     number of retries of source-side timeouts\n");
+                printk(KERN_DEBUG
+                "dretry:     number of retries of destination-side timeouts\n");
+                printk(KERN_DEBUG
+                "ptc_i:      times UV fell through to IPI-style flushes\n");
+                printk(KERN_DEBUG
+                "sw_ack:     image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
+                printk(KERN_DEBUG
+                "sflush_us:  microseconds spent in uv_flush_tlb_others()\n");
+                printk(KERN_DEBUG
+                "dflush_us:  microseconds spent in handling flush requests\n");
+                printk(KERN_DEBUG "sok:        successes on retry\n");
+                printk(KERN_DEBUG "dnomsg:     interrupts with no message\n");
+                printk(KERN_DEBUG
+                "dmult:      interrupts with multiple messages\n");
+                printk(KERN_DEBUG "starget:    nodes targeted\n");
+        } else {
+                uv_bau_retry_limit = newmode;
+                printk(KERN_DEBUG "timeout retry limit:%d\n",
+                       uv_bau_retry_limit);
+        }
+        return count;
+}
+static const struct seq_operations uv_ptc_seq_ops = { /* iterator used by uv_ptc_proc_open() */
+        .start = uv_ptc_seq_start, /* position -> cpu number, or NULL past the end */
+        .next = uv_ptc_seq_next, /* advance to the next cpu number */
+        .stop = uv_ptc_seq_stop, /* empty */
+        .show = uv_ptc_seq_show /* print one cpu's statistics line */
+};
+static int
+uv_ptc_proc_open(struct inode *inode, struct file *file)
+{
+        int rc;
+        /* attach the statistics iterator to this open file */
+        rc = seq_open(file, &uv_ptc_seq_ops);
+        return rc;
+}
+static const struct file_operations proc_uv_ptc_operations = { /* installed on the proc entry by uv_ptc_init() */
+        .open = uv_ptc_proc_open,
+        .read = seq_read,
+        .write = uv_ptc_proc_write, /* 0 prints the legend, other values set the retry limit */
+        .llseek = seq_lseek,
+        .release = seq_release,
+};
+static struct proc_dir_entry *proc_uv_ptc; /* entry created by uv_ptc_init() */
+static int __init
+uv_ptc_init(void)
+{
+        struct proc_dir_entry *sgi_proc_dir;
+        /* nothing to publish on non-UV systems */
+        if (!is_uv_system())
+                return 0;
+        sgi_proc_dir = proc_mkdir("sgi_uv", NULL);
+        if (!sgi_proc_dir)
+                return -EINVAL;
+        /* NOTE(review): mode is 0444 although a write handler is installed
+           below - confirm that is intended */
+        proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
+        if (!proc_uv_ptc) {
+                printk(KERN_ERR "unable to create %s proc entry\n",
+                       UV_PTC_BASENAME);
+                /* don't leave the directory created above behind */
+                remove_proc_entry("sgi_uv", NULL);
+                return -EINVAL;
+        }
+        proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
+        return 0;
+}
+static void __exit
+uv_ptc_exit(void)
+{
+        /* uv_ptc_init() creates nothing on non-UV systems */
+        if (!proc_uv_ptc)
+                return;
+        remove_proc_entry(UV_PTC_BASENAME, NULL);
+        /* also drop the directory that uv_ptc_init() made with proc_mkdir() */
+        remove_proc_entry("sgi_uv", NULL);
+        proc_uv_ptc = NULL;
+}
+module_init(uv_ptc_init);
+module_exit(uv_ptc_exit);
+/*
+ * Initialization of BAU-related structures
+ *
+ * Does nothing on non-UV systems.  Otherwise, for each blade found while
+ * walking the online nodes, this allocates (on that node):
+ *   - a bau_control with its msg_statuses and watching arrays,
+ *   - the activation descriptor array, whose address is written to
+ *     UVH_LB_BAU_SB_DESCRIPTOR_BASE,
+ *   - the destination payload queue, whose first/tail/last addresses are
+ *     written to the UVH_LB_BAU_INTD_PAYLOAD_QUEUE_* registers,
+ * then points the per-cpu bau_control of each of the blade's cpus at
+ * them.  Finally the message interrupt gate is installed.
+ *
+ * Allocation failures and layout mismatches are fatal (BUG()).
+ * Always returns 0.
+ */
+int __init
+uv_bau_init(void)
+{
+        int i;
+        int j;
+        int blade;
+        int nblades;
+        int *ip;
+        int pnode;
+        int last_blade;
+        int cur_cpu = 0;
+        unsigned long pa;
+        unsigned long n;
+        unsigned long m;
+        unsigned long mmr_image;
+        unsigned long apicid;
+        char *cp;
+        struct bau_control *bau_tablesp;
+        struct bau_activation_descriptor *adp, *ad2;
+        struct bau_payload_queue_entry *pqp;
+        struct bau_msg_status *msp;
+        struct bau_control *bcp;
+        if (!is_uv_system())
+                return 0;
+        uv_bau_retry_limit = 1;
+        if ((sizeof(struct bau_local_cpumask) * BITSPERBYTE) <
+            MAX_CPUS_PER_NODE) {
+                printk(KERN_ERR
+                        "uv_bau_init: bau_local_cpumask.bits too small\n");
+                BUG();
+        }
+        uv_nshift = uv_hub_info->n_val;
+        uv_mmask = ((unsigned long)1 << uv_hub_info->n_val) - 1;
+        /* first pass: count the blades to size uv_bau_table_bases */
+        nblades = 0;
+        last_blade = -1;
+        for_each_online_node(i) {
+                blade = uv_node_to_blade_id(i);
+                if (blade == last_blade)
+                        continue;
+                last_blade = blade;
+                nblades++;
+        }
+        uv_bau_table_bases = (struct bau_control **)
+            kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
+        if (!uv_bau_table_bases)
+                BUG();
+        /* better if we had each_online_blade */
+        last_blade = -1;
+        for_each_online_node(i) {
+                blade = uv_node_to_blade_id(i);
+                if (blade == last_blade)
+                        continue;
+                last_blade = blade;
+                bau_tablesp =
+                    kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, i);
+                if (!bau_tablesp)
+                        BUG();
+                bau_tablesp->msg_statuses =
+                    kmalloc_node(sizeof(struct bau_msg_status) *
+                                 DESTINATION_PAYLOAD_QUEUE_SIZE, GFP_KERNEL, i);
+                if (!bau_tablesp->msg_statuses)
+                        BUG();
+                for (j = 0, msp = bau_tablesp->msg_statuses;
+                     j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, msp++) {
+                        bau_cpubits_clear(&msp->seen_by, (int)
+                                          uv_blade_nr_possible_cpus(blade));
+                }
+                bau_tablesp->watching =
+                    kmalloc_node(sizeof(int) * DESTINATION_NUM_RESOURCES,
+                                 GFP_KERNEL, i);
+                if (!bau_tablesp->watching)
+                        BUG();
+                /* clear exactly the DESTINATION_NUM_RESOURCES ints that
+                   were allocated above */
+                for (j = 0, ip = bau_tablesp->watching;
+                     j < DESTINATION_NUM_RESOURCES; j++, ip++) {
+                        *ip = 0;
+                }
+                /* indexed by blade: the table has nblades slots and
+                   uv_examine_destinations() looks entries up by the blade
+                   bits of the distribution mask */
+                uv_bau_table_bases[blade] = bau_tablesp;
+                pnode = uv_blade_to_pnode(blade);
+                if (sizeof(struct bau_activation_descriptor) != 64)
+                        BUG();
+                adp = (struct bau_activation_descriptor *)
+                    kmalloc_node(16384, GFP_KERNEL, i);
+                if (!adp)
+                        BUG();
+                /* the descriptor array must start on a 4KB boundary */
+                if ((unsigned long)adp & 0xfff)
+                        BUG();
+                /* split the physical address at bit uv_nshift */
+                pa = __pa((unsigned long)adp);
+                n = pa >> uv_nshift;
+                m = pa & uv_mmask;
+                mmr_image = uv_read_global_mmr64(pnode,
+                                                 UVH_LB_BAU_SB_DESCRIPTOR_BASE);
+                if (mmr_image)
+                        uv_write_global_mmr64(pnode, (unsigned long)
+                                              UVH_LB_BAU_SB_DESCRIPTOR_BASE,
+                                              (n << UV_DESC_BASE_PNODE_SHIFT |
+                                               m));
+                for (j = 0, ad2 = adp; j < UV_ACTIVATION_DESCRIPTOR_SIZE;
+                     j++, ad2++) {
+                        memset(ad2, 0,
+                               sizeof(struct bau_activation_descriptor));
+                        ad2->header.sw_ack_flag = 1;
+                        ad2->header.base_dest_nodeid =
+                            uv_blade_to_pnode(uv_cpu_to_blade_id(0));
+                        ad2->header.command = UV_NET_ENDPOINT_INTD;
+                        ad2->header.int_both = 1;
+                        /* all others need to be set to zero:
+                           fairness chaining multilevel count replied_to */
+                }
+                /* one spare entry leaves room to round the start up to a
+                   32-byte boundary below */
+                pqp = (struct bau_payload_queue_entry *)
+                    kmalloc_node((DESTINATION_PAYLOAD_QUEUE_SIZE + 1) *
+                                 sizeof(struct bau_payload_queue_entry),
+                                 GFP_KERNEL, i);
+                if (!pqp)
+                        BUG();
+                if (sizeof(struct bau_payload_queue_entry) != 32)
+                        BUG();
+                if ((unsigned long)(&((struct bau_payload_queue_entry *)0)->
+                                    sw_ack_vector) != 15)
+                        BUG();
+                cp = (char *)pqp + 31;
+                pqp = (struct bau_payload_queue_entry *)
+                    (((unsigned long)cp >> 5) << 5);
+                bau_tablesp->va_queue_first = pqp;
+                uv_write_global_mmr64(pnode,
+                                      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
+                                      ((unsigned long)pnode <<
+                                       UV_PAYLOADQ_PNODE_SHIFT) |
+                                      uv_physnodeaddr(pqp));
+                uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
+                                      uv_physnodeaddr(pqp));
+                bau_tablesp->va_queue_last =
+                    pqp + (DESTINATION_PAYLOAD_QUEUE_SIZE - 1);
+                uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
+                                      (unsigned long)
+                                      uv_physnodeaddr(bau_tablesp->
+                                                      va_queue_last));
+                memset(pqp, 0, sizeof(struct bau_payload_queue_entry) *
+                       DESTINATION_PAYLOAD_QUEUE_SIZE);
+                /* this initialization can't be in firmware because the
+                   messaging IRQ will be determined by the OS */
+                apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+                pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
+                if ((pa & 0xff) != UV_BAU_MESSAGE) {
+                        uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
+                                              ((apicid << 32) |
+                                               UV_BAU_MESSAGE));
+                }
+                /* point every cpu of this blade at the blade's structures;
+                   the cpu count is looked up by blade id, as above */
+                for (j = cur_cpu;
+                     j < (cur_cpu + uv_blade_nr_possible_cpus(blade)); j++) {
+                        bcp = (struct bau_control *)&per_cpu(bau_control, j);
+                        bcp->bau_msg_head = bau_tablesp->va_queue_first;
+                        bcp->va_queue_first = bau_tablesp->va_queue_first;
+                        bcp->va_queue_last = bau_tablesp->va_queue_last;
+                        bcp->watching = bau_tablesp->watching;
+                        bcp->msg_statuses = bau_tablesp->msg_statuses;
+                        bcp->descriptor_base = adp;
+                }
+                cur_cpu += uv_blade_nr_possible_cpus(blade);
+        }
+        set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
+        uv_enable_timeouts();
+        return 0;
+}
+__initcall(uv_bau_init);
author	Cliff Wickman <cpw@sgi.com>	2008-06-02 09:56:14 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-07-08 06:23:22 -0400
commit	1812924bb1823950c1dc95c478b71b037057356e (patch)
tree	74ecf29e332a320d7850008ca4f8607dace88de6 /arch/x86/kernel/tlb_uv.c
parent	d98b940ab29a245de84a1c138b866dcc29217601 (diff)

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c new file mode 100644 index 000000000000..28e7c68d9d78 --- /dev/null +++ b/arch/x86/kernel/tlb_uv.c
@@ -0,0 +1,736 @@
	1	/*
	2	* SGI UltraViolet TLB flush routines.
	3	*
	4	* (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
	5	*
	6	* This code is released under the GNU General Public License version 2 or
	7	* later.
	8	*/
	9	#include <linux/mc146818rtc.h>
	10	#include <linux/proc_fs.h>
	11	#include <linux/kernel.h>
	12
	13	#include <asm/mach-bigsmp/mach_apic.h>
	14	#include <asm/mmu_context.h>
	15	#include <asm/idle.h>
	16	#include <asm/genapic.h>
	17	#include <asm/uv/uv_hub.h>
	18	#include <asm/uv/uv_mmrs.h>
	19	#include <asm/uv/uv_bau.h>
	20
	21	struct bau_control **uv_bau_table_bases;
	22	static int uv_bau_retry_limit;
	23	static int uv_nshift; /* position of pnode (which is nasid>>1) */
	24	static unsigned long uv_mmask;
	25
	26	char *status_table[] = {
	27	"IDLE",
	28	"ACTIVE",
	29	"DESTINATION TIMEOUT",
	30	"SOURCE TIMEOUT"
	31	};
	32
	33	DEFINE_PER_CPU(struct ptc_stats, ptcstats);
	34	DEFINE_PER_CPU(struct bau_control, bau_control);
	35
	36	/*
	37	* Free a software acknowledge hardware resource by clearing its Pending
	38	* bit. This will return a reply to the sender.
	39	* If the message has timed out, a reply has already been sent by the
	40	* hardware but the resource has not been released. In that case our
	41	* clear of the Timeout bit (as well) will free the resource. No reply will
	42	* be sent (the hardware will only do one reply per message).
	43	*/
	44	static void
	45	uv_reply_to_message(int resource,
	46	struct bau_payload_queue_entry *msg,
	47	struct bau_msg_status *msp)
	48	{
	49	int fw;
	50
	51	fw = (1 << (resource + UV_SW_ACK_NPENDING)) \| (1 << resource);
	52	msg->replied_to = 1;
	53	msg->sw_ack_vector = 0;
	54	if (msp)
	55	msp->seen_by.bits = 0;
	56	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, fw);
	57	return;
	58	}
	59
	60	/*
	61	* Do all the things a cpu should do for a TLB shootdown message.
	62	* Other cpu's may come here at the same time for this message.
	63	*/
	64	static void
	65	uv_bau_process_message(struct bau_payload_queue_entry *msg,
	66	int msg_slot, int sw_ack_slot)
	67	{
	68	int cpu;
	69	unsigned long this_cpu_mask;
	70	struct bau_msg_status *msp;
	71
	72	msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
	73	cpu = uv_blade_processor_id();
	74	msg->number_of_cpus =
	75	uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
	76	this_cpu_mask = (unsigned long)1 << cpu;
	77	if (msp->seen_by.bits & this_cpu_mask)
	78	return;
	79	atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
	80
	81	if (msg->replied_to == 1)
	82	return;
	83
	84	if (msg->address == TLB_FLUSH_ALL) {
	85	local_flush_tlb();
	86	__get_cpu_var(ptcstats).alltlb++;
	87	} else {
	88	__flush_tlb_one(msg->address);
	89	__get_cpu_var(ptcstats).onetlb++;
	90	}
	91
	92	__get_cpu_var(ptcstats).requestee++;
	93
	94	atomic_inc_short(&msg->acknowledge_count);
	95	if (msg->number_of_cpus == msg->acknowledge_count)
	96	uv_reply_to_message(sw_ack_slot, msg, msp);
	97	return;
	98	}
	99
	100	/*
	101	* Examine the payload queue on all the distribution nodes to see
	102	* which messages have not been seen, and which cpu(s) have not seen them.
	103	*
	104	* Returns the number of cpu's that have not responded.
	105	*/
	106	static int
	107	uv_examine_destinations(struct bau_target_nodemask *distribution)
	108	{
	109	int sender;
	110	int i;
	111	int j;
	112	int k;
	113	int count = 0;
	114	struct bau_control *bau_tablesp;
	115	struct bau_payload_queue_entry *msg;
	116	struct bau_msg_status *msp;
	117
	118	sender = smp_processor_id();
	119	for (i = 0; i < (sizeof(struct bau_target_nodemask) * BITSPERBYTE);
	120	i++) {
	121	if (bau_node_isset(i, distribution)) {
	122	bau_tablesp = uv_bau_table_bases[i];
	123	for (msg = bau_tablesp->va_queue_first, j = 0;
	124	j < DESTINATION_PAYLOAD_QUEUE_SIZE; msg++, j++) {
	125	if ((msg->sending_cpu == sender) &&
	126	(!msg->replied_to)) {
	127	msp = bau_tablesp->msg_statuses + j;
	128	printk(KERN_DEBUG
	129	"blade %d: address:%#lx %d of %d, not cpu(s): ",
	130	i, msg->address,
	131	msg->acknowledge_count,
	132	msg->number_of_cpus);
	133	for (k = 0; k < msg->number_of_cpus;
	134	k++) {
	135	if (!((long)1 << k & msp->
	136	seen_by.bits)) {
	137	count++;
	138	printk("%d ", k);
	139	}
	140	}
	141	printk("\n");
	142	}
	143	}
	144	}
	145	}
	146	return count;
	147	}
	148
	149	/**
	150	* uv_flush_tlb_others - globally purge translation cache of a virtual
	151	* address or all TLB's
	152	* @cpumaskp: mask of all cpu's in which the address is to be removed
	153	* @mm: mm_struct containing virtual address range
	154	* @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
	155	*
	156	* This is the entry point for initiating any UV global TLB shootdown.
	157	*
	158	* Purges the translation caches of all specified processors of the given
	159	* virtual address, or purges all TLB's on specified processors.
	160	*
	161	* The caller has derived the cpumaskp from the mm_struct and has subtracted
	162	* the local cpu from the mask. This function is called only if there
	163	* are bits set in the mask. (e.g. flush_tlb_page())
	164	*
	165	* The cpumaskp is converted into a nodemask of the nodes containing
	166	* the cpus.
	167	*/
	168	int
	169	uv_flush_tlb_others(cpumask_t cpumaskp, struct mm_struct mm, unsigned long va)
	170	{
	171	int i;
	172	int blade;
	173	int cpu;
	174	int bit;
	175	int right_shift;
	176	int this_blade;
	177	int exams = 0;
	178	int tries = 0;
	179	long source_timeouts = 0;
	180	long destination_timeouts = 0;
	181	unsigned long index;
	182	unsigned long mmr_offset;
	183	unsigned long descriptor_status;
	184	struct bau_activation_descriptor *bau_desc;
	185	ktime_t time1, time2;
	186
	187	cpu = uv_blade_processor_id();
	188	this_blade = uv_numa_blade_id();
	189	bau_desc = __get_cpu_var(bau_control).descriptor_base;
	190	bau_desc += (UV_ITEMS_PER_DESCRIPTOR * cpu);
	191
	192	bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
	193
	194	i = 0;
	195	for_each_cpu_mask(bit, *cpumaskp) {
	196	blade = uv_cpu_to_blade_id(bit);
	197	if (blade > (UV_DISTRIBUTION_SIZE - 1))
	198	BUG();
	199	if (blade == this_blade)
	200	continue;
	201	bau_node_set(blade, &bau_desc->distribution);
	202	/* leave the bits for the remote cpu's in the mask until
	203	success; on failure we fall back to the IPI method */
	204	i++;
	205	}
	206	if (i == 0)
	207	goto none_to_flush;
	208	__get_cpu_var(ptcstats).requestor++;
	209	__get_cpu_var(ptcstats).ntargeted += i;
	210
	211	bau_desc->payload.address = va;
	212	bau_desc->payload.sending_cpu = smp_processor_id();
	213
	214	if (cpu < UV_CPUS_PER_ACT_STATUS) {
	215	mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
	216	right_shift = cpu * UV_ACT_STATUS_SIZE;
	217	} else {
	218	mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
	219	right_shift =
	220	((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
	221	}
	222	time1 = ktime_get();
	223
	224	retry:
	225	tries++;
	226	index = ((unsigned long)
	227	1 << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) \| cpu;
	228	uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
	229
	230	while ((descriptor_status = (((unsigned long)
	231	uv_read_local_mmr(mmr_offset) >>
	232	right_shift) & UV_ACT_STATUS_MASK)) !=
	233	DESC_STATUS_IDLE) {
	234	if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
	235	source_timeouts++;
	236	if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
	237	source_timeouts = 0;
	238	__get_cpu_var(ptcstats).s_retry++;
	239	goto retry;
	240	}
	241	/* spin here looking for progress at the destinations */
	242	if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
	243	destination_timeouts++;
	244	if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
	245	/* returns # of cpus not responding */
	246	if (uv_examine_destinations
	247	(&bau_desc->distribution) == 0) {
	248	__get_cpu_var(ptcstats).d_retry++;
	249	goto retry;
	250	}
	251	exams++;
	252	if (exams >= uv_bau_retry_limit) {
	253	printk(KERN_DEBUG
	254	"uv_flush_tlb_others");
	255	printk("giving up on cpu %d\n",
	256	smp_processor_id());
	257	goto unsuccessful;
	258	}
	259	/* delays can hang up the simulator
	260	udelay(1000);
	261	*/
	262	destination_timeouts = 0;
	263	}
	264	}
	265	}
	266	if (tries > 1)
	267	__get_cpu_var(ptcstats).retriesok++;
	268	/* on success, clear the remote cpu's from the mask so we don't
	269	use the IPI method of shootdown on them */
	270	for_each_cpu_mask(bit, *cpumaskp) {
	271	blade = uv_cpu_to_blade_id(bit);
	272	if (blade == this_blade)
	273	continue;
	274	cpu_clear(bit, *cpumaskp);
	275	}
	276
	277	unsuccessful:
	278	time2 = ktime_get();
	279	__get_cpu_var(ptcstats).sflush_ns += (time2.tv64 - time1.tv64);
	280
	281	none_to_flush:
	282	if (cpus_empty(*cpumaskp))
	283	return 1;
	284
	285	/* Cause the caller to do an IPI-style TLB shootdown on
	286	the cpu's still in the mask */
	287	__get_cpu_var(ptcstats).ptc_i++;
	288	return 0;
	289	}
	290
	291	/*
	292	* The BAU message interrupt comes here. (registered by set_intr_gate)
	293	* See entry_64.S
	294	*
	295	* We received a broadcast assist message.
	296	*
	297	* Interrupts may have been disabled; this interrupt could represent
	298	* the receipt of several messages.
	299	*
	300	* All cores/threads on this node get this interrupt.
	301	* The last one to see it does the s/w ack.
	302	* (the resource will not be freed until noninterruptable cpus see this
	303	* interrupt; hardware will timeout the s/w ack and reply ERROR)
	304	*/
	305	void
	306	uv_bau_message_interrupt(struct pt_regs *regs)
	307	{
	308	struct bau_payload_queue_entry *pqp;
	309	struct bau_payload_queue_entry *msg;
	310	struct pt_regs *old_regs = set_irq_regs(regs);
	311	ktime_t time1, time2;
	312	int msg_slot;
	313	int sw_ack_slot;
	314	int fw;
	315	int count = 0;
	316	unsigned long local_pnode;
	317
	318	ack_APIC_irq();
	319	exit_idle();
	320	irq_enter();
	321
	322	time1 = ktime_get();
	323
	324	local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
	325
	326	pqp = __get_cpu_var(bau_control).va_queue_first;
	327	msg = __get_cpu_var(bau_control).bau_msg_head;
	328	while (msg->sw_ack_vector) {
	329	count++;
	330	fw = msg->sw_ack_vector;
	331	msg_slot = msg - pqp;
	332	sw_ack_slot = ffs(fw) - 1;
	333
	334	uv_bau_process_message(msg, msg_slot, sw_ack_slot);
	335
	336	msg++;
	337	if (msg > __get_cpu_var(bau_control).va_queue_last)
	338	msg = __get_cpu_var(bau_control).va_queue_first;
	339	__get_cpu_var(bau_control).bau_msg_head = msg;
	340	}
	341	if (!count)
	342	__get_cpu_var(ptcstats).nomsg++;
	343	else if (count > 1)
	344	__get_cpu_var(ptcstats).multmsg++;
	345
	346	time2 = ktime_get();
	347	__get_cpu_var(ptcstats).dflush_ns += (time2.tv64 - time1.tv64);
	348
	349	irq_exit();
	350	set_irq_regs(old_regs);
	351	return;
	352	}
	353
	354	static void
	355	uv_enable_timeouts(void)
	356	{
	357	int i;
	358	int blade;
	359	int last_blade;
	360	int pnode;
	361	int cur_cpu = 0;
	362	unsigned long apicid;
	363
	364	/* better if we had each_online_blade */
	365	last_blade = -1;
	366	for_each_online_node(i) {
	367	blade = uv_node_to_blade_id(i);
	368	if (blade == last_blade)
	369	continue;
	370	last_blade = blade;
	371	apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
	372	pnode = uv_blade_to_pnode(blade);
	373	cur_cpu += uv_blade_nr_possible_cpus(i);
	374	}
	375	return;
	376	}
	377
	378	static void *
	379	uv_ptc_seq_start(struct seq_file file, loff_t offset)
	380	{
	381	if (*offset < num_possible_cpus())
	382	return offset;
	383	return NULL;
	384	}
	385
	386	static void *
	387	uv_ptc_seq_next(struct seq_file file, void data, loff_t *offset)
	388	{
	389	(*offset)++;
	390	if (*offset < num_possible_cpus())
	391	return offset;
	392	return NULL;
	393	}
	394
	/*
	 * seq_file stop operation: the iterator holds no resources, so there is
	 * nothing to release.
	 */
	static void
	uv_ptc_seq_stop(struct seq_file *file, void *data)
	{
	}
	399
	400	/*
	401	* Display the statistics thru /proc
	402	* data points to the cpu number
	403	*/
	404	static int
	405	uv_ptc_seq_show(struct seq_file file, void data)
	406	{
	407	struct ptc_stats *stat;
	408	int cpu;
	409
	410	cpu = (loff_t )data;
	411
	412	if (!cpu) {
	413	seq_printf(file,
	414	"# cpu requestor requestee one all sretry dretry ptc_i ");
	415	seq_printf(file,
	416	"sw_ack sflush_us dflush_us sok dnomsg dmult starget\n");
	417	}
	418	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
	419	stat = &per_cpu(ptcstats, cpu);
	420	seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
	421	cpu, stat->requestor,
	422	stat->requestee, stat->onetlb, stat->alltlb,
	423	stat->s_retry, stat->d_retry, stat->ptc_i);
	424	seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
	425	uv_read_global_mmr64(uv_blade_to_pnode
	426	(uv_cpu_to_blade_id(cpu)),
	427	UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
	428	stat->sflush_ns / 1000, stat->dflush_ns / 1000,
	429	stat->retriesok, stat->nomsg,
	430	stat->multmsg, stat->ntargeted);
	431	}
	432
	433	return 0;
	434	}
	435
	436	/*
	437	* 0: display meaning of the statistics
	438	* >0: retry limit
	439	*/
	440	static ssize_t
	441	uv_ptc_proc_write(struct file file, const char __user user,
	442	size_t count, loff_t *data)
	443	{
	444	long newmode;
	445	char optstr[64];
	446
	447	if (copy_from_user(optstr, user, count))
	448	return -EFAULT;
	449	optstr[count - 1] = '\0';
	450	if (strict_strtoul(optstr, 10, &newmode) < 0) {
	451	printk(KERN_DEBUG "%s is invalid\n", optstr);
	452	return -EINVAL;
	453	}
	454
	455	if (newmode == 0) {
	456	printk(KERN_DEBUG "# cpu: cpu number\n");
	457	printk(KERN_DEBUG
	458	"requestor: times this cpu was the flush requestor\n");
	459	printk(KERN_DEBUG
	460	"requestee: times this cpu was requested to flush its TLBs\n");
	461	printk(KERN_DEBUG
	462	"one: times requested to flush a single address\n");
	463	printk(KERN_DEBUG
	464	"all: times requested to flush all TLB's\n");
	465	printk(KERN_DEBUG
	466	"sretry: number of retries of source-side timeouts\n");
	467	printk(KERN_DEBUG
	468	"dretry: number of retries of destination-side timeouts\n");
	469	printk(KERN_DEBUG
	470	"ptc_i: times UV fell through to IPI-style flushes\n");
	471	printk(KERN_DEBUG
	472	"sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
	473	printk(KERN_DEBUG
	474	"sflush_us: microseconds spent in uv_flush_tlb_others()\n");
	475	printk(KERN_DEBUG
	476	"dflush_us: microseconds spent in handling flush requests\n");
	477	printk(KERN_DEBUG "sok: successes on retry\n");
	478	printk(KERN_DEBUG "dnomsg: interrupts with no message\n");
	479	printk(KERN_DEBUG
	480	"dmult: interrupts with multiple messages\n");
	481	printk(KERN_DEBUG "starget: nodes targeted\n");
	482	} else {
	483	uv_bau_retry_limit = newmode;
	484	printk(KERN_DEBUG "timeout retry limit:%d\n",
	485	uv_bau_retry_limit);
	486	}
	487
	488	return count;
	489	}
	490
	491	static const struct seq_operations uv_ptc_seq_ops = {
	492	.start = uv_ptc_seq_start,
	493	.next = uv_ptc_seq_next,
	494	.stop = uv_ptc_seq_stop,
	495	.show = uv_ptc_seq_show
	496	};
	497
	498	static int
	499	uv_ptc_proc_open(struct inode inode, struct file file)
	500	{
	501	return seq_open(file, &uv_ptc_seq_ops);
	502	}
	503
	504	static const struct file_operations proc_uv_ptc_operations = {
	505	.open = uv_ptc_proc_open,
	506	.read = seq_read,
	507	.write = uv_ptc_proc_write,
	508	.llseek = seq_lseek,
	509	.release = seq_release,
	510	};
	511
	512	static struct proc_dir_entry *proc_uv_ptc;
	513
	514	static int __init
	515	uv_ptc_init(void)
	516	{
	517	static struct proc_dir_entry *sgi_proc_dir;
	518
	519	sgi_proc_dir = NULL;
	520
	521	if (!is_uv_system())
	522	return 0;
	523
	524	sgi_proc_dir = proc_mkdir("sgi_uv", NULL);
	525	if (!sgi_proc_dir)
	526	return -EINVAL;
	527
	528	proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
	529	if (!proc_uv_ptc) {
	530	printk(KERN_ERR "unable to create %s proc entry\n",
	531	UV_PTC_BASENAME);
	532	return -EINVAL;
	533	}
	534	proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
	535	return 0;
	536	}
	537
	538	static void __exit
	539	uv_ptc_exit(void)
	540	{
	541	remove_proc_entry(UV_PTC_BASENAME, NULL);
	542	}
	543
	544	module_init(uv_ptc_init);
	545	module_exit(uv_ptc_exit);
	546
	547	/*
	548	* Initialization of BAU-related structures
	549	*/
	550	int __init
	551	uv_bau_init(void)
	552	{
	553	int i;
	554	int j;
	555	int blade;
	556	int nblades;
	557	int *ip;
	558	int pnode;
	559	int last_blade;
	560	int cur_cpu = 0;
	561	unsigned long pa;
	562	unsigned long n;
	563	unsigned long m;
	564	unsigned long mmr_image;
	565	unsigned long apicid;
	566	char *cp;
	567	struct bau_control *bau_tablesp;
	568	struct bau_activation_descriptor adp, ad2;
	569	struct bau_payload_queue_entry *pqp;
	570	struct bau_msg_status *msp;
	571	struct bau_control *bcp;
	572
	573	if (!is_uv_system())
	574	return 0;
	575
	576	uv_bau_retry_limit = 1;
	577
	578	if ((sizeof(struct bau_local_cpumask) * BITSPERBYTE) <
	579	MAX_CPUS_PER_NODE) {
	580	printk(KERN_ERR
	581	"uv_bau_init: bau_local_cpumask.bits too small\n");
	582	BUG();
	583	}
	584
	585	uv_nshift = uv_hub_info->n_val;
	586	uv_mmask = ((unsigned long)1 << uv_hub_info->n_val) - 1;
	587	nblades = 0;
	588	last_blade = -1;
	589	for_each_online_node(i) {
	590	blade = uv_node_to_blade_id(i);
	591	if (blade == last_blade)
	592	continue;
	593	last_blade = blade;
	594	nblades++;
	595	}
	596
	597	uv_bau_table_bases = (struct bau_control **)
	598	kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
	599	if (!uv_bau_table_bases)
	600	BUG();
	601
	602	/* better if we had each_online_blade */
	603	last_blade = -1;
	604	for_each_online_node(i) {
	605	blade = uv_node_to_blade_id(i);
	606	if (blade == last_blade)
	607	continue;
	608	last_blade = blade;
	609
	610	bau_tablesp =
	611	kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, i);
	612	if (!bau_tablesp)
	613	BUG();
	614
	615	bau_tablesp->msg_statuses =
	616	kmalloc_node(sizeof(struct bau_msg_status) *
	617	DESTINATION_PAYLOAD_QUEUE_SIZE, GFP_KERNEL, i);
	618	if (!bau_tablesp->msg_statuses)
	619	BUG();
	620	for (j = 0, msp = bau_tablesp->msg_statuses;
	621	j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, msp++) {
	622	bau_cpubits_clear(&msp->seen_by, (int)
	623	uv_blade_nr_possible_cpus(blade));
	624	}
	625
	626	bau_tablesp->watching =
	627	kmalloc_node(sizeof(int) * DESTINATION_NUM_RESOURCES,
	628	GFP_KERNEL, i);
	629	if (!bau_tablesp->watching)
	630	BUG();
	631	for (j = 0, ip = bau_tablesp->watching;
	632	j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, ip++) {
	633	*ip = 0;
	634	}
	635
	636	uv_bau_table_bases[i] = bau_tablesp;
	637
	638	pnode = uv_blade_to_pnode(blade);
	639
	640	if (sizeof(struct bau_activation_descriptor) != 64)
	641	BUG();
	642
	643	adp = (struct bau_activation_descriptor *)
	644	kmalloc_node(16384, GFP_KERNEL, i);
	645	if (!adp)
	646	BUG();
	647	if ((unsigned long)adp & 0xfff)
	648	BUG();
	649	pa = __pa((unsigned long)adp);
	650	n = pa >> uv_nshift;
	651	m = pa & uv_mmask;
	652
	653	mmr_image = uv_read_global_mmr64(pnode,
	654	UVH_LB_BAU_SB_DESCRIPTOR_BASE);
	655	if (mmr_image)
	656	uv_write_global_mmr64(pnode, (unsigned long)
	657	UVH_LB_BAU_SB_DESCRIPTOR_BASE,
	658	(n << UV_DESC_BASE_PNODE_SHIFT \|
	659	m));
	660	for (j = 0, ad2 = adp; j < UV_ACTIVATION_DESCRIPTOR_SIZE;
	661	j++, ad2++) {
	662	memset(ad2, 0,
	663	sizeof(struct bau_activation_descriptor));
	664	ad2->header.sw_ack_flag = 1;
	665	ad2->header.base_dest_nodeid =
	666	uv_blade_to_pnode(uv_cpu_to_blade_id(0));
	667	ad2->header.command = UV_NET_ENDPOINT_INTD;
	668	ad2->header.int_both = 1;
	669	/* all others need to be set to zero:
	670	fairness chaining multilevel count replied_to */
	671	}
	672
	673	pqp = (struct bau_payload_queue_entry *)
	674	kmalloc_node((DESTINATION_PAYLOAD_QUEUE_SIZE + 1) *
	675	sizeof(struct bau_payload_queue_entry),
	676	GFP_KERNEL, i);
	677	if (!pqp)
	678	BUG();
	679	if (sizeof(struct bau_payload_queue_entry) != 32)
	680	BUG();
	681	if ((unsigned long)(&((struct bau_payload_queue_entry *)0)->
	682	sw_ack_vector) != 15)
	683	BUG();
	684
	685	cp = (char *)pqp + 31;
	686	pqp = (struct bau_payload_queue_entry *)
	687	(((unsigned long)cp >> 5) << 5);
	688	bau_tablesp->va_queue_first = pqp;
	689	uv_write_global_mmr64(pnode,
	690	UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
	691	((unsigned long)pnode <<
	692	UV_PAYLOADQ_PNODE_SHIFT) \|
	693	uv_physnodeaddr(pqp));
	694	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
	695	uv_physnodeaddr(pqp));
	696	bau_tablesp->va_queue_last =
	697	pqp + (DESTINATION_PAYLOAD_QUEUE_SIZE - 1);
	698	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
	699	(unsigned long)
	700	uv_physnodeaddr(bau_tablesp->
	701	va_queue_last));
	702	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) *
	703	DESTINATION_PAYLOAD_QUEUE_SIZE);
	704
	705	/* this initialization can't be in firmware because the
	706	messaging IRQ will be determined by the OS */
	707	apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
	708	pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
	709	if ((pa & 0xff) != UV_BAU_MESSAGE) {
	710	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
	711	((apicid << 32) \|
	712	UV_BAU_MESSAGE));
	713	}
	714
	715	for (j = cur_cpu; j < (cur_cpu + uv_blade_nr_possible_cpus(i));
	716	j++) {
	717	bcp = (struct bau_control *)&per_cpu(bau_control, j);
	718	bcp->bau_msg_head = bau_tablesp->va_queue_first;
	719	bcp->va_queue_first = bau_tablesp->va_queue_first;
	720
	721	bcp->va_queue_last = bau_tablesp->va_queue_last;
	722	bcp->watching = bau_tablesp->watching;
	723	bcp->msg_statuses = bau_tablesp->msg_statuses;
	724	bcp->descriptor_base = adp;
	725	}
	726	cur_cpu += uv_blade_nr_possible_cpus(i);
	727	}
	728
	729	set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
	730
	731	uv_enable_timeouts();
	732
	733	return 0;
	734	}
	735
	736	__initcall(uv_bau_init);