gpu: nvgpu: vgpu: cyclestat snapshot support

Add support for cyclestats snapshots in the virtual case

Bug 1700143
JIRA EVLR-278

Change-Id: I376a8804d57324f43eb16452d857a3b7bb0ecc90
Signed-off-by: Peter Daifuku <pdaifuku@nvidia.com>
Reviewed-on: http://git-master/r/1211547
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
author: Peter Daifuku <pdaifuku@nvidia.com> 2016-08-31 20:04:56 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2016-09-08 19:04:09 -0400
commit: 9aa7de15c2a644e9c7e9c157e49087e66d4ac3d0 (patch)
tree: e5080886f09aa75c6a3cc83e5b27f8f7553678a4
parent: 70cad5fbb593602a49f91e57c04d1da0334b3a49 (diff)
14 files changed, 505 insertions, 143 deletions
diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu
index 5ca2f56f..b8e38919 100644
--- a/drivers/gpu/nvgpu/Makefile.nvgpu
+++ b/drivers/gpu/nvgpu/Makefile.nvgpu
@@ -105,6 +105,7 @@ nvgpu-$(CONFIG_TEGRA_GR_VIRTUALIZATION) += \
        vgpu/dbg_vgpu.o \
        vgpu/fecs_trace_vgpu.o \
        vgpu/tsg_vgpu.o \
+        vgpu/css_vgpu.o \
        vgpu/gk20a/vgpu_hal_gk20a.o \
        vgpu/gk20a/vgpu_gr_gk20a.o \
        vgpu/gm20b/vgpu_hal_gm20b.o \
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 49711af9..d23a8026 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -603,7 +603,7 @@ static int gk20a_flush_cycle_stats_snapshot(struct channel_gk20a *ch)
        mutex_lock(&ch->cs_client_mutex);
        if (ch->cs_client)
-                ret = gr_gk20a_css_flush(ch->g, ch->cs_client);
+                ret = gr_gk20a_css_flush(ch, ch->cs_client);
        else
                ret = -EBADF;
        mutex_unlock(&ch->cs_client_mutex);
@@ -622,7 +622,7 @@ static int gk20a_attach_cycle_stats_snapshot(struct channel_gk20a *ch,
        if (ch->cs_client) {
                ret = -EEXIST;
        } else {
-                ret = gr_gk20a_css_attach(ch->g,
+                ret = gr_gk20a_css_attach(ch,
                                        dmabuf_fd,
                                        perfmon_id_count,
                                        perfmon_id_start,
@@ -639,7 +639,7 @@ static int gk20a_free_cycle_stats_snapshot(struct channel_gk20a *ch)
        mutex_lock(&ch->cs_client_mutex);
        if (ch->cs_client) {
-                ret = gr_gk20a_css_detach(ch->g, ch->cs_client);
+                ret = gr_gk20a_css_detach(ch, ch->cs_client);
                ch->cs_client = NULL;
        } else {
                ret = 0;
diff --git a/drivers/gpu/nvgpu/gk20a/css_gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/css_gr_gk20a.c
index 62f60761..71614d6e 100644
--- a/drivers/gpu/nvgpu/gk20a/css_gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/css_gr_gk20a.c
@@ -25,93 +25,13 @@
 #include "gk20a.h"
 #include "hw_perf_gk20a.h"
 #include "hw_mc_gk20a.h"
+#include "css_gr_gk20a.h"
-/* cycle stats fifo header (must match NvSnapshotBufferFifo) */
-struct gk20a_cs_snapshot_fifo {
-        /* layout description of the buffer */
-        u32     start;
-        u32     end;
-        /* snafu bits */
-        u32     hw_overflow_events_occured;
-        u32     sw_overflow_events_occured;
-        /* the kernel copies new entries to put and
-         * increment the put++. if put == get then
-         * overflowEventsOccured++
-         */
-        u32     put;
-        u32     _reserved10;
-        u32     _reserved11;
-        u32     _reserved12;
-        /* the driver/client reads from get until
-         * put==get, get++ */
-        u32     get;
-        u32     _reserved20;
-        u32     _reserved21;
-        u32     _reserved22;
-        /* unused */
-        u32     _reserved30;
-        u32     _reserved31;
-        u32     _reserved32;
-        u32     _reserved33;
-};
-/* cycle stats fifo entry (must match NvSnapshotBufferFifoEntry) */
-struct gk20a_cs_snapshot_fifo_entry {
-        /* global 48 timestamp */
-        u32     timestamp31_00:32;
-        u32     timestamp39_32:8;
-        /* id of perfmon, should correlate with CSS_MAX_PERFMON_IDS */
-        u32     perfmon_id:8;
-        /* typically samples_counter is wired to #pmtrigger count */
-        u32     samples_counter:12;
-        /* DS=Delay Sample, SZ=Size (0=32B, 1=16B) */
-        u32     ds:1;
-        u32     sz:1;
-        u32     zero0:1;
-        u32     zero1:1;
-        /* counter results */
-        u32     event_cnt:32;
-        u32     trigger0_cnt:32;
-        u32     trigger1_cnt:32;
-        u32     sample_cnt:32;
-        /* Local PmTrigger results for Maxwell+ or padding otherwise */
-        u16     local_trigger_b_count:16;
-        u16     book_mark_b:16;
-        u16     local_trigger_a_count:16;
-        u16     book_mark_a:16;
-};
-/* cycle stats snapshot client data (e.g. associated with channel) */
-struct gk20a_cs_snapshot_client {
-        struct list_head        list;
-        u32                     dmabuf_fd;
-        struct dma_buf          *dma_handler;
-        struct gk20a_cs_snapshot_fifo   *snapshot;
-        u32                     snapshot_size;
-        u32                     perfmon_start;
-        u32                     perfmon_count;
-};
 /* check client for pointed perfmon ownership */
 #define CONTAINS_PERFMON(cl, pm)                                \
                ((cl)->perfmon_start <= (pm) &&                 \
                ((pm) - (cl)->perfmon_start) < (cl)->perfmon_count)
-/* the minimal size of HW buffer - should be enough to avoid HW overflows */
-#define CSS_MIN_HW_SNAPSHOT_SIZE        (8 * 1024 * 1024)
 /* the minimal size of client buffer */
 #define CSS_MIN_CLIENT_SNAPSHOT_SIZE                            \
                (sizeof(struct gk20a_cs_snapshot_fifo) +        \
@@ -131,20 +51,6 @@ struct gk20a_cs_snapshot_client {
 /* should correlate with size of gk20a_cs_snapshot_fifo_entry::perfmon_id */
 #define CSS_MAX_PERFMON_IDS     256
-/* local definitions to avoid hardcodes sizes and shifts */
-#define PM_BITMAP_SIZE  DIV_ROUND_UP(CSS_MAX_PERFMON_IDS, BITS_PER_LONG)
-/* cycle stats snapshot control structure for one HW entry and many clients */
-struct gk20a_cs_snapshot {
-        unsigned long perfmon_ids[PM_BITMAP_SIZE];
-        struct list_head        clients;
-        struct mem_desc         hw_memdesc;
-        /* pointer to allocated cpu_va memory where GPU place data */
-        struct gk20a_cs_snapshot_fifo_entry     *hw_snapshot;
-        struct gk20a_cs_snapshot_fifo_entry     *hw_end;
-        struct gk20a_cs_snapshot_fifo_entry     *hw_get;
-};
 /* reports whether the hw queue overflowed */
 static inline bool css_hw_get_overflow_status(struct gk20a *g)
 {
@@ -215,10 +121,13 @@ static int css_gr_create_shared_data(struct gr_gk20a *gr)
        return 0;
 }
-static int css_hw_enable_snapshot(struct gr_gk20a *gr, u32 snapshot_size)
+static int css_hw_enable_snapshot(struct channel_gk20a *ch,
+                                struct gk20a_cs_snapshot_client *cs_client)
 {
-        struct gk20a *g = gr->g;
+        struct gk20a *g = ch->g;
+        struct gr_gk20a *gr = &g->gr;
        struct gk20a_cs_snapshot *data = gr->cs_data;
+        u32 snapshot_size = cs_client->snapshot_size;
        int ret;
        u32 virt_addr_lo;
@@ -317,9 +226,11 @@ static void css_hw_disable_snapshot(struct gr_gk20a *gr)
 static void css_gr_free_shared_data(struct gr_gk20a *gr)
 {
+        struct gk20a *g = gr->g;
        if (gr->cs_data) {
                /* the clients list is expected to be empty */
-                css_hw_disable_snapshot(gr);
+                g->ops.css.disable_snapshot(gr);
                /* release the objects */
                kfree(gr->cs_data);
@@ -344,12 +255,15 @@ css_gr_search_client(struct list_head *clients, u32 perfmon)
        return NULL;
 }
-static int css_gr_flush_snapshots(struct gr_gk20a *gr)
+static int css_gr_flush_snapshots(struct channel_gk20a *ch)
 {
-        struct gk20a *g = gr->g;
+        struct gk20a *g = ch->g;
+        struct gr_gk20a *gr = &g->gr;
        struct gk20a_cs_snapshot *css = gr->cs_data;
        struct gk20a_cs_snapshot_client *cur;
-        u32 pending;
+        u32 pending, completed;
+        bool hw_overflow;
+        int err;
        /* variables for iterating over HW entries */
        u32 sid;
@@ -360,24 +274,25 @@ static int css_gr_flush_snapshots(struct gr_gk20a *gr)
        struct gk20a_cs_snapshot_fifo *dst;
        struct gk20a_cs_snapshot_fifo_entry *dst_get;
        struct gk20a_cs_snapshot_fifo_entry *dst_put;
+        struct gk20a_cs_snapshot_fifo_entry *dst_nxt;
        struct gk20a_cs_snapshot_fifo_entry *dst_head;
        struct gk20a_cs_snapshot_fifo_entry *dst_tail;
        if (!css)
                return -EINVAL;
-        if (!css->hw_snapshot)
-                return -EINVAL;
        if (list_empty(&css->clients))
                return -EBADF;
        /* check data available */
-        pending = css_hw_get_pending_snapshots(g);
+        err = g->ops.css.check_data_available(ch, &pending, &hw_overflow);
+        if (err)
+                return err;
        if (!pending)
                return 0;
-        if (css_hw_get_overflow_status(g)) {
+        if (hw_overflow) {
                struct list_head *pos;
                list_for_each(pos, &css->clients) {
@@ -387,11 +302,12 @@ static int css_gr_flush_snapshots(struct gr_gk20a *gr)
                }
                gk20a_warn(dev_from_gk20a(g),
-                           "cyclestats: hardware overflow detected\n");
+                        "cyclestats: hardware overflow detected\n");
        }
-        /* proceed all items in HW buffer */
+        /* process all items in HW buffer */
        sid = 0;
+        completed = 0;
        cur = NULL;
        dst = NULL;
        dst_put = NULL;
@@ -419,7 +335,11 @@ static int css_gr_flush_snapshots(struct gr_gk20a *gr)
                                dst_get = CSS_FIFO_ENTRY(dst, dst->get);
                                dst_put = CSS_FIFO_ENTRY(dst, dst->put);
                                dst_head = CSS_FIFO_ENTRY(dst, dst->start);
-                                dst_tail = CSS_FIFO_ENTRY(dst, dst->end) - 1;
+                                dst_tail = CSS_FIFO_ENTRY(dst, dst->end);
+                                dst_nxt = dst_put + 1;
+                                if (dst_nxt == dst_tail)
+                                        dst_nxt = dst_head;
                        } else {
                                /* client not found - skipping this entry */
                                gk20a_warn(dev_from_gk20a(g),
@@ -430,8 +350,7 @@ static int css_gr_flush_snapshots(struct gr_gk20a *gr)
                }
                /* check for software overflows */
-                if (dst_put + 1 == dst_get ||
+                if (dst_nxt == dst_get) {
-                        (dst_put == dst_tail && dst_get == dst_head)) {
                        /* no data copy, no pointer updates */
                        dst->sw_overflow_events_occured++;
                        gk20a_warn(dev_from_gk20a(g),
@@ -439,10 +358,12 @@ static int css_gr_flush_snapshots(struct gr_gk20a *gr)
                                                        src->perfmon_id);
                } else {
                        *dst_put = *src;
-                        if (dst_put == dst_tail)
+                        completed++;
-                                dst_put = dst_head;
-                        else
+                        dst_put = dst_nxt++;
-                                dst_put++;
+                        if (dst_nxt == dst_tail)
+                                dst_nxt = dst_head;
                }
 next_hw_fifo_entry:
@@ -465,14 +386,17 @@ next_hw_fifo_entry:
                                (css->hw_end - css->hw_get) * sizeof(*src));
        }
        gr->cs_data->hw_get = src;
-        css_hw_set_handled_snapshots(g, sid);
-        if (pending != sid) {
+        if (g->ops.css.set_handled_snapshots)
+                g->ops.css.set_handled_snapshots(g, sid);
+        if (completed != sid) {
                /* not all entries proceed correctly. some of problems */
                /* reported as overflows, some as orphaned perfmons,   */
                /* but it will be better notify with summary about it  */
                gk20a_warn(dev_from_gk20a(g),
-                           "cyclestats: done %u from %u entries\n",
+                           "cyclestats: completed %u from %u entries\n",
-                                                        sid, pending);
+                                                        completed, pending);
        }
        return 0;
@@ -511,7 +435,8 @@ static u32 css_gr_release_perfmon_ids(struct gk20a_cs_snapshot *data,
 }
-static int css_gr_free_client_data(struct gk20a_cs_snapshot *data,
+static int css_gr_free_client_data(struct gk20a *g,
+                                struct gk20a_cs_snapshot *data,
                                struct gk20a_cs_snapshot_client *client)
 {
        int ret = 0;
@@ -519,8 +444,9 @@ static int css_gr_free_client_data(struct gk20a_cs_snapshot *data,
        if (client->list.next && client->list.prev)
                list_del(&client->list);
-        if (client->perfmon_start && client->perfmon_count) {
+        if (client->perfmon_start && client->perfmon_count
-                if (client->perfmon_count != css_gr_release_perfmon_ids(data,
+                                        && g->ops.css.release_perfmon_ids) {
+                if (client->perfmon_count != g->ops.css.release_perfmon_ids(data,
                                client->perfmon_start, client->perfmon_count))
                        ret = -EINVAL;
        }
@@ -536,7 +462,8 @@ static int css_gr_free_client_data(struct gk20a_cs_snapshot *data,
        return ret;
 }
-static int css_gr_create_client_data(struct gk20a_cs_snapshot *data,
+static int css_gr_create_client_data(struct gk20a *g,
+                        struct gk20a_cs_snapshot *data,
                        u32 dmabuf_fd, u32 perfmon_count,
                        struct gk20a_cs_snapshot_client **client)
 {
@@ -581,8 +508,12 @@ static int css_gr_create_client_data(struct gk20a_cs_snapshot *data,
        cur->snapshot->put = cur->snapshot->start;
        cur->perfmon_count = perfmon_count;
-        if (cur->perfmon_count) {
-                cur->perfmon_start = css_gr_allocate_perfmon_ids(data,
+        /* In virtual case, perfmon ID allocation is handled by the server
+         * at the time of the attach (allocate_perfmon_ids is NULL in this case)
+         */
+        if (cur->perfmon_count && g->ops.css.allocate_perfmon_ids) {
+                cur->perfmon_start = g->ops.css.allocate_perfmon_ids(data,
                                                        cur->perfmon_count);
                if (!cur->perfmon_start) {
                        ret = -ENOENT;
@@ -598,19 +529,20 @@ static int css_gr_create_client_data(struct gk20a_cs_snapshot *data,
 failed:
        *client = NULL;
        if (cur)
-                css_gr_free_client_data(data, cur);
+                css_gr_free_client_data(g, data, cur);
        return ret;
 }
-int gr_gk20a_css_attach(struct gk20a *g,
+int gr_gk20a_css_attach(struct channel_gk20a *ch,
                        u32 dmabuf_fd,
                        u32 perfmon_count,
                        u32 *perfmon_start,
                        struct gk20a_cs_snapshot_client **cs_client)
 {
        int ret = 0;
+        struct gk20a *g = ch->g;
        struct gr_gk20a *gr;
        /* we must have a placeholder to store pointer to client structure */
@@ -630,14 +562,14 @@ int gr_gk20a_css_attach(struct gk20a *g,
        if (ret)
                goto failed;
-        ret = css_gr_create_client_data(gr->cs_data,
+        ret = css_gr_create_client_data(g, gr->cs_data,
                                     dmabuf_fd,
                                     perfmon_count,
                                     cs_client);
        if (ret)
                goto failed;
-        ret = css_hw_enable_snapshot(gr, (*cs_client)->snapshot_size);
+        ret = g->ops.css.enable_snapshot(ch, *cs_client);
        if (ret)
                goto failed;
@@ -651,7 +583,7 @@ int gr_gk20a_css_attach(struct gk20a *g,
 failed:
        if (gr->cs_data) {
                if (*cs_client) {
-                        css_gr_free_client_data(gr->cs_data, *cs_client);
+                        css_gr_free_client_data(g, gr->cs_data, *cs_client);
                        *cs_client = NULL;
                }
@@ -666,10 +598,11 @@ failed:
        return ret;
 }
-int gr_gk20a_css_detach(struct gk20a *g,
+int gr_gk20a_css_detach(struct channel_gk20a *ch,
                                struct gk20a_cs_snapshot_client *cs_client)
 {
        int ret = 0;
+        struct gk20a *g = ch->g;
        struct gr_gk20a *gr;
        if (!cs_client)
@@ -680,7 +613,10 @@ int gr_gk20a_css_detach(struct gk20a *g,
        if (gr->cs_data) {
                struct gk20a_cs_snapshot *data = gr->cs_data;
-                ret = css_gr_free_client_data(data, cs_client);
+                if (g->ops.css.detach_snapshot)
+                        g->ops.css.detach_snapshot(ch, cs_client);
+                ret = css_gr_free_client_data(g, data, cs_client);
                if (list_empty(&data->clients))
                        css_gr_free_shared_data(gr);
        } else {
@@ -691,10 +627,11 @@ int gr_gk20a_css_detach(struct gk20a *g,
        return ret;
 }
-int gr_gk20a_css_flush(struct gk20a *g,
+int gr_gk20a_css_flush(struct channel_gk20a *ch,
                                struct gk20a_cs_snapshot_client *cs_client)
 {
        int ret = 0;
+        struct gk20a *g = ch->g;
        struct gr_gk20a *gr;
        if (!cs_client)
@@ -702,7 +639,7 @@ int gr_gk20a_css_flush(struct gk20a *g,
        gr = &g->gr;
        mutex_lock(&gr->cs_lock);
-        ret = css_gr_flush_snapshots(gr);
+        ret = css_gr_flush_snapshots(ch);
        mutex_unlock(&gr->cs_lock);
        return ret;
@@ -718,3 +655,31 @@ void gr_gk20a_free_cyclestats_snapshot_data(struct gk20a *g)
        mutex_unlock(&gr->cs_lock);
        mutex_destroy(&gr->cs_lock);
 }
+static int css_hw_check_data_available(struct channel_gk20a *ch, u32 *pending,
+                                        bool *hw_overflow)
+{       /*
+         * Hook that gk20a_init_css_ops() installs as css.check_data_available.
+         *
+         * ch          - channel; only ch->g is used, to reach g->gr.cs_data
+         * pending     - out: value of css_hw_get_pending_snapshots()
+         * hw_overflow - out: value of css_hw_get_overflow_status(); written
+         *               only when *pending is non-zero
+         *
+         * Returns -EINVAL when css->hw_snapshot is NULL, otherwise 0.
+         * gr->cs_data is dereferenced without a NULL check here; the caller
+         * visible in this file (css_gr_flush_snapshots) checks it first.
+         */
+        struct gk20a *g = ch->g;
+        struct gr_gk20a *gr = &g->gr;
+        struct gk20a_cs_snapshot *css = gr->cs_data;
+        if (!css->hw_snapshot)
+                return -EINVAL;
+        *pending = css_hw_get_pending_snapshots(g);
+        if (!*pending)  /* *hw_overflow is left unwritten on this path */
+                return 0;
+        *hw_overflow = css_hw_get_overflow_status(g);
+        return 0;
+}
+void gk20a_init_css_ops(struct gpu_ops *gops)
+{       /*
+         * Populate gops->css with the cycle stats snapshot handlers defined
+         * in this file. css.detach_snapshot is not assigned here; callers
+         * NULL-check that hook before invoking it.
+         */
+        gops->css.enable_snapshot = css_hw_enable_snapshot;
+        gops->css.disable_snapshot = css_hw_disable_snapshot;
+        gops->css.check_data_available = css_hw_check_data_available;
+        gops->css.set_handled_snapshots = css_hw_set_handled_snapshots;
+        gops->css.allocate_perfmon_ids = css_gr_allocate_perfmon_ids;
+        gops->css.release_perfmon_ids = css_gr_release_perfmon_ids;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/css_gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/css_gr_gk20a.h
new file mode 100644
index 00000000..be638abf
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/css_gr_gk20a.h
@@ -0,0 +1,119 @@
+/*
+ * GK20A Cycle stats snapshots support (subsystem for gr_gk20a).
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef CSS_GR_GK20A_H
+#define CSS_GR_GK20A_H
+/* the minimal size of HW buffer - should be enough to avoid HW overflows */
+#define CSS_MIN_HW_SNAPSHOT_SIZE        (8 * 1024 * 1024)
+/* cycle stats fifo header (must match NvSnapshotBufferFifo) */
+struct gk20a_cs_snapshot_fifo {
+        /* layout description of the buffer: the flush code converts start
+         * and end to entry pointers with CSS_FIFO_ENTRY() and wraps back to
+         * start when the next put position reaches end
+         */
+        u32     start;
+        u32     end;
+        /* overflow event counters */
+        u32     hw_overflow_events_occured;
+        u32     sw_overflow_events_occured;
+        /* the kernel copies each new entry to put and then advances put;
+         * if the position after put equals get, the entry is not copied
+         * and sw_overflow_events_occured is incremented instead
+         */
+        u32     put;
+        u32     _reserved10;
+        u32     _reserved11;
+        u32     _reserved12;
+        /* the consumer (driver/client) reads entries from get, advancing
+         * get, until get == put */
+        u32     get;
+        u32     _reserved20;
+        u32     _reserved21;
+        u32     _reserved22;
+        /* unused */
+        u32     _reserved30;
+        u32     _reserved31;
+        u32     _reserved32;
+        u32     _reserved33;
+};
+/* cycle stats fifo entry (must match NvSnapshotBufferFifoEntry) */
+struct gk20a_cs_snapshot_fifo_entry {
+        /* global timestamp; the two fields below hold 40 bits of it
+         * (bits 31:0, then bits 39:32) */
+        u32     timestamp31_00:32;
+        u32     timestamp39_32:8;
+        /* id of perfmon, should correlate with CSS_MAX_PERFMON_IDS */
+        u32     perfmon_id:8;
+        /* typically samples_counter is wired to #pmtrigger count */
+        u32     samples_counter:12;
+        /* DS=Delay Sample, SZ=Size (0=32B, 1=16B) */
+        u32     ds:1;
+        u32     sz:1;
+        u32     zero0:1;
+        u32     zero1:1;
+        /* counter results */
+        u32     event_cnt:32;
+        u32     trigger0_cnt:32;
+        u32     trigger1_cnt:32;
+        u32     sample_cnt:32;
+        /* Local PmTrigger results for Maxwell+ or padding otherwise */
+        u16     local_trigger_b_count:16;
+        u16     book_mark_b:16;
+        u16     local_trigger_a_count:16;
+        u16     book_mark_a:16;
+};
+/* cycle stats snapshot client data (e.g. associated with channel) */
+struct gk20a_cs_snapshot_client {
+        struct list_head        list;   /* node for gk20a_cs_snapshot::clients */
+        u32                     dmabuf_fd;      /* NOTE(review): presumably the fd given to gr_gk20a_css_attach() - confirm */
+        struct dma_buf          *dma_handler;   /* NOTE(review): presumably the dma_buf behind dmabuf_fd - confirm */
+        struct gk20a_cs_snapshot_fifo   *snapshot;      /* this client's fifo header */
+        u32                     snapshot_size;  /* read by css_hw_enable_snapshot() as the snapshot size */
+        u32                     perfmon_start;  /* first perfmon id owned, see CONTAINS_PERFMON() */
+        u32                     perfmon_count;  /* number of perfmon ids owned, see CONTAINS_PERFMON() */
+};
+/* should correlate with size of gk20a_cs_snapshot_fifo_entry::perfmon_id */
+#define CSS_MAX_PERFMON_IDS     256
+/* local definitions to avoid hardcoded sizes and shifts */
+#define PM_BITMAP_SIZE  DIV_ROUND_UP(CSS_MAX_PERFMON_IDS, BITS_PER_LONG)
+/* cycle stats snapshot control structure for one HW entry and many clients */
+struct gk20a_cs_snapshot {
+        unsigned long perfmon_ids[PM_BITMAP_SIZE];      /* bitmap sized for CSS_MAX_PERFMON_IDS bits */
+        struct list_head        clients;        /* attached gk20a_cs_snapshot_client entries */
+        struct mem_desc         hw_memdesc;
+        /* pointer to the allocated cpu_va memory where the GPU places data;
+         * NULL makes css_hw_check_data_available() return -EINVAL
+         */
+        struct gk20a_cs_snapshot_fifo_entry     *hw_snapshot;
+        struct gk20a_cs_snapshot_fifo_entry     *hw_end;
+        struct gk20a_cs_snapshot_fifo_entry     *hw_get;        /* updated by the flush code after it walks the HW entries */
+};
+void gk20a_init_css_ops(struct gpu_ops *gops);
+#endif /* CSS_GR_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 669ef1b9..1ca8ff77 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -657,6 +657,24 @@ struct gpu_ops {
        struct {
                int (*init)(struct gk20a *g);
        } bios;
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+        struct {        /*
+                         * Cycle stats snapshot (css) hooks.
+                         *
+                         * css_gr_gk20a.c calls enable_snapshot,
+                         * disable_snapshot and check_data_available
+                         * unconditionally. It NULL-checks
+                         * set_handled_snapshots, allocate_perfmon_ids,
+                         * release_perfmon_ids and detach_snapshot before
+                         * use, so those may be left unset. The return value
+                         * of detach_snapshot is not examined by
+                         * gr_gk20a_css_detach().
+                         */
+                int (*enable_snapshot)(struct channel_gk20a *ch,
+                                struct gk20a_cs_snapshot_client *client);
+                void (*disable_snapshot)(struct gr_gk20a *gr);
+                int (*check_data_available)(struct channel_gk20a *ch,
+                                                u32 *pending,
+                                                bool *hw_overflow);
+                void (*set_handled_snapshots)(struct gk20a *g, u32 num);
+                u32 (*allocate_perfmon_ids)(struct gk20a_cs_snapshot *data,
+                                       u32 count);
+                u32 (*release_perfmon_ids)(struct gk20a_cs_snapshot *data,
+                                      u32 start,
+                                      u32 count);
+                int (*detach_snapshot)(struct channel_gk20a *ch,
+                                struct gk20a_cs_snapshot_client *client);
+        } css;
+#endif
 };
 struct nvgpu_bios_ucode {
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 2a351bc3..c337a74a 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -603,16 +603,16 @@ int gr_gk20a_halt_pipe(struct gk20a *g);
 int gr_gk20a_debugfs_init(struct gk20a *g);
 #if defined(CONFIG_GK20A_CYCLE_STATS)
-int gr_gk20a_css_attach(struct gk20a *g,        /* in - main hw structure */
+int gr_gk20a_css_attach(struct channel_gk20a *ch,   /* in - channel (hw struct is ch->g) */
-                        u32 dmabuf_fd,          /* in - dma mapped memory */
+                        u32 dmabuf_fd,              /* in - dma mapped memory */
-                        u32 perfmon_id_count,   /* in - number of perfmons*/
+                        u32 perfmon_id_count,       /* in - number of perfmons */
-                        u32 *perfmon_id_start,  /* out- index of first pm */
+                        u32 *perfmon_id_start,      /* out - index of first pm */
                        /* out - pointer to client data used in later     */
                        struct gk20a_cs_snapshot_client **css_client);
-int gr_gk20a_css_detach(struct gk20a *g,
+int gr_gk20a_css_detach(struct channel_gk20a *ch,
                                struct gk20a_cs_snapshot_client *css_client);
-int gr_gk20a_css_flush(struct gk20a *g,
+int gr_gk20a_css_flush(struct channel_gk20a *ch,
                                struct gk20a_cs_snapshot_client *css_client);
 void gr_gk20a_free_cyclestats_snapshot_data(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
index 4da7ffad..550dffa6 100644
--- a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
@@ -32,6 +32,7 @@
 #include "hw_proj_gk20a.h"
 #include "tsg_gk20a.h"
 #include "dbg_gpu_gk20a.h"
+#include "css_gr_gk20a.h"
 static struct gpu_ops gk20a_ops = {
        .clock_gating = {
@@ -157,6 +158,9 @@ int gk20a_init_hal(struct gk20a *g)
        gk20a_init_dbg_session_ops(gops);
        gk20a_init_therm_ops(gops);
        gk20a_init_tsg_ops(gops);
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+        gk20a_init_css_ops(gops);
+#endif
        gops->name = "gk20a";
        gops->chip_init_gpu_characteristics = gk20a_init_gpu_characteristics;
        gops->get_litter_value = gk20a_get_litter_value;
diff --git a/drivers/gpu/nvgpu/gm206/hal_gm206.c b/drivers/gpu/nvgpu/gm206/hal_gm206.c
index 3c6897ea..6b43c8e9 100644
--- a/drivers/gpu/nvgpu/gm206/hal_gm206.c
+++ b/drivers/gpu/nvgpu/gm206/hal_gm206.c
@@ -41,6 +41,7 @@
 #include "gr_gm206.h"
 #include "hw_proj_gm206.h"
 #include "gk20a/dbg_gpu_gk20a.h"
+#include "gk20a/css_gr_gk20a.h"
 static struct gpu_ops gm206_ops = {
        .clock_gating = {
@@ -199,6 +200,9 @@ int gm206_init_hal(struct gk20a *g)
        gm20b_init_cde_ops(gops);
        gm20b_init_therm_ops(gops);
        gk20a_init_tsg_ops(gops);
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+        gk20a_init_css_ops(gops);
+#endif
        gm206_init_bios(gops);
        switch(ver){
        case GK20A_GPUID_GM206:
diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
index 34e3b605..e30ca96f 100644
--- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
@@ -36,6 +36,7 @@
 #include "therm_gm20b.h"
 #include "hw_proj_gm20b.h"
 #include "gk20a/dbg_gpu_gk20a.h"
+#include "gk20a/css_gr_gk20a.h"
 #define FUSE_OPT_PRIV_SEC_DIS_0 0x264
 #define PRIV_SECURITY_DISABLE 0x01
@@ -226,6 +227,9 @@ int gm20b_init_hal(struct gk20a *g)
        gm20b_init_cde_ops(gops);
        gm20b_init_therm_ops(gops);
        gk20a_init_tsg_ops(gops);
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+        gk20a_init_css_ops(gops);
+#endif
        gops->name = "gm20b";
        gops->chip_init_gpu_characteristics = gk20a_init_gpu_characteristics;
        gops->get_litter_value = gm20b_get_litter_value;
diff --git a/drivers/gpu/nvgpu/vgpu/css_vgpu.c b/drivers/gpu/nvgpu/vgpu/css_vgpu.c
new file mode 100644
index 00000000..486d3e88
--- /dev/null
+++ b/drivers/gpu/nvgpu/vgpu/css_vgpu.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+#include <linux/kernel.h>
+#include <linux/nvhost.h>
+#include <linux/tegra-ivc.h>
+#include <linux/tegra_vgpu.h>
+#include "gk20a/gk20a.h"
+#include "gk20a/channel_gk20a.h"
+#include "gk20a/platform_gk20a.h"
+#include "gk20a/css_gr_gk20a.h"
+#include "vgpu.h"
+/*
+ * Bookkeeping for a hardware snapshot buffer tied to a tegra_hv_ivm_cookie.
+ * NOTE(review): nothing visible in this file references this struct; the
+ * functions below keep the mempool cookie in the css_cookie global instead.
+ * Field roles are inferred from their names only -- confirm before relying
+ * on them.
+ */
+struct vgpu_hw_snapshot_buffer {
+        struct tegra_hv_ivm_cookie *cookie; /* presumably the reserved mempool */
+        void *buf; /* presumably the mapped address of the buffer */
+        struct gk20a_cs_snapshot_fifo_entry *end; /* presumably one past the last entry */
+        struct gk20a_cs_snapshot_fifo_entry *src_get; /* presumably the read position */
+        struct gk20a_cs_snapshot_fifo_entry *src_put; /* presumably the write position */
+};
+/*
+ * Cookie returned by tegra_hv_mempool_reserve() for the "mempool-css" pool.
+ * Assigned in vgpu_css_init_snapshot_buffer() and handed back to
+ * tegra_hv_mempool_unreserve() in vgpu_css_release_snapshot_buffer().
+ */
+struct tegra_hv_ivm_cookie *css_cookie;
+/*
+ * Reserve the "mempool-css" hypervisor mempool named in the device tree, map
+ * it and install it as the hardware snapshot buffer of gr->cs_data.
+ *
+ * Returns 0 on success, or immediately when a buffer is already installed.
+ * On failure returns a negative error code: the value from the device-tree
+ * lookup, -EINVAL when the reservation or the mapping fails, or -ENOMEM when
+ * the mempool is smaller than CSS_MIN_HW_SNAPSHOT_SIZE.
+ *
+ * The reservation is tracked in a local variable and published to the
+ * css_cookie global only once everything succeeded, so a failed attempt can
+ * never leave an error pointer or an already unreserved cookie behind, and
+ * the error path never touches a cookie it did not reserve itself.
+ *
+ * NOTE(review): of_parse_phandle_with_fixed_args() returns args.np with a
+ * reference held. Whether tegra_hv_mempool_reserve() takes its own reference
+ * is not visible here, so no of_node_put() is added -- confirm.
+ */
+int vgpu_css_init_snapshot_buffer(struct gr_gk20a *gr)
+{
+        struct gk20a *g = gr->g;
+        struct device *dev = g->dev;
+        struct gk20a_cs_snapshot *data = gr->cs_data;
+        struct device_node *np = dev->of_node;
+        struct tegra_hv_ivm_cookie *cookie;
+        struct of_phandle_args args;
+        void *buf;
+        u32 mempool;
+        int err;
+        gk20a_dbg_fn("");
+        /* A buffer is already mapped; nothing to do. */
+        if (data->hw_snapshot)
+                return 0;
+        err = of_parse_phandle_with_fixed_args(np,
+                        "mempool-css", 1, 0, &args);
+        if (err) {
+                dev_info(dev_from_gk20a(g), "dt missing mempool-css\n");
+                /* Nothing has been reserved yet, so there is nothing to undo. */
+                return err;
+        }
+        mempool = args.args[0];
+        cookie = tegra_hv_mempool_reserve(args.np, mempool);
+        if (IS_ERR(cookie)) {
+                dev_info(dev_from_gk20a(g),
+                        "mempool  %u reserve failed\n", mempool);
+                return -EINVAL;
+        }
+        /* Make sure buffer size is large enough */
+        if (cookie->size < CSS_MIN_HW_SNAPSHOT_SIZE) {
+                dev_info(dev_from_gk20a(g), "mempool size %lld too small\n",
+                        cookie->size);
+                err = -ENOMEM;
+                goto fail_unreserve;
+        }
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0)
+        buf = ioremap_cached(cookie->ipa, cookie->size);
+#else
+        buf = ioremap_cache(cookie->ipa, cookie->size);
+#endif
+        if (!buf) {
+                dev_info(dev_from_gk20a(g), "ioremap_cache failed\n");
+                err = -EINVAL;
+                goto fail_unreserve;
+        }
+        /* Everything succeeded: publish the reservation and the mapping. */
+        css_cookie = cookie;
+        data->hw_snapshot = buf;
+        data->hw_end = data->hw_snapshot +
+                cookie->size / sizeof(struct gk20a_cs_snapshot_fifo_entry);
+        data->hw_get = data->hw_snapshot;
+        /* Fill the whole buffer with 0xff (presumably the "empty entry" pattern). */
+        memset(data->hw_snapshot, 0xff, cookie->size);
+        return 0;
+fail_unreserve:
+        tegra_hv_mempool_unreserve(cookie);
+        return err;
+}
+/*
+ * Unmap the hardware snapshot buffer and hand its mempool back through
+ * tegra_hv_mempool_unreserve(). Does nothing when no buffer is mapped.
+ */
+static void vgpu_css_release_snapshot_buffer(struct gr_gk20a *gr)
+{
+        struct gk20a_cs_snapshot *data = gr->cs_data;
+        if (!data->hw_snapshot)
+                return;
+        iounmap(data->hw_snapshot);
+        data->hw_snapshot = NULL;
+        tegra_hv_mempool_unreserve(css_cookie);
+        /* Forget the cookie so it can never be unreserved a second time. */
+        css_cookie = NULL;
+        gk20a_dbg_info("cyclestats(vgpu): buffer for snapshots released\n");
+}
+/*
+ * Send a FLUSH cyclestats-snapshot command for channel @ch and report what
+ * the reply says is available.
+ *
+ * The request carries the byte offset of data->hw_get from the start of the
+ * snapshot buffer in buf_info. On success *pending receives the buf_info
+ * value of the reply and *hw_overflow its hw_overflow flag.
+ *
+ * Returns 0 on success. On failure returns the error from
+ * vgpu_comm_sendrecv() or, if that succeeded, msg.ret -- the same convention
+ * as vgpu_css_attach()/vgpu_css_detach() -- and sets *pending to 0 and
+ * *hw_overflow to false so callers never see values from a failed exchange.
+ */
+static int vgpu_css_flush_snapshots(struct channel_gk20a *ch,
+                        u32 *pending, bool *hw_overflow)
+{
+        struct gk20a *g = ch->g;
+        struct tegra_vgpu_cmd_msg msg = {};
+        struct tegra_vgpu_channel_cyclestats_snapshot_params *p;
+        struct gr_gk20a *gr = &g->gr;
+        struct gk20a_cs_snapshot *data = gr->cs_data;
+        int err;
+        gk20a_dbg_fn("");
+        msg.cmd = TEGRA_VGPU_CMD_CHANNEL_CYCLESTATS_SNAPSHOT;
+        msg.handle = vgpu_get_handle(g);
+        p = &msg.params.cyclestats_snapshot;
+        p->handle = ch->virt_ctx;
+        p->subcmd = NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_FLUSH;
+        /* client->srvr: current get position as a byte offset into the buffer */
+        p->buf_info = (uintptr_t)data->hw_get - (uintptr_t)data->hw_snapshot;
+        err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
+        err = err ? err : msg.ret;
+        if (err) {
+                gk20a_err(dev_from_gk20a(g), "%s failed", __func__);
+                *pending = 0;
+                *hw_overflow = false;
+                return err;
+        }
+        /* srvr->client: number of pending entries and the overflow flag */
+        *pending = p->buf_info;
+        *hw_overflow = p->hw_overflow;
+        return 0;
+}
+/*
+ * Send an ATTACH cyclestats-snapshot command for channel @ch, requesting
+ * cs_client->perfmon_count perfmons. On success the perfmon_start value of
+ * the reply is stored in @cs_client.
+ *
+ * Returns 0 on success, otherwise the error from vgpu_comm_sendrecv() or the
+ * ret field of the reply.
+ */
+static int vgpu_css_attach(struct channel_gk20a *ch,
+                struct gk20a_cs_snapshot_client *cs_client)
+{
+        struct tegra_vgpu_cmd_msg msg = {};
+        struct tegra_vgpu_channel_cyclestats_snapshot_params *params;
+        struct gk20a *g = ch->g;
+        int rc;
+        gk20a_dbg_fn("");
+        msg.cmd = TEGRA_VGPU_CMD_CHANNEL_CYCLESTATS_SNAPSHOT;
+        msg.handle = vgpu_get_handle(g);
+        params = &msg.params.cyclestats_snapshot;
+        params->handle = ch->virt_ctx;
+        params->subcmd = NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_ATTACH;
+        params->perfmon_count = cs_client->perfmon_count;
+        rc = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
+        /* A transport error takes precedence over the status in the reply. */
+        if (!rc)
+                rc = msg.ret;
+        if (rc) {
+                gk20a_err(dev_from_gk20a(g), "%s failed", __func__);
+                return rc;
+        }
+        cs_client->perfmon_start = params->perfmon_start;
+        return rc;
+}
+/*
+ * Send a DETACH cyclestats-snapshot command for channel @ch, passing the
+ * perfmon_start and perfmon_count values recorded in @cs_client.
+ *
+ * Returns 0 on success, otherwise the error from vgpu_comm_sendrecv() or the
+ * ret field of the reply.
+ */
+static int vgpu_css_detach(struct channel_gk20a *ch,
+                struct gk20a_cs_snapshot_client *cs_client)
+{
+        struct tegra_vgpu_cmd_msg msg = {};
+        struct tegra_vgpu_channel_cyclestats_snapshot_params *params;
+        struct gk20a *g = ch->g;
+        int rc;
+        gk20a_dbg_fn("");
+        msg.cmd = TEGRA_VGPU_CMD_CHANNEL_CYCLESTATS_SNAPSHOT;
+        msg.handle = vgpu_get_handle(g);
+        params = &msg.params.cyclestats_snapshot;
+        params->handle = ch->virt_ctx;
+        params->subcmd = NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_DETACH;
+        params->perfmon_start = cs_client->perfmon_start;
+        params->perfmon_count = cs_client->perfmon_count;
+        rc = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
+        /* A transport error takes precedence over the status in the reply. */
+        if (!rc)
+                rc = msg.ret;
+        if (rc)
+                gk20a_err(dev_from_gk20a(g), "%s failed", __func__);
+        return rc;
+}
+/*
+ * enable_snapshot hook: attach @cs_client via vgpu_css_attach(), then make
+ * sure the shared snapshot buffer is set up. Returns the first error seen,
+ * or 0 when both steps succeed.
+ *
+ * NOTE(review): if buffer setup fails the attach is not undone here --
+ * confirm that the caller detaches on error.
+ */
+static int vgpu_css_enable_snapshot_buffer(struct channel_gk20a *ch,
+                                struct gk20a_cs_snapshot_client *cs_client)
+{
+        int status = vgpu_css_attach(ch, cs_client);
+        if (status == 0)
+                status = vgpu_css_init_snapshot_buffer(&ch->g->gr);
+        return status;
+}
+/*
+ * Install the vgpu cyclestats-snapshot handlers into the css section of
+ * @gops.
+ */
+void vgpu_init_css_ops(struct gpu_ops *gops)
+{
+        /* attach/detach of a snapshot client */
+        gops->css.enable_snapshot = vgpu_css_enable_snapshot_buffer;
+        gops->css.detach_snapshot = vgpu_css_detach;
+        /* flush request reporting pending entries and overflow */
+        gops->css.check_data_available = vgpu_css_flush_snapshots;
+        /* teardown of the shared snapshot buffer */
+        gops->css.disable_snapshot = vgpu_css_release_snapshot_buffer;
+}
+#endif /* CONFIG_GK20A_CYCLE_STATS */
diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
index 783b2f55..89223091 100644
--- a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
@@ -875,6 +875,10 @@ static int vgpu_gr_init_gr_setup_sw(struct gk20a *g)
        gr->g = g;
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+        mutex_init(&g->gr.cs_lock);
+#endif
        err = vgpu_gr_init_gr_config(g, gr);
        if (err)
                goto clean_up;
diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c b/drivers/gpu/nvgpu/vgpu/vgpu.c
index 27d98eb8..bd332583 100644
--- a/drivers/gpu/nvgpu/vgpu/vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/vgpu.c
@@ -334,6 +334,9 @@ void vgpu_init_hal_common(struct gk20a *g)
        vgpu_init_dbg_session_ops(gops);
        vgpu_init_fecs_trace_ops(gops);
        vgpu_init_tsg_ops(gops);
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+        vgpu_init_css_ops(gops);
+#endif
        gops->chip_init_gpu_characteristics = vgpu_init_gpu_characteristics;
        gops->read_ptimer = vgpu_read_ptimer;
 }
diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.h b/drivers/gpu/nvgpu/vgpu/vgpu.h
index 6f1059b8..4a7a6b6c 100644
--- a/drivers/gpu/nvgpu/vgpu/vgpu.h
+++ b/drivers/gpu/nvgpu/vgpu/vgpu.h
@@ -85,6 +85,9 @@ void vgpu_init_ltc_ops(struct gpu_ops *gops);
 void vgpu_init_mm_ops(struct gpu_ops *gops);
 void vgpu_init_debug_ops(struct gpu_ops *gops);
 void vgpu_init_tsg_ops(struct gpu_ops *gops);
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+void vgpu_init_css_ops(struct gpu_ops *gops);
+#endif
 int vgpu_init_mm_support(struct gk20a *g);
 int vgpu_init_gr_support(struct gk20a *g);
 int vgpu_init_fifo_support(struct gk20a *g);
@@ -161,6 +164,11 @@ static inline void vgpu_init_mm_ops(struct gpu_ops *gops)
 static inline void vgpu_init_debug_ops(struct gpu_ops *gops)
 {
 }
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+/*
+ * Empty stand-in for vgpu_init_css_ops(): installs nothing.
+ * NOTE(review): presumably this is the variant used when the vgpu code is
+ * not built, like the neighbouring inline stubs -- confirm against the
+ * enclosing #ifdef, which is outside this hunk.
+ */
+static inline void vgpu_init_css_ops(struct gpu_ops *gops)
+{
+}
+#endif
 static inline int vgpu_init_mm_support(struct gk20a *g)
 {
        return -ENOSYS;
diff --git a/include/linux/tegra_vgpu.h b/include/linux/tegra_vgpu.h
index fdab9b06..be8b9ad1 100644
--- a/include/linux/tegra_vgpu.h
+++ b/include/linux/tegra_vgpu.h
@@ -97,6 +97,7 @@ enum {
        TEGRA_VGPU_CMD_SET_POWERGATE = 60,
        TEGRA_VGPU_CMD_SET_GPU_CLK_RATE = 61,
        TEGRA_VGPU_CMD_GET_CONSTANTS = 62,
+        TEGRA_VGPU_CMD_CHANNEL_CYCLESTATS_SNAPSHOT = 63,
 };
 struct tegra_vgpu_connect_params {
@@ -437,6 +438,15 @@ struct tegra_vgpu_constants_params {
        u16 gpc_tpc_mask[TEGRA_VGPU_MAX_GPC_COUNT];
 };
+/*
+ * Parameters of TEGRA_VGPU_CMD_CHANNEL_CYCLESTATS_SNAPSHOT. Which fields are
+ * meaningful depends on subcmd (attach, detach or flush).
+ */
+struct tegra_vgpu_channel_cyclestats_snapshot_params {
+        u64 handle; /* channel handle; the client sends the channel's virt_ctx */
+        u32 perfmon_start; /* returned by attach, sent back on detach */
+        u32 perfmon_count; /* sent by the client on attach and detach */
+        u32 buf_info; /* client->srvr: get ptr; srvr->client: num pending */
+        u8 subcmd; /* one of NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_* */
+        u8 hw_overflow; /* srvr->client: read by the client after a flush */
+};
 struct tegra_vgpu_cmd_msg {
        u32 cmd;
        int ret;
@@ -481,6 +491,7 @@ struct tegra_vgpu_cmd_msg {
                struct tegra_vgpu_set_powergate_params set_powergate;
                struct tegra_vgpu_gpu_clk_rate_params gpu_clk_rate;
                struct tegra_vgpu_constants_params constants;
+                struct tegra_vgpu_channel_cyclestats_snapshot_params cyclestats_snapshot;
                char padding[192];
        } params;
 };
author	Peter Daifuku <pdaifuku@nvidia.com>	2016-08-31 20:04:56 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2016-09-08 19:04:09 -0400
commit	9aa7de15c2a644e9c7e9c157e49087e66d4ac3d0 (patch)
tree	e5080886f09aa75c6a3cc83e5b27f8f7553678a4
parent	70cad5fbb593602a49f91e57c04d1da0334b3a49 (diff)