From b3e023a8055d4346b30924a03a99286926e76a15 Mon Sep 17 00:00:00 2001
From: Arto Merilainen
Date: Mon, 21 Jul 2014 10:21:09 +0300
Subject: gpu: nvgpu: CDE support

This patch adds support for executing a precompiled GPU program to
allow exporting GPU buffers to other graphics units that have color
decompression engine (CDE) support.

Bug 1409151

Change-Id: Id0c930923f2449b85a6555de71d7ec93eed238ae
Signed-off-by: Arto Merilainen
Reviewed-on: http://git-master/r/360418
Reviewed-by: Lauri Peltonen
Reviewed-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/Kconfig           |   8 +
 drivers/gpu/nvgpu/gk20a/Makefile    |   1 +
 drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 924 ++++++++++++++++++++++++++++++++++++
 drivers/gpu/nvgpu/gk20a/cde_gk20a.h | 254 ++++++++++
 drivers/gpu/nvgpu/gk20a/gk20a.c     |   3 +
 drivers/gpu/nvgpu/gk20a/gk20a.h     |   4 +
 6 files changed, 1194 insertions(+)
 create mode 100644 drivers/gpu/nvgpu/gk20a/cde_gk20a.c
 create mode 100644 drivers/gpu/nvgpu/gk20a/cde_gk20a.h

diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig
index 315c4683..b863077a 100644
--- a/drivers/gpu/nvgpu/Kconfig
+++ b/drivers/gpu/nvgpu/Kconfig
@@ -12,6 +12,14 @@ config GK20A_DEFAULT_TIMEOUT
 	help
 	  Default timeout for jobs in milliseconds. Set to zero for no timeout.
 
+config GK20A_CDE
+	depends on GK20A
+	bool "Support compression bit swizzling through CDE"
+	default n
+	help
+	  Say Y to allow compression bit swizzling
+	  using a pre-compiled shader.
+
 config GK20A_PMU
 	bool "Support GK20A PMU"
 	depends on GK20A
diff --git a/drivers/gpu/nvgpu/gk20a/Makefile b/drivers/gpu/nvgpu/gk20a/Makefile
index 246f9447..aa9237b4 100644
--- a/drivers/gpu/nvgpu/gk20a/Makefile
+++ b/drivers/gpu/nvgpu/gk20a/Makefile
@@ -34,6 +34,7 @@ nvgpu-y := \
 	hal.o \
 	hal_gk20a.o \
 	gk20a_allocator.o \
+	cde_gk20a.o \
 	platform_gk20a_generic.o \
 	tsg_gk20a.o
 nvgpu-$(CONFIG_TEGRA_GK20A) += platform_gk20a_tegra.o
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
new file mode 100644
index 00000000..d01426be
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -0,0 +1,924 @@
+/*
+ * Color decompression engine support
+ *
+ * Copyright (c) 2014, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#include +#include +#include +#include +#include + +#include "gk20a.h" +#include "channel_gk20a.h" +#include "mm_gk20a.h" +#include "cde_gk20a.h" +#include "fence_gk20a.h" +#include "gr_gk20a.h" +#include "debug_gk20a.h" + +#include "hw_ccsr_gk20a.h" +#include "hw_pbdma_gk20a.h" + +void gk20a_cde_dump(struct gk20a_cde_ctx *cde_ctx) +{ + int i; + for (i = 0; i < cde_ctx->num_bufs; i++) { + struct gk20a_cde_mem_desc *target_mem = cde_ctx->mem + i; + u32 *target_mem_ptr = target_mem->cpuva; + int j = 0; + + gk20a_dbg(gpu_dbg_cde, "cde: buffer=%d, size=%zu, gpuva=%llx\n", + i, target_mem->num_bytes, target_mem->gpu_va); + + for (j = 0; j < target_mem->num_bytes / sizeof(u32); j++) + gk20a_dbg(gpu_dbg_cde, "0x%08x ", target_mem_ptr[j]); + gk20a_dbg(gpu_dbg_cde, "\n\n"); + } +} + +static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx) +{ + struct device *dev = &cde_ctx->pdev->dev; + int i; + + for (i = 0; i < cde_ctx->num_bufs; i++) { + struct gk20a_cde_mem_desc *mem = cde_ctx->mem + i; + gk20a_gmmu_unmap(cde_ctx->vm, mem->gpu_va, mem->num_bytes, 1); + gk20a_free_sgtable(&mem->sgt); + dma_free_coherent(dev, mem->num_bytes, mem->cpuva, mem->iova); + } + + for (i = 0; i < cde_ctx->num_obj_ids; i++) + gk20a_free_obj_ctx(cde_ctx->ch, + &(struct nvhost_free_obj_ctx_args) + { cde_ctx->obj_ids[i] }); + + kfree(cde_ctx->init_cmd); + kfree(cde_ctx->convert_cmd); + + cde_ctx->convert_cmd = NULL; + cde_ctx->init_cmd = NULL; + cde_ctx->num_bufs = 0; + cde_ctx->num_obj_ids = 0; + cde_ctx->num_params = 0; + cde_ctx->init_cmd_num_entries = 0; + cde_ctx->convert_cmd_num_entries = 0; +} + +static int gk20a_cde_remove(struct gk20a_cde_ctx *cde_ctx) +{ + struct gk20a *g = cde_ctx->g; + struct channel_gk20a *ch = cde_ctx->ch; + struct vm_gk20a *vm = ch->vm; + + /* free the channel */ + gk20a_free_channel(cde_ctx->ch, true); + + /* ..then release mapped memory */ + gk20a_deinit_cde_img(cde_ctx); + gk20a_gmmu_unmap(vm, cde_ctx->backing_store_vaddr, + g->gr.compbit_store.size, 1); + + return 0; +} + +int gk20a_cde_destroy(struct gk20a *g) +{ + struct gk20a_cde_app *cde_app = &g->cde_app; + struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx; + int ret, i; + + if (!cde_app->initialised) + return 0; + + for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) + ret = gk20a_cde_remove(cde_ctx); + + cde_app->initialised = false; + return ret; +} + +static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx, + const struct firmware *img, + struct gk20a_cde_hdr_buf *buf) +{ + struct device *dev = &cde_ctx->pdev->dev; + struct gk20a_cde_mem_desc *mem; + int err; + + /* check that the file can hold the buf */ + if (buf->data_byte_offset != 0 && + buf->data_byte_offset + buf->num_bytes > img->size) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid data section. buffer idx = %d", + cde_ctx->num_bufs); + return -EINVAL; + } + + /* check that we have enough buf elems available */ + if (cde_ctx->num_bufs > MAX_CDE_BUFS) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid data section. buffer idx = %d", + cde_ctx->num_bufs); + return -ENOMEM; + } + + /* allocate buf */ + mem = cde_ctx->mem + cde_ctx->num_bufs; + mem->num_bytes = buf->num_bytes; + mem->cpuva = dma_alloc_coherent(dev, mem->num_bytes, &mem->iova, + GFP_KERNEL); + if (!mem->cpuva) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: could not allocate device memory. 
buffer idx = %d", + cde_ctx->num_bufs); + return -ENOMEM; + } + + err = gk20a_get_sgtable(dev, &mem->sgt, mem->cpuva, mem->iova, + mem->num_bytes); + if (err) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: could not get sg table. buffer idx = %d", + cde_ctx->num_bufs); + err = -ENOMEM; + goto err_get_sgtable; + } + + mem->gpu_va = gk20a_gmmu_map(cde_ctx->vm, &mem->sgt, mem->num_bytes, 0, + gk20a_mem_flag_none); + if (!mem->gpu_va) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: could not map buffer to gpuva. buffer idx = %d", + cde_ctx->num_bufs); + err = -ENOMEM; + goto err_map_buffer; + } + + /* copy the content */ + if (buf->data_byte_offset != 0) + memcpy(mem->cpuva, img->data + buf->data_byte_offset, + buf->num_bytes); + + cde_ctx->num_bufs++; + + return 0; + +err_map_buffer: + gk20a_free_sgtable(&mem->sgt); + kfree(mem->sgt); +err_get_sgtable: + dma_free_coherent(dev, mem->num_bytes, &mem->cpuva, mem->iova); + return err; +} + +static int gk20a_replace_data(struct gk20a_cde_ctx *cde_ctx, void *target, + int type, s32 shift, u64 mask, u64 value) +{ + u32 *target_mem_ptr = target; + u64 *target_mem_ptr_u64 = target; + u64 current_value, new_value; + + value = (shift >= 0) ? value << shift : value >> -shift; + value &= mask; + + /* read current data from the location */ + if (type == TYPE_PARAM_TYPE_U32) + current_value = *target_mem_ptr; + else if (type == TYPE_PARAM_TYPE_U64_LITTLE) + current_value = *target_mem_ptr_u64; + else if (type == TYPE_PARAM_TYPE_U64_BIG) { + current_value = *target_mem_ptr_u64; + current_value = (u64)(current_value >> 32) | + (u64)(current_value << 32); + } else { + gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown type. type=%d", + type); + return -EINVAL; + } + + current_value &= ~mask; + new_value = current_value | value; + + /* store the element data back */ + if (type == TYPE_PARAM_TYPE_U32) + *target_mem_ptr = (u32)new_value; + else if (type == TYPE_PARAM_TYPE_U64_LITTLE) + *target_mem_ptr_u64 = new_value; + else { + new_value = (u64)(new_value >> 32) | + (u64)(new_value << 32); + *target_mem_ptr_u64 = new_value; + } + + return 0; +} + +static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx, + const struct firmware *img, + struct gk20a_cde_hdr_replace *replace) +{ + struct gk20a_cde_mem_desc *source_mem; + struct gk20a_cde_mem_desc *target_mem; + u32 *target_mem_ptr; + u64 vaddr; + int err; + + if (replace->target_buf >= cde_ctx->num_bufs || + replace->source_buf >= cde_ctx->num_bufs) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid buffer. target_buf=%u, source_buf=%u, num_bufs=%d", + replace->target_buf, replace->source_buf, + cde_ctx->num_bufs); + return -EINVAL; + } + + source_mem = cde_ctx->mem + replace->source_buf; + target_mem = cde_ctx->mem + replace->target_buf; + target_mem_ptr = target_mem->cpuva; + + if (source_mem->num_bytes < (replace->source_byte_offset + 3) || + target_mem->num_bytes < (replace->target_byte_offset + 3)) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid buffer offsets. 
target_buf_offs=%lld, source_buf_offs=%lld, source_buf_size=%zu, dest_buf_size=%zu", + replace->target_byte_offset, + replace->source_byte_offset, + source_mem->num_bytes, + target_mem->num_bytes); + return -EINVAL; + } + + /* calculate the target pointer */ + target_mem_ptr += (replace->target_byte_offset / sizeof(u32)); + + /* determine patch value */ + vaddr = source_mem->gpu_va + replace->source_byte_offset; + err = gk20a_replace_data(cde_ctx, target_mem_ptr, replace->type, + replace->shift, replace->mask, + vaddr); + if (err) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: replace failed. err=%d, target_buf=%u, target_buf_offs=%lld, source_buf=%u, source_buf_offs=%lld", + err, replace->target_buf, + replace->target_byte_offset, + replace->source_buf, + replace->source_byte_offset); + } + + return err; +} + +static int gk20a_cde_patch_params(struct gk20a_cde_ctx *cde_ctx) +{ + struct gk20a *g = cde_ctx->g; + struct gk20a_cde_mem_desc *target_mem; + u32 *target_mem_ptr; + u64 new_data; + int user_id = 0, i, err; + + for (i = 0; i < cde_ctx->num_params; i++) { + struct gk20a_cde_hdr_param *param = cde_ctx->params + i; + target_mem = cde_ctx->mem + param->target_buf; + target_mem_ptr = target_mem->cpuva; + target_mem_ptr += (param->target_byte_offset / sizeof(u32)); + + switch (param->id) { + case TYPE_PARAM_COMPTAGS_PER_CACHELINE: + new_data = g->gr.comptags_per_cacheline; + break; + case TYPE_PARAM_GPU_CONFIGURATION: + new_data = g->ltc_count * g->gr.slices_per_ltc * + g->gr.cacheline_size; + break; + case TYPE_PARAM_FIRSTPAGEOFFSET: + new_data = cde_ctx->src_param_offset; + break; + case TYPE_PARAM_NUMPAGES: + new_data = cde_ctx->src_param_lines; + break; + case TYPE_PARAM_BACKINGSTORE: + new_data = cde_ctx->backing_store_vaddr; + break; + case TYPE_PARAM_DESTINATION: + new_data = cde_ctx->dest_vaddr; + break; + case TYPE_PARAM_DESTINATION_SIZE: + new_data = cde_ctx->dest_size; + break; + case TYPE_PARAM_BACKINGSTORE_SIZE: + new_data = g->gr.compbit_store.size; + break; + case TYPE_PARAM_SOURCE_SMMU_ADDR: + new_data = gk20a_mm_gpuva_to_iova(cde_ctx->vm, + cde_ctx->src_vaddr); + if (new_data == 0) + err = -EINVAL; + break; + default: + user_id = param->id - NUM_RESERVED_PARAMS; + if (user_id < 0 || user_id >= MAX_CDE_USER_PARAMS) + continue; + new_data = cde_ctx->user_param_values[user_id]; + } + + gk20a_dbg(gpu_dbg_cde, "cde: patch: idx_in_file=%d param_id=%d target_buf=%u target_byte_offset=%lld data_value=0x%llx data_offset/data_diff=%lld data_type=%d data_shift=%d data_mask=0x%llx", + i, param->id, param->target_buf, + param->target_byte_offset, new_data, + param->data_offset, param->type, param->shift, + param->mask); + + new_data += param->data_offset; + + err = gk20a_replace_data(cde_ctx, target_mem_ptr, param->type, + param->shift, param->mask, new_data); + + if (err) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: patch failed. err=%d, idx=%d, id=%d, target_buf=%u, target_buf_offs=%lld, patch_value=%llu", + err, i, param->id, param->target_buf, + param->target_byte_offset, new_data); + return err; + } + } + + return 0; +} + +static int gk20a_init_cde_param(struct gk20a_cde_ctx *cde_ctx, + const struct firmware *img, + struct gk20a_cde_hdr_param *param) +{ + struct gk20a_cde_mem_desc *target_mem; + + if (param->target_buf >= cde_ctx->num_bufs) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid buffer parameter. 
param idx = %d, target_buf=%u, num_bufs=%u", + cde_ctx->num_params, param->target_buf, + cde_ctx->num_bufs); + return -EINVAL; + } + + target_mem = cde_ctx->mem + param->target_buf; + if (target_mem->num_bytes < (param->target_byte_offset + 3)) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid buffer parameter. param idx = %d, target_buf_offs=%lld, target_buf_size=%zu", + cde_ctx->num_params, param->target_byte_offset, + target_mem->num_bytes); + return -EINVAL; + } + + /* does this parameter fit into our parameter structure */ + if (cde_ctx->num_params >= MAX_CDE_PARAMS) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: no room for new parameters param idx = %d", + cde_ctx->num_params); + return -ENOMEM; + } + + /* is the given id valid? */ + if (param->id >= NUM_RESERVED_PARAMS + MAX_CDE_USER_PARAMS) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: parameter id is not valid. param idx = %d, id=%u, max=%u", + param->id, cde_ctx->num_params, + NUM_RESERVED_PARAMS + MAX_CDE_USER_PARAMS); + return -EINVAL; + } + + cde_ctx->params[cde_ctx->num_params] = *param; + cde_ctx->num_params++; + + return 0; +} + +static int gk20a_init_cde_required_class(struct gk20a_cde_ctx *cde_ctx, + const struct firmware *img, + u32 required_class) +{ + struct nvhost_alloc_obj_ctx_args alloc_obj_ctx; + int err; + + if (cde_ctx->num_obj_ids >= MAX_CDE_OBJ_IDS) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: running out of class ids"); + return -ENOMEM; + } + + alloc_obj_ctx.class_num = required_class; + alloc_obj_ctx.padding = 0; + + err = gk20a_alloc_obj_ctx(cde_ctx->ch, &alloc_obj_ctx); + if (err) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: failed to allocate ctx. err=%d", + err); + return err; + } + + cde_ctx->obj_ids[cde_ctx->num_obj_ids] = alloc_obj_ctx.obj_id; + cde_ctx->num_obj_ids++; + + return 0; +} + +static int gk20a_init_cde_command(struct gk20a_cde_ctx *cde_ctx, + const struct firmware *img, + u32 op, + struct gk20a_cde_cmd_elem *cmd_elem, + u32 num_elems) +{ + struct nvhost_gpfifo **gpfifo, *gpfifo_elem; + u32 *num_entries; + int i; + + /* check command type */ + if (op == TYPE_BUF_COMMAND_INIT) { + gpfifo = &cde_ctx->init_cmd; + num_entries = &cde_ctx->init_cmd_num_entries; + } else if (op == TYPE_BUF_COMMAND_CONVERT) { + gpfifo = &cde_ctx->convert_cmd; + num_entries = &cde_ctx->convert_cmd_num_entries; + } else { + gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown command. 
op=%u", + op); + return -EINVAL; + } + + /* allocate gpfifo entries to be pushed */ + *gpfifo = kzalloc(sizeof(struct nvhost_gpfifo) * num_elems, + GFP_KERNEL); + if (!*gpfifo) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: could not allocate memory for gpfifo entries"); + return -ENOMEM; + } + + gpfifo_elem = *gpfifo; + for (i = 0; i < num_elems; i++, cmd_elem++, gpfifo_elem++) { + struct gk20a_cde_mem_desc *target_mem; + + /* validate the current entry */ + if (cmd_elem->target_buf >= cde_ctx->num_bufs) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: target buffer is not available (target=%u, num_bufs=%u)", + cmd_elem->target_buf, cde_ctx->num_bufs); + return -EINVAL; + } + + target_mem = cde_ctx->mem + cmd_elem->target_buf; + if (target_mem->num_bytes < + cmd_elem->target_byte_offset + cmd_elem->num_bytes) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: target buffer cannot hold all entries (target_size=%zu, target_byte_offset=%lld, num_bytes=%llu)", + target_mem->num_bytes, + cmd_elem->target_byte_offset, + cmd_elem->num_bytes); + return -EINVAL; + } + + /* store the element into gpfifo */ + gpfifo_elem->entry0 = + u64_lo32(target_mem->gpu_va + + cmd_elem->target_byte_offset); + gpfifo_elem->entry1 = + u64_hi32(target_mem->gpu_va + + cmd_elem->target_byte_offset) | + pbdma_gp_entry1_length_f(cmd_elem->num_bytes / + sizeof(u32)); + } + + *num_entries = num_elems; + return 0; +} + +static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx, + const struct firmware *img) +{ + u32 *data = (u32 *)img->data; + u32 version, num_of_elems; + struct gk20a_cde_hdr_elem *elem; + u32 min_size = 0; + int err = 0; + int i; + + min_size += 2 * sizeof(u32); + if (img->size < min_size) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid image header"); + return -EINVAL; + } + + version = data[0]; + num_of_elems = data[1]; + + min_size += num_of_elems * sizeof(*elem); + if (img->size < min_size) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: bad image"); + return -EINVAL; + } + + elem = (struct gk20a_cde_hdr_elem *)&data[2]; + for (i = 0; i < num_of_elems; i++) { + int err = 0; + switch (elem->type) { + case TYPE_BUF: + err = gk20a_init_cde_buf(cde_ctx, img, &elem->buf); + break; + case TYPE_REPLACE: + err = gk20a_init_cde_replace(cde_ctx, img, + &elem->replace); + break; + case TYPE_PARAM: + err = gk20a_init_cde_param(cde_ctx, img, &elem->param); + break; + case TYPE_REQUIRED_CLASS: + err = gk20a_init_cde_required_class(cde_ctx, img, + elem->required_class); + break; + case TYPE_COMMAND: + { + struct gk20a_cde_cmd_elem *cmd = (void *) + &img->data[elem->command.data_byte_offset]; + err = gk20a_init_cde_command(cde_ctx, img, + elem->command.op, cmd, + elem->command.num_entries); + break; + } + default: + gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown header element"); + err = -EINVAL; + } + + if (err) + goto deinit_image; + + elem++; + } + + if (!cde_ctx->init_cmd || !cde_ctx->init_cmd_num_entries) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: convert command not defined"); + err = -EINVAL; + goto deinit_image; + } + + if (!cde_ctx->convert_cmd || !cde_ctx->convert_cmd_num_entries) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: convert command not defined"); + err = -EINVAL; + goto deinit_image; + } + + return 0; + +deinit_image: + gk20a_deinit_cde_img(cde_ctx); + return err; +} + +static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx, + u32 op, struct nvhost_fence *fence, + u32 flags, struct gk20a_fence **fence_out) +{ + struct nvhost_gpfifo *gpfifo = NULL; + int num_entries = 0; + + /* check command type */ + if (op == 
TYPE_BUF_COMMAND_INIT) { + gpfifo = cde_ctx->init_cmd; + num_entries = cde_ctx->init_cmd_num_entries; + } else if (op == TYPE_BUF_COMMAND_CONVERT) { + gpfifo = cde_ctx->convert_cmd; + num_entries = cde_ctx->convert_cmd_num_entries; + } else { + gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown buffer"); + return -EINVAL; + } + + if (gpfifo == NULL || num_entries == 0) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: buffer not available"); + return -ENOSYS; + } + + return gk20a_submit_channel_gpfifo(cde_ctx->ch, gpfifo, + num_entries, flags, fence, fence_out); +} + +int gk20a_cde_convert(struct gk20a *g, u32 src_fd, u32 dst_fd, + s32 dst_kind, u64 dst_byte_offset, + u32 dst_size, struct nvhost_fence *fence, + u32 __flags, struct gk20a_cde_param *params, + int num_params, struct gk20a_fence **fence_out) +{ + struct gk20a_cde_app *cde_app = &g->cde_app; + struct gk20a_comptags comptags; + struct gk20a_cde_ctx *cde_ctx; + struct dma_buf *src = NULL, *dst = NULL; + u64 dst_vaddr = 0, src_vaddr = 0; + u32 flags; + int err, i; + + if (!cde_app->initialised) { + gk20a_warn(&g->dev->dev, "cde: conversion requrest but no image has been provided"); + return -ENOSYS; + } + + mutex_lock(&cde_app->mutex); + + /* pick next free cde context */ + cde_ctx = cde_app->cde_ctx + cde_app->cde_ctx_ptr; + cde_app->cde_ctx_ptr = (cde_app->cde_ctx_ptr + 1) % + ARRAY_SIZE(cde_app->cde_ctx); + + /* First, get buffer references and map the buffers to local va */ + + dst = dma_buf_get(dst_fd); + if (IS_ERR(src)) { + dst = NULL; + err = -EINVAL; + goto exit_unlock; + } + + /* ensure that the dst buffer has drvdata */ + err = gk20a_dmabuf_alloc_drvdata(dst, &g->dev->dev); + if (err) + goto exit_unlock; + + /* map the destination buffer */ + dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0, + 0, dst_kind, NULL, true, + gk20a_mem_flag_none, + 0, 0); + if (!dst_vaddr) { + err = -EINVAL; + goto exit_unlock; + } + + src = dma_buf_get(src_fd); + if (IS_ERR(src)) { + src = NULL; + err = -EINVAL; + goto exit_unlock; + } + + /* ensure that the src buffer has drvdata */ + err = gk20a_dmabuf_alloc_drvdata(src, &g->dev->dev); + if (err) + goto exit_unlock; + + /* map the source buffer to prevent premature release */ + src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0, + 0, dst_kind, NULL, true, + gk20a_mem_flag_none, + 0, 0); + if (!src_vaddr) { + err = -EINVAL; + goto exit_unlock; + } + + if (!dst_size) + dst_size = dst->size - dst_byte_offset; + + /* reload buffer converter if it has failed */ + if (cde_ctx->ch->has_timedout) { + mutex_unlock(&cde_app->mutex); + err = gk20a_cde_reload(g); + if (err) + return err; + mutex_lock(&cde_app->mutex); + } + + /* wait for channel idle */ + err = gk20a_channel_finish(cde_ctx->ch, 2000); + if (err) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: old work could not be finished"); + goto exit_unlock; + } + + /* disable the channel */ + gk20a_writel(g, ccsr_channel_r(cde_ctx->ch->hw_chid), + gk20a_readl(g, ccsr_channel_r(cde_ctx->ch->hw_chid)) | + ccsr_channel_enable_clr_true_f()); + gk20a_fifo_preempt_channel(g, cde_ctx->ch->hw_chid); + channel_gk20a_unbind(&g->fifo.channel[cde_ctx->ch->hw_chid]); + + /* reinitialise the graphics context of the channel */ + gr_gk20a_load_golden_ctx_image(g, cde_ctx->ch); + + /* re-enable the channel */ + g->ops.fifo.bind_channel(&g->fifo.channel[cde_ctx->ch->hw_chid]); + gk20a_writel(g, ccsr_channel_r(cde_ctx->ch->hw_chid), + gk20a_readl(g, ccsr_channel_r(cde_ctx->ch->hw_chid)) | + ccsr_channel_enable_set_true_f()); + + /* store source buffer compression tags */ + 
gk20a_get_comptags(&g->dev->dev, src, &comptags); + cde_ctx->src_vaddr = src_vaddr; + cde_ctx->src_param_offset = comptags.offset; + cde_ctx->src_param_lines = comptags.lines; + + /* store information about destination */ + cde_ctx->dest_vaddr = dst_vaddr + dst_byte_offset; + cde_ctx->dest_size = dst_size; + + /* remove existing argument data */ + memset(cde_ctx->user_param_values, 0, + sizeof(cde_ctx->user_param_values)); + + /* read user space arguments for the conversion */ + for (i = 0; i < num_params; i++) { + struct gk20a_cde_param *param = params + i; + int id = param->id - NUM_RESERVED_PARAMS; + + if (id < 0 || id >= MAX_CDE_USER_PARAMS) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown user parameter"); + err = -EINVAL; + goto exit_unlock; + } + cde_ctx->user_param_values[id] = param->value; + } + + /* patch data */ + err = gk20a_cde_patch_params(cde_ctx); + if (err) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: failed to patch parameters"); + goto exit_unlock; + } + + gk20a_dbg(gpu_dbg_cde, "cde: buffer=cbc, size=%zu, gpuva=%llx\n", + g->gr.compbit_store.size, cde_ctx->backing_store_vaddr); + gk20a_dbg(gpu_dbg_cde, "cde: buffer=dst, size=%llu, gpuva=%llx\n", + cde_ctx->dest_size, cde_ctx->dest_vaddr); + gk20a_cde_dump(cde_ctx); + + /* execute the init push buffer */ + err = gk20a_cde_execute_buffer(cde_ctx, TYPE_BUF_COMMAND_INIT, + NULL, 0, NULL); + if (err) + goto exit_unlock; + + /* take always the postfence as it is needed for protecting the + * cde context */ + flags = __flags | NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_GET; + + /* execute the conversion buffer */ + err = gk20a_cde_execute_buffer(cde_ctx, TYPE_BUF_COMMAND_CONVERT, + fence, flags, fence_out); + +exit_unlock: + + /* unmap the buffers - channel holds references to them now */ + if (dst_vaddr) + gk20a_vm_unmap(g->cde_app.vm, dst_vaddr); + if (src_vaddr) + gk20a_vm_unmap(g->cde_app.vm, src_vaddr); + + /* drop dmabuf refs if work was aborted */ + if (err && src) + dma_buf_put(src); + if (err && dst) + dma_buf_put(dst); + + mutex_unlock(&cde_app->mutex); + + return err; +} + +int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx) +{ + struct gk20a *g = cde_ctx->g; + const struct firmware *img; + struct channel_gk20a *ch; + struct gr_gk20a *gr = &g->gr; + int err = 0; + u64 vaddr; + + img = gk20a_request_firmware(g, "gpu2cde.bin"); + if (!img) { + dev_err(&cde_ctx->pdev->dev, "cde: could not fetch the firmware"); + return -ENOSYS; + } + + ch = gk20a_open_new_channel(g); + if (!ch) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: gk20a channel not available"); + err = -ENOMEM; + goto err_get_gk20a_channel; + } + + /* bind the channel to the vm */ + gk20a_vm_get(&g->mm.pmu.vm); + ch->vm = &g->mm.pmu.vm; + err = channel_gk20a_commit_va(ch); + if (err) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: could not bind vm"); + goto err_commit_va; + } + + /* allocate gpfifo (1024 should be more than enough) */ + err = gk20a_alloc_channel_gpfifo(ch, + &(struct nvhost_alloc_gpfifo_args){1024, 0}); + if (err) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: unable to allocate gpfifo"); + goto err_alloc_gpfifo; + } + + /* map backing store to gpu virtual space */ + vaddr = gk20a_gmmu_map(ch->vm, &gr->compbit_store.sgt, + g->gr.compbit_store.size, 0, + gk20a_mem_flag_none); + + if (!vaddr) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: cannot map compression bit backing store"); + goto err_map_backingstore; + } + + /* store initialisation data */ + cde_ctx->ch = ch; + cde_ctx->vm = ch->vm; + cde_ctx->backing_store_vaddr = vaddr; + + /* initialise the firmware */ + err = 
gk20a_init_cde_img(cde_ctx, img); + if (err) { + gk20a_warn(&cde_ctx->pdev->dev, "cde: image initialisation failed"); + goto err_init_cde_img; + } + + /* initialisation done */ + release_firmware(img); + + return 0; + +err_init_cde_img: + gk20a_gmmu_unmap(ch->vm, vaddr, g->gr.compbit_store.size, 1); +err_map_backingstore: +err_alloc_gpfifo: + gk20a_vm_put(ch->vm); +err_commit_va: +err_get_gk20a_channel: + release_firmware(img); + dev_err(&cde_ctx->pdev->dev, "cde: couldn't initialise buffer converter: %d", + err); + return err; +} + +int gk20a_cde_reload(struct gk20a *g) +{ + struct gk20a_cde_app *cde_app = &g->cde_app; + struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx; + int err, i; + + if (!cde_app->initialised) { + gk20a_busy(g->dev); + gk20a_init_cde_support(g); + gk20a_idle(g->dev); + if (!cde_app->initialised) + return -ENOSYS; + return 0; + } + + gk20a_busy(g->dev); + mutex_lock(&cde_app->mutex); + for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) { + gk20a_cde_remove(cde_ctx); + err = gk20a_cde_load(cde_ctx); + } + + cde_app->cde_ctx_ptr = 0; + mutex_unlock(&cde_app->mutex); + + gk20a_idle(g->dev); + return err; +} + +int gk20a_init_cde_support(struct gk20a *g) +{ + struct gk20a_cde_app *cde_app = &g->cde_app; + struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx; + int ret, i; + + if (cde_app->initialised) + return 0; + + mutex_init(&cde_app->mutex); + mutex_lock(&cde_app->mutex); + + for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) { + cde_ctx->g = g; + cde_ctx->pdev = g->dev; + ret = gk20a_cde_load(cde_ctx); + if (ret) + goto err_init_instance; + } + + /* take shadow to the vm for general usage */ + cde_app->vm = cde_app->cde_ctx->vm; + + cde_app->cde_ctx_ptr = 0; + cde_app->initialised = true; + mutex_unlock(&cde_app->mutex); + + return 0; + +err_init_instance: + + /* deinitialise initialised channels */ + while (i--) { + gk20a_cde_remove(cde_ctx); + cde_ctx--; + } + return ret; +} diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h new file mode 100644 index 00000000..784ae8b4 --- /dev/null +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h @@ -0,0 +1,254 @@ +/* + * GK20A color decompression engine support + * + * Copyright (c) 2014, NVIDIA Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef _CDE_GK20A_H_ +#define _CDE_GK20A_H_ + +#include "mm_gk20a.h" + +#define MAX_CDE_BUFS 10 +#define MAX_CDE_PARAMS 64 +#define MAX_CDE_USER_PARAMS 32 +#define MAX_CDE_OBJ_IDS 4 + +struct dma_buf; +struct gk20a; + +/* + * this element defines a buffer that is allocated and mapped into gpu address + * space. data_byte_offset defines the beginning of the buffer inside the + * firmare. num_bytes defines how many bytes the firmware contains. + * + * If data_byte_offset is zero, we allocate an empty buffer. + */ + +struct gk20a_cde_hdr_buf { + u64 data_byte_offset; + u64 num_bytes; +}; + +/* + * this element defines a constant patching in buffers. 
It basically + * computes physical address to +source_byte_offset. The + * address is then modified into patch value as per: + * value = (current_value & ~mask) | (address << shift) & mask . + * + * The type field defines the register size as: + * 0=u32, + * 1=u64 (little endian), + * 2=u64 (big endian) + */ + +struct gk20a_cde_hdr_replace { + u32 target_buf; + u32 source_buf; + s32 shift; + u32 type; + s64 target_byte_offset; + s64 source_byte_offset; + u64 mask; +}; + +enum { + TYPE_PARAM_TYPE_U32 = 0, + TYPE_PARAM_TYPE_U64_LITTLE, + TYPE_PARAM_TYPE_U64_BIG +}; + +/* + * this element defines a runtime patching in buffers. Parameters with id from + * 0 to 1024 are reserved for special usage as follows: + * 0 = comptags_per_cacheline, + * 1 = slices_per_fbp, + * 2 = num_fbps + * 3 = source buffer first page offset + * 4 = source buffer block height log2 + * 5 = backing store memory address + * 6 = destination memory address + * 7 = destination size (bytes) + * 8 = backing store size (bytes) + * 9 = cache line size + * + * Parameters above id 1024 are user-specified. I.e. they determine where a + * parameters from user space should be placed in buffers, what is their + * type, etc. + * + * Once the value is available, we add data_offset to the value. + * + * The value address is then modified into patch value as per: + * value = (current_value & ~mask) | (address << shift) & mask . + * + * The type field defines the register size as: + * 0=u32, + * 1=u64 (little endian), + * 2=u64 (big endian) + */ + +struct gk20a_cde_hdr_param { + u32 id; + u32 target_buf; + s32 shift; + u32 type; + s64 data_offset; + s64 target_byte_offset; + u64 mask; +}; + +enum { + TYPE_PARAM_COMPTAGS_PER_CACHELINE = 0, + TYPE_PARAM_GPU_CONFIGURATION, + TYPE_PARAM_FIRSTPAGEOFFSET, + TYPE_PARAM_NUMPAGES, + TYPE_PARAM_BACKINGSTORE, + TYPE_PARAM_DESTINATION, + TYPE_PARAM_DESTINATION_SIZE, + TYPE_PARAM_BACKINGSTORE_SIZE, + TYPE_PARAM_SOURCE_SMMU_ADDR, + NUM_RESERVED_PARAMS = 1024, +}; + +/* + * This header element defines a command. The op field determines whether the + * element is defining an init (0) or convert command (1). data_byte_offset + * denotes the beginning address of command elements in the file. + */ + +struct gk20a_cde_hdr_command { + u32 op; + u32 num_entries; + u64 data_byte_offset; +}; + +enum { + TYPE_BUF_COMMAND_INIT = 0, + TYPE_BUF_COMMAND_CONVERT +}; + +/* + * This is a command element defines one entry inside push buffer. target_buf + * defines the buffer including the pushbuffer entries, target_byte_offset the + * offset inside the buffer and num_bytes the number of words in the buffer. + */ + +struct gk20a_cde_cmd_elem { + u32 target_buf; + u32 padding; + u64 target_byte_offset; + u64 num_bytes; +}; + +/* + * Following defines a single header element. Each element has a type and + * some of the data structures. 
+ */ + +struct gk20a_cde_hdr_elem { + u32 type; + u32 padding; + union { + struct gk20a_cde_hdr_buf buf; + struct gk20a_cde_hdr_replace replace; + struct gk20a_cde_hdr_param param; + u32 required_class; + struct gk20a_cde_hdr_command command; + }; +}; + +enum { + TYPE_BUF = 0, + TYPE_REPLACE, + TYPE_PARAM, + TYPE_REQUIRED_CLASS, + TYPE_COMMAND +}; + +struct gk20a_cde_mem_desc { + struct sg_table *sgt; + dma_addr_t iova; + void *cpuva; + size_t num_bytes; + u64 gpu_va; +}; + +struct gk20a_cde_param { + u32 id; + u32 padding; + u64 value; +}; + +struct gk20a_cde_ctx { + struct gk20a *g; + struct platform_device *pdev; + + /* channel related data */ + struct channel_gk20a *ch; + struct vm_gk20a *vm; + + /* buf converter configuration */ + struct gk20a_cde_mem_desc mem[MAX_CDE_BUFS]; + int num_bufs; + + /* buffer patching params (where should patching be done) */ + struct gk20a_cde_hdr_param params[MAX_CDE_PARAMS]; + int num_params; + + /* storage for user space parameter values */ + u32 user_param_values[MAX_CDE_USER_PARAMS]; + + u64 src_smmu_addr; + u32 src_param_offset; + u32 src_param_lines; + + u64 src_vaddr; + + u64 dest_vaddr; + u64 dest_size; + + u32 obj_ids[MAX_CDE_OBJ_IDS]; + int num_obj_ids; + + u64 backing_store_vaddr; + + struct nvhost_gpfifo *init_cmd; + int init_cmd_num_entries; + + struct nvhost_gpfifo *convert_cmd; + int convert_cmd_num_entries; + + struct kobj_attribute attr; +}; + +struct gk20a_cde_app { + bool initialised; + struct mutex mutex; + struct vm_gk20a *vm; + + struct gk20a_cde_ctx cde_ctx[1]; + int cde_ctx_ptr; +}; + +int gk20a_cde_destroy(struct gk20a *g); +int gk20a_init_cde_support(struct gk20a *g); +int gk20a_cde_reload(struct gk20a *g); +int gk20a_cde_convert(struct gk20a *g, u32 src_fd, u32 dst_fd, + s32 dst_kind, u64 dst_word_offset, + u32 dst_size, struct nvhost_fence *fence, + u32 __flags, struct gk20a_cde_param *params, + int num_params, struct gk20a_fence **fence_out); + +#endif diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index fa6e0cce..2975798f 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c @@ -976,6 +976,7 @@ static int gk20a_pm_finalize_poweron(struct device *dev) goto done; } + gk20a_channel_resume(g); set_user_nice(current, nice_value); @@ -983,6 +984,8 @@ static int gk20a_pm_finalize_poweron(struct device *dev) trace_gk20a_finalize_poweron_done(dev_name(dev)); + if (IS_ENABLED(CONFIG_GK20A_CDE)) + gk20a_init_cde_support(g); done: return err; } diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 05ed9270..b2ecade5 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -48,6 +48,7 @@ struct acr_gm20b; #include "therm_gk20a.h" #include "platform_gk20a.h" #include "gm20b/acr_gm20b.h" +#include "cde_gk20a.h" extern struct platform_device tegra_gk20a_device; @@ -356,6 +357,8 @@ struct gk20a { struct gk20a_scale_profile *scale_profile; struct device_dma_parameters dma_parms; + + struct gk20a_cde_app cde_app; }; static inline unsigned long gk20a_get_gr_idle_timeout(struct gk20a *g) @@ -422,6 +425,7 @@ enum gk20a_dbg_categories { gpu_dbg_clk = BIT(7), /* gk20a clk */ gpu_dbg_map = BIT(8), /* mem mappings */ gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */ + gpu_dbg_cde = BIT(10), /* cde info messages */ gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */ }; -- cgit v1.2.2
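
For reference, the firmware image ("gpu2cde.bin") consumed by gk20a_init_cde_img() above begins with two u32 words (a version and an element count), followed by an array of gk20a_cde_hdr_elem records and whatever raw payload those records point at. The sketch below builds a minimal such image from user space. It is illustrative only, not part of the patch: it assumes the on-disk layout matches the C struct declarations (natural alignment, little-endian), the local hdr_*/cmd_* struct names and the output file name are placeholders, and a real gpu2cde.bin additionally carries the precompiled shader buffers, TYPE_REPLACE/TYPE_PARAM patch records and TYPE_REQUIRED_CLASS elements produced by NVIDIA's tooling.

/*
 * Illustrative sketch only: emit a minimal CDE firmware image in the
 * layout parsed by gk20a_init_cde_img().  Struct definitions mirror
 * cde_gk20a.h; padding and endianness are assumed to match the target.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum { TYPE_BUF = 0, TYPE_COMMAND = 4 };               /* header element types */
enum { TYPE_BUF_COMMAND_INIT = 0, TYPE_BUF_COMMAND_CONVERT = 1 };

struct hdr_buf     { uint64_t data_byte_offset; uint64_t num_bytes; };
struct hdr_command { uint32_t op; uint32_t num_entries; uint64_t data_byte_offset; };

/* mirrors struct gk20a_cde_hdr_elem: 8-byte tag plus 40-byte union */
struct hdr_elem {
	uint32_t type;
	uint32_t padding;
	union {
		struct hdr_buf buf;
		struct hdr_command command;
		uint8_t pad[40];	/* size of the largest union member in the kernel header */
	} u;
};

/* mirrors struct gk20a_cde_cmd_elem */
struct cmd_elem {
	uint32_t target_buf;
	uint32_t padding;
	uint64_t target_byte_offset;
	uint64_t num_bytes;
};

int main(void)
{
	uint32_t header[2] = { 1, 3 };		/* version, num_of_elems */
	struct hdr_elem elems[3];
	struct cmd_elem cmds[2];		/* [0] = init, [1] = convert */
	uint32_t pushbuf[8] = { 0 };		/* placeholder GPU methods */
	uint64_t elems_end = sizeof(header) + sizeof(elems);
	FILE *f = fopen("gpu2cde-example.bin", "wb");

	if (!f)
		return 1;
	memset(elems, 0, sizeof(elems));
	memset(cmds, 0, sizeof(cmds));

	/* element 0: one GPU buffer whose initial contents are the push-buffer words */
	elems[0].type = TYPE_BUF;
	elems[0].u.buf.data_byte_offset = elems_end + sizeof(cmds);
	elems[0].u.buf.num_bytes = sizeof(pushbuf);

	/* init command covers the first half of the buffer, convert the second */
	cmds[0].num_bytes = sizeof(pushbuf) / 2;
	cmds[1].target_byte_offset = sizeof(pushbuf) / 2;
	cmds[1].num_bytes = sizeof(pushbuf) / 2;

	elems[1].type = TYPE_COMMAND;
	elems[1].u.command.op = TYPE_BUF_COMMAND_INIT;
	elems[1].u.command.num_entries = 1;
	elems[1].u.command.data_byte_offset = elems_end;

	elems[2].type = TYPE_COMMAND;
	elems[2].u.command.op = TYPE_BUF_COMMAND_CONVERT;
	elems[2].u.command.num_entries = 1;
	elems[2].u.command.data_byte_offset = elems_end + sizeof(struct cmd_elem);

	fwrite(header, sizeof(header), 1, f);
	fwrite(elems, sizeof(elems), 1, f);
	fwrite(cmds, sizeof(cmds), 1, f);
	fwrite(pushbuf, sizeof(pushbuf), 1, f);
	return fclose(f) ? 1 : 0;
}

When the driver loads such an image it copies each TYPE_BUF payload into a dma-coherent buffer mapped into the channel's GPU address space, patches those buffers according to the TYPE_REPLACE and TYPE_PARAM records, and turns the TYPE_COMMAND elements into the init and convert gpfifo entries that gk20a_cde_execute_buffer() later submits.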