author    Alex Waterman <alexw@nvidia.com>         2015-03-18 16:33:09 -0400
committer Terje Bergstrom <tbergstrom@nvidia.com>  2015-05-11 11:53:25 -0400
commit    a2e852364582e9c337f52bc53ccc33877c8f3b47 (patch)
tree      fb13c5ad80db8eb2424a753a92389c7a3a322a12 /drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
parent    0566aee853eb32f4f796499b6b00ddf0f1d7de34 (diff)
gpu: nvgpu: New allocator for VA space
Implement a new buddy allocation scheme for the GPU's VA space. The
bitmap allocator was using too much memory and is not a scalable
solution as the GPU's address space keeps getting bigger. The buddy
allocation scheme is much more memory efficient when the majority of
the address space is not allocated.

The buddy allocator is also not constrained by the notion of a split
address space. The bitmap allocator could manage either small pages or
large pages, but not both at the same time: the bottom of the address
space was reserved for small pages and the top for large pages.
Although that split is not removed quite yet, the new allocator makes
removing it possible.

The buddy allocator is also very scalable. It can manage everything
from the relatively small comptag space up to the enormous GPU VA
space, and everything in between. This is important since the GPU has
many differently sized spaces that need managing.

Currently there are certain limitations. For one, the allocator does
not handle fixed allocations from CUDA very well. It can do so, but
with certain caveats: the PTE page size is always set to small, which
means the buddy allocator may place other small-page allocations in
the buddies around the fixed allocation. It does this to avoid mixing
large- and small-page allocations in the same PDE.

Change-Id: I501cd15af03611536490137331d43761c402c7f9
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/740694
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
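As background for the scheme the message above describes, here is a
minimal user-space sketch of the buddy idea in C: the managed range is
covered by a lazily built binary tree, so metadata exists only for
regions that have actually been split or handed out, which is what
makes the approach cheap when most of the address space is free. This
is an illustrative sketch only, not the driver's allocator; struct
buddy, buddy_alloc, and buddy_free are hypothetical names.

/*
 * Illustrative buddy allocator sketch (hypothetical names, user-space C).
 * Each node covers a power-of-two range and is split in half on demand,
 * so only regions that are actually in use carry any metadata.
 */
#include <stdint.h>
#include <stdlib.h>

struct buddy {
	uint64_t start;		/* base address of this buddy */
	uint64_t size;		/* power-of-two length of this buddy */
	int allocated;		/* leaf currently handed out to a caller */
	struct buddy *left;	/* lower half; NULL until split */
	struct buddy *right;	/* upper half; NULL until split */
};

/* Allocate 'len' bytes from 'b'; returns a base address or -1 on failure. */
static uint64_t buddy_alloc(struct buddy *b, uint64_t len)
{
	uint64_t addr;

	if (b->allocated || len == 0 || len > b->size)
		return (uint64_t)-1;

	/* An unsplit buddy of exactly the right order: hand it out whole. */
	if (!b->left && len > b->size / 2) {
		b->allocated = 1;
		return b->start;
	}

	/* Otherwise split lazily and try both halves. */
	if (!b->left) {
		struct buddy *lo = calloc(1, sizeof(*lo));
		struct buddy *hi = calloc(1, sizeof(*hi));

		if (!lo || !hi) {
			free(lo);
			free(hi);
			return (uint64_t)-1;
		}
		lo->start = b->start;
		lo->size = b->size / 2;
		hi->start = b->start + b->size / 2;
		hi->size = b->size / 2;
		b->left = lo;
		b->right = hi;
	}

	addr = buddy_alloc(b->left, len);
	if (addr == (uint64_t)-1)
		addr = buddy_alloc(b->right, len);
	return addr;
}

/* Free the allocation at 'addr'; merge empty buddy pairs on the way up. */
static void buddy_free(struct buddy *b, uint64_t addr)
{
	if (!b->left) {
		b->allocated = 0;
		return;
	}
	buddy_free(addr < b->right->start ? b->left : b->right, addr);

	/* If both halves are now idle leaves, merge them back together. */
	if (!b->left->left && !b->left->allocated &&
	    !b->right->left && !b->right->allocated) {
		free(b->left);
		free(b->right);
		b->left = b->right = NULL;
	}
}

int main(void)
{
	/* A 1 MB space: two 4 KB allocations land in adjacent buddies. */
	struct buddy va = { .start = 0x1000, .size = 1 << 20 };
	uint64_t a = buddy_alloc(&va, 4096);
	uint64_t b = buddy_alloc(&va, 4096);

	buddy_free(&va, b);
	buddy_free(&va, a);
	return (a == (uint64_t)-1 || b == (uint64_t)-1);
}

Note how buddy_free() takes only an address: the owning buddy records
the allocation's size, which is why the freeing calls in the diff below
no longer pass a size or alignment.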
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/pmu_gk20a.c')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pmu_gk20a.c | 68
1 file changed, 29 insertions(+), 39 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
index 275fbd4e..fc8d130c 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -2438,7 +2438,6 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu)
 	struct pmu_payload payload;
 	u32 seq;
 	u32 data;
-	int err = 0;
 
 	gk20a_dbg_fn("");
 
@@ -2489,12 +2488,11 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu)
 	gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data);
 
 	if (!pmu->sample_buffer)
-		err = pmu->dmem.alloc(&pmu->dmem,
-			&pmu->sample_buffer, 2 * sizeof(u16),
-			PMU_DMEM_ALLOC_ALIGNMENT);
-	if (err) {
+		pmu->sample_buffer = gk20a_balloc(&pmu->dmem,
+						  2 * sizeof(u16));
+	if (!pmu->sample_buffer) {
 		gk20a_err(dev_from_gk20a(g),
 			"failed to allocate perfmon sample buffer");
 		return -ENOMEM;
 	}
 
@@ -2592,15 +2590,17 @@ static int pmu_process_init_msg(struct pmu_gk20a *pmu,
 	for (i = 0; i < PMU_QUEUE_COUNT; i++)
 		pmu_queue_init(pmu, i, init);
 
-	if (!pmu->dmem.alloc) {
-		/*Align start and end addresses*/
+	if (!pmu->dmem.init) {
+		/* Align start and end addresses */
 		u32 start = ALIGN(pv->get_pmu_init_msg_pmu_sw_mg_off(init),
 			PMU_DMEM_ALLOC_ALIGNMENT);
 		u32 end = (pv->get_pmu_init_msg_pmu_sw_mg_off(init) +
 			pv->get_pmu_init_msg_pmu_sw_mg_size(init)) &
 			~(PMU_DMEM_ALLOC_ALIGNMENT - 1);
 		u32 size = end - start;
-		gk20a_allocator_init(&pmu->dmem, "gk20a_pmu_dmem", start, size);
+		__gk20a_allocator_init(&pmu->dmem, NULL, "gk20a_pmu_dmem",
+				start, size,
+				PMU_DMEM_ALLOC_ALIGNMENT, 4, 0);
 	}
 
 	pmu->pmu_ready = true;
@@ -2737,20 +2737,14 @@ static int pmu_response_handle(struct pmu_gk20a *pmu,
 	seq->callback = NULL;
 	if (pv->pmu_allocation_get_dmem_size(pmu,
 			pv->get_pmu_seq_in_a_ptr(seq)) != 0)
-		pmu->dmem.free(&pmu->dmem,
+		gk20a_bfree(&pmu->dmem,
 			pv->pmu_allocation_get_dmem_offset(pmu,
-				pv->get_pmu_seq_in_a_ptr(seq)),
-			pv->pmu_allocation_get_dmem_size(pmu,
-				pv->get_pmu_seq_in_a_ptr(seq)),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+				pv->get_pmu_seq_in_a_ptr(seq)));
 	if (pv->pmu_allocation_get_dmem_size(pmu,
 			pv->get_pmu_seq_out_a_ptr(seq)) != 0)
-		pmu->dmem.free(&pmu->dmem,
+		gk20a_bfree(&pmu->dmem,
 			pv->pmu_allocation_get_dmem_offset(pmu,
-				pv->get_pmu_seq_out_a_ptr(seq)),
-			pv->pmu_allocation_get_dmem_size(pmu,
-				pv->get_pmu_seq_out_a_ptr(seq)),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+				pv->get_pmu_seq_out_a_ptr(seq)));
 
 	if (seq->callback)
 		seq->callback(g, msg, seq->cb_params, seq->desc, ret);
@@ -3387,11 +3381,10 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
 		pv->pmu_allocation_set_dmem_size(pmu, in,
 		(u16)max(payload->in.size, payload->out.size));
 
-		err = pmu->dmem.alloc(&pmu->dmem,
-			pv->pmu_allocation_get_dmem_offset_addr(pmu, in),
-			pv->pmu_allocation_get_dmem_size(pmu, in),
-			PMU_DMEM_ALLOC_ALIGNMENT);
-		if (err)
+		*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)) =
+			gk20a_balloc(&pmu->dmem,
+				pv->pmu_allocation_get_dmem_size(pmu, in));
+		if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)))
 			goto clean_up;
 
 		pmu_copy_to_dmem(pmu, (pv->pmu_allocation_get_dmem_offset(pmu,
@@ -3412,11 +3405,12 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
 			(u16)payload->out.size);
 
 		if (payload->out.buf != payload->in.buf) {
-			err = pmu->dmem.alloc(&pmu->dmem,
-				pv->pmu_allocation_get_dmem_offset_addr(pmu, out),
-				pv->pmu_allocation_get_dmem_size(pmu, out),
-				PMU_DMEM_ALLOC_ALIGNMENT);
-			if (err)
+
+			*(pv->pmu_allocation_get_dmem_offset_addr(pmu, out)) =
+				gk20a_balloc(&pmu->dmem,
+					pv->pmu_allocation_get_dmem_size(pmu, out));
+			if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu,
+					out)))
 				goto clean_up;
 		} else {
 			BUG_ON(in == NULL);
@@ -3444,15 +3438,11 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
 clean_up:
 	gk20a_dbg_fn("fail");
 	if (in)
-		pmu->dmem.free(&pmu->dmem,
-			pv->pmu_allocation_get_dmem_offset(pmu, in),
-			pv->pmu_allocation_get_dmem_size(pmu, in),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+		gk20a_bfree(&pmu->dmem,
+			pv->pmu_allocation_get_dmem_offset(pmu, in));
 	if (out)
-		pmu->dmem.free(&pmu->dmem,
-			pv->pmu_allocation_get_dmem_offset(pmu, out),
-			pv->pmu_allocation_get_dmem_size(pmu, out),
-			PMU_DMEM_ALLOC_ALIGNMENT);
+		gk20a_bfree(&pmu->dmem,
+			pv->pmu_allocation_get_dmem_offset(pmu, out));
 
 	pmu_seq_release(pmu, seq);
 	return err;
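Taken together, the hunks above also shrink the allocator interface:
where pmu->dmem.free() needed the offset, the size, and
PMU_DMEM_ALLOC_ALIGNMENT, gk20a_bfree() takes only the base address,
since the buddy allocator can recover an allocation's size from the
buddy that owns it. Likewise gk20a_balloc() returns the allocated base
directly, with 0 signalling failure (as the !pmu->sample_buffer check
shows), instead of filling in an out-parameter and returning an error
code. A hypothetical caller following the new pattern:

	/* Hypothetical caller, mirroring the sample-buffer hunk above. */
	u32 off = gk20a_balloc(&pmu->dmem, 2 * sizeof(u16));
	if (!off)
		return -ENOMEM;	/* DMEM exhausted */
	/* ... use the DMEM region at 'off' ... */
	gk20a_bfree(&pmu->dmem, off);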