/*
 * Nvgpu Semaphores
 *
 * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include
#include
#include
#include
#include
#include
#include
#include

#include "gk20a/mm_gk20a.h"

#define pool_to_gk20a(p) ((p)->sema_sea->gk20a)

#define __lock_sema_sea(s) \
        do { \
                gpu_sema_verbose_dbg(s->gk20a, "Acquiring sema lock..."); \
                nvgpu_mutex_acquire(&s->sea_lock); \
                gpu_sema_verbose_dbg(s->gk20a, "Sema lock acquired!"); \
        } while (0)

#define __unlock_sema_sea(s) \
        do { \
                nvgpu_mutex_release(&s->sea_lock); \
                gpu_sema_verbose_dbg(s->gk20a, "Released sema lock"); \
        } while (0)

/*
 * Return the sema_sea pointer.
 */
struct nvgpu_semaphore_sea *nvgpu_semaphore_get_sea(struct gk20a *g)
{
        return g->sema_sea;
}

static int __nvgpu_semaphore_sea_grow(struct nvgpu_semaphore_sea *sea)
{
        int ret = 0;
        struct gk20a *gk20a = sea->gk20a;
        u32 i;

        __lock_sema_sea(sea);

        ret = nvgpu_dma_alloc_sys(gk20a, PAGE_SIZE * SEMAPHORE_POOL_COUNT,
                                  &sea->sea_mem);
        if (ret) {
                goto out;
        }

        sea->size = SEMAPHORE_POOL_COUNT;
        sea->map_size = SEMAPHORE_POOL_COUNT * PAGE_SIZE;

        /*
         * Start the semaphores at values that will soon overflow the 32-bit
         * integer range. This way any buggy comparisons would start to fail
         * sooner rather than later.
         */
        for (i = 0U; i < PAGE_SIZE * SEMAPHORE_POOL_COUNT; i += 4U) {
                nvgpu_mem_wr(gk20a, &sea->sea_mem, i, 0xfffffff0);
        }

out:
        __unlock_sema_sea(sea);
        return ret;
}

void nvgpu_semaphore_sea_destroy(struct gk20a *g)
{
        if (g->sema_sea == NULL) {
                return;
        }

        nvgpu_dma_free(g, &g->sema_sea->sea_mem);
        nvgpu_mutex_destroy(&g->sema_sea->sea_lock);
        nvgpu_kfree(g, g->sema_sea);
        g->sema_sea = NULL;
}

/*
 * Create the semaphore sea. Only create it once - subsequent calls to this
 * will return the originally created sea pointer.
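 *
 * The sea itself is a single sysmem allocation of SEMAPHORE_POOL_COUNT pages
 * (see __nvgpu_semaphore_sea_grow() above); each pool allocated from it later
 * maps exactly one of those pages.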
 */
struct nvgpu_semaphore_sea *nvgpu_semaphore_sea_create(struct gk20a *g)
{
        if (g->sema_sea) {
                return g->sema_sea;
        }

        g->sema_sea = nvgpu_kzalloc(g, sizeof(*g->sema_sea));
        if (g->sema_sea == NULL) {
                return NULL;
        }

        g->sema_sea->size = 0;
        g->sema_sea->page_count = 0;
        g->sema_sea->gk20a = g;
        nvgpu_init_list_node(&g->sema_sea->pool_list);
        if (nvgpu_mutex_init(&g->sema_sea->sea_lock)) {
                goto cleanup_free;
        }

        if (__nvgpu_semaphore_sea_grow(g->sema_sea)) {
                goto cleanup_destroy;
        }

        gpu_sema_dbg(g, "Created semaphore sea!");
        return g->sema_sea;

cleanup_destroy:
        nvgpu_mutex_destroy(&g->sema_sea->sea_lock);
cleanup_free:
        nvgpu_kfree(g, g->sema_sea);
        g->sema_sea = NULL;
        gpu_sema_dbg(g, "Failed to create semaphore sea!");
        return NULL;
}

static int __semaphore_bitmap_alloc(unsigned long *bitmap, unsigned long len)
{
        unsigned long idx = find_first_zero_bit(bitmap, len);

        if (idx == len) {
                return -ENOSPC;
        }

        set_bit(idx, bitmap);

        return (int)idx;
}

/*
 * Allocate a pool from the sea.
 */
int nvgpu_semaphore_pool_alloc(struct nvgpu_semaphore_sea *sea,
                               struct nvgpu_semaphore_pool **pool)
{
        struct nvgpu_semaphore_pool *p;
        unsigned long page_idx;
        int ret;

        p = nvgpu_kzalloc(sea->gk20a, sizeof(*p));
        if (p == NULL) {
                return -ENOMEM;
        }

        __lock_sema_sea(sea);

        ret = nvgpu_mutex_init(&p->pool_lock);
        if (ret) {
                goto fail;
        }

        ret = __semaphore_bitmap_alloc(sea->pools_alloced,
                                       SEMAPHORE_POOL_COUNT);
        if (ret < 0) {
                goto fail_alloc;
        }

        page_idx = (unsigned long)ret;

        p->page_idx = page_idx;
        p->sema_sea = sea;
        nvgpu_init_list_node(&p->pool_list_entry);
        nvgpu_ref_init(&p->ref);

        sea->page_count++;
        nvgpu_list_add(&p->pool_list_entry, &sea->pool_list);
        __unlock_sema_sea(sea);

        gpu_sema_dbg(sea->gk20a,
                     "Allocated semaphore pool: page-idx=%llu", p->page_idx);

        *pool = p;

        return 0;

fail_alloc:
        nvgpu_mutex_destroy(&p->pool_lock);
fail:
        __unlock_sema_sea(sea);
        nvgpu_kfree(sea->gk20a, p);
        gpu_sema_dbg(sea->gk20a, "Failed to allocate semaphore pool!");
        return ret;
}

/*
 * Map a pool into the passed vm's address space. This handles both the fixed
 * global RO mapping and the non-fixed private RW mapping.
 */
int nvgpu_semaphore_pool_map(struct nvgpu_semaphore_pool *p,
                             struct vm_gk20a *vm)
{
        int err = 0;
        u64 addr;

        if (p->mapped) {
                return -EBUSY;
        }

        gpu_sema_dbg(pool_to_gk20a(p),
                     "Mapping semaphore pool! (idx=%llu)", p->page_idx);

        /*
         * Take the sea lock so that we don't race with a possible change to
         * the nvgpu_mem in the sema sea.
         */
        __lock_sema_sea(p->sema_sea);

        addr = nvgpu_gmmu_map_fixed(vm, &p->sema_sea->sea_mem,
                                    p->sema_sea->gpu_va,
                                    p->sema_sea->map_size,
                                    0, gk20a_mem_flag_read_only, 0,
                                    p->sema_sea->sea_mem.aperture);
        if (addr == 0ULL) {
                err = -ENOMEM;
                goto fail_unlock;
        }

        p->gpu_va_ro = addr;
        p->mapped = true;

        gpu_sema_dbg(pool_to_gk20a(p),
                     " %llu: GPU read-only VA = 0x%llx",
                     p->page_idx, p->gpu_va_ro);

        /*
         * Now the RW mapping. This is a bit more complicated. We make an
         * nvgpu_mem describing a page of the bigger RO space and then map
         * that. Unlike above this does not need to be a fixed address.
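         * The sub-nvgpu_mem covers exactly one page of the sea (page_idx),
         * so each pool ends up with its own private read-write window while
         * the whole sea stays visible read-only at the fixed address.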
         */
        err = nvgpu_mem_create_from_mem(vm->mm->g,
                                        &p->rw_mem, &p->sema_sea->sea_mem,
                                        p->page_idx, 1);
        if (err) {
                goto fail_unmap;
        }

        addr = nvgpu_gmmu_map(vm, &p->rw_mem, SZ_4K, 0,
                              gk20a_mem_flag_none, 0,
                              p->rw_mem.aperture);
        if (addr == 0ULL) {
                err = -ENOMEM;
                goto fail_free_submem;
        }

        p->gpu_va = addr;

        __unlock_sema_sea(p->sema_sea);

        gpu_sema_dbg(pool_to_gk20a(p),
                     " %llu: GPU read-write VA = 0x%llx",
                     p->page_idx, p->gpu_va);
        gpu_sema_dbg(pool_to_gk20a(p),
                     " %llu: CPU VA = 0x%p",
                     p->page_idx, p->rw_mem.cpu_va);

        return 0;

fail_free_submem:
        nvgpu_dma_free(pool_to_gk20a(p), &p->rw_mem);
fail_unmap:
        nvgpu_gmmu_unmap(vm, &p->sema_sea->sea_mem, p->gpu_va_ro);
        gpu_sema_dbg(pool_to_gk20a(p),
                     " %llu: Failed to map semaphore pool!", p->page_idx);
fail_unlock:
        __unlock_sema_sea(p->sema_sea);
        return err;
}

/*
 * Unmap a semaphore_pool.
 */
void nvgpu_semaphore_pool_unmap(struct nvgpu_semaphore_pool *p,
                                struct vm_gk20a *vm)
{
        __lock_sema_sea(p->sema_sea);

        nvgpu_gmmu_unmap(vm, &p->sema_sea->sea_mem, p->gpu_va_ro);
        nvgpu_gmmu_unmap(vm, &p->rw_mem, p->gpu_va);
        nvgpu_dma_free(pool_to_gk20a(p), &p->rw_mem);

        p->gpu_va = 0;
        p->gpu_va_ro = 0;
        p->mapped = false;

        __unlock_sema_sea(p->sema_sea);

        gpu_sema_dbg(pool_to_gk20a(p),
                     "Unmapped semaphore pool! (idx=%llu)", p->page_idx);
}

/*
 * Completely free a semaphore_pool. You should make sure this pool is not
 * mapped, otherwise there's going to be a memory leak.
 */
static void nvgpu_semaphore_pool_free(struct nvgpu_ref *ref)
{
        struct nvgpu_semaphore_pool *p =
                container_of(ref, struct nvgpu_semaphore_pool, ref);
        struct nvgpu_semaphore_sea *s = p->sema_sea;

        /* Freeing a mapped pool is a bad idea. */
        WARN_ON((p->mapped) ||
                (p->gpu_va != 0ULL) ||
                (p->gpu_va_ro != 0ULL));

        __lock_sema_sea(s);
        nvgpu_list_del(&p->pool_list_entry);
        clear_bit((int)p->page_idx, s->pools_alloced);
        s->page_count--;
        __unlock_sema_sea(s);

        nvgpu_mutex_destroy(&p->pool_lock);

        gpu_sema_dbg(pool_to_gk20a(p),
                     "Freed semaphore pool! (idx=%llu)", p->page_idx);
        nvgpu_kfree(p->sema_sea->gk20a, p);
}

void nvgpu_semaphore_pool_get(struct nvgpu_semaphore_pool *p)
{
        nvgpu_ref_get(&p->ref);
}

void nvgpu_semaphore_pool_put(struct nvgpu_semaphore_pool *p)
{
        nvgpu_ref_put(&p->ref, nvgpu_semaphore_pool_free);
}

/*
 * Get the address for a semaphore_pool - if global is true then return the
 * global RO address instead of the RW address owned by the semaphore's VM.
 */
u64 __nvgpu_semaphore_pool_gpu_va(struct nvgpu_semaphore_pool *p, bool global)
{
        if (!global) {
                return p->gpu_va;
        }

        return p->gpu_va_ro + (PAGE_SIZE * p->page_idx);
}

static int __nvgpu_init_hw_sema(struct channel_gk20a *ch)
{
        int hw_sema_idx;
        int ret = 0;
        struct nvgpu_semaphore_int *hw_sema;
        struct nvgpu_semaphore_pool *p = ch->vm->sema_pool;
        int current_value;

        BUG_ON(p == NULL);

        nvgpu_mutex_acquire(&p->pool_lock);

        /*
         * Find an available HW semaphore.
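         * Each pool page is carved into PAGE_SIZE / SEMAPHORE_SIZE hw
         * semaphore slots, tracked by the semas_alloced bitmap.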
         */
        hw_sema_idx = __semaphore_bitmap_alloc(p->semas_alloced,
                                               PAGE_SIZE / SEMAPHORE_SIZE);
        if (hw_sema_idx < 0) {
                ret = hw_sema_idx;
                goto fail;
        }

        hw_sema = nvgpu_kzalloc(ch->g, sizeof(struct nvgpu_semaphore_int));
        if (hw_sema == NULL) {
                ret = -ENOMEM;
                goto fail_free_idx;
        }

        ch->hw_sema = hw_sema;
        hw_sema->ch = ch;
        hw_sema->location.pool = p;
        hw_sema->location.offset = SEMAPHORE_SIZE * (u32)hw_sema_idx;
        current_value = nvgpu_mem_rd(ch->g, &p->rw_mem,
                                     hw_sema->location.offset);
        nvgpu_atomic_set(&hw_sema->next_value, current_value);

        nvgpu_mutex_release(&p->pool_lock);

        return 0;

fail_free_idx:
        clear_bit(hw_sema_idx, p->semas_alloced);
fail:
        nvgpu_mutex_release(&p->pool_lock);
        return ret;
}

/*
 * Free the hw semaphore index used by this channel.
 */
void nvgpu_semaphore_free_hw_sema(struct channel_gk20a *ch)
{
        struct nvgpu_semaphore_pool *p = ch->vm->sema_pool;
        struct nvgpu_semaphore_int *hw_sema = ch->hw_sema;
        int idx = hw_sema->location.offset / SEMAPHORE_SIZE;

        BUG_ON(p == NULL);

        nvgpu_mutex_acquire(&p->pool_lock);

        clear_bit(idx, p->semas_alloced);

        nvgpu_kfree(ch->g, hw_sema);
        ch->hw_sema = NULL;

        nvgpu_mutex_release(&p->pool_lock);
}

/*
 * Allocate a semaphore from the passed pool.
 *
 * Since semaphores are ref-counted there's no explicit free for external code
 * to use. When the ref-count hits 0 the internal free will happen.
 */
struct nvgpu_semaphore *nvgpu_semaphore_alloc(struct channel_gk20a *ch)
{
        struct nvgpu_semaphore *s;
        int ret;

        if (ch->hw_sema == NULL) {
                ret = __nvgpu_init_hw_sema(ch);
                if (ret) {
                        return NULL;
                }
        }

        s = nvgpu_kzalloc(ch->g, sizeof(*s));
        if (s == NULL) {
                return NULL;
        }

        nvgpu_ref_init(&s->ref);
        s->g = ch->g;
        s->location = ch->hw_sema->location;
        nvgpu_atomic_set(&s->value, 0);

        /*
         * Take a ref on the pool so that we can keep this pool alive for
         * as long as this semaphore is alive.
         */
        nvgpu_semaphore_pool_get(s->location.pool);

        gpu_sema_dbg(ch->g, "Allocated semaphore (c=%d)", ch->chid);

        return s;
}

static void nvgpu_semaphore_free(struct nvgpu_ref *ref)
{
        struct nvgpu_semaphore *s =
                container_of(ref, struct nvgpu_semaphore, ref);

        nvgpu_semaphore_pool_put(s->location.pool);

        nvgpu_kfree(s->g, s);
}

void nvgpu_semaphore_put(struct nvgpu_semaphore *s)
{
        nvgpu_ref_put(&s->ref, nvgpu_semaphore_free);
}

void nvgpu_semaphore_get(struct nvgpu_semaphore *s)
{
        nvgpu_ref_get(&s->ref);
}

/*
 * Return the address of a specific semaphore.
 *
 * Don't call this on a semaphore you don't own - the VA returned will make no
 * sense in your specific channel's VM.
 */
u64 nvgpu_semaphore_gpu_rw_va(struct nvgpu_semaphore *s)
{
        return __nvgpu_semaphore_pool_gpu_va(s->location.pool, false) +
                s->location.offset;
}

/*
 * Get the global RO address for the semaphore. Can be called on any semaphore
 * regardless of whether you own it.
 */
u64 nvgpu_semaphore_gpu_ro_va(struct nvgpu_semaphore *s)
{
        return __nvgpu_semaphore_pool_gpu_va(s->location.pool, true) +
                s->location.offset;
}

u64 nvgpu_hw_sema_addr(struct nvgpu_semaphore_int *hw_sema)
{
        return __nvgpu_semaphore_pool_gpu_va(hw_sema->location.pool, true) +
                hw_sema->location.offset;
}

u32 __nvgpu_semaphore_read(struct nvgpu_semaphore_int *hw_sema)
{
        return nvgpu_mem_rd(hw_sema->ch->g,
                            &hw_sema->location.pool->rw_mem,
                            hw_sema->location.offset);
}

/*
 * Read the underlying value from a semaphore.
 */
u32 nvgpu_semaphore_read(struct nvgpu_semaphore *s)
{
        return nvgpu_mem_rd(s->g, &s->location.pool->rw_mem,
                            s->location.offset);
}

/*
 * Check if "racer" is over "goal" with wraparound handling.
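 *
 * For example, if goal = 0xfffffff0 (the initial value written by the sea
 * grow above) and racer = 0x00000010, then racer - goal wraps to 0x20, which
 * is below 0x80000000, so the semaphore is considered released even though
 * the raw 32-bit values have wrapped.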
 */
static bool __nvgpu_semaphore_value_released(u32 goal, u32 racer)
{
        /*
         * Handle wraparound with the same heuristic as the hardware does:
         * although the integer will eventually wrap around, consider a sema
         * released against a threshold if its value has passed that threshold
         * but has not wrapped over half of the u32 range over that threshold;
         * such wrapping is unlikely to happen during a sema lifetime.
         *
         * Values for [goal, goal + 0x7fffffff] are considered signaled; that's
         * precisely half of the 32-bit space. If racer == goal + 0x80000000,
         * then it needs 0x80000000 increments to wrap again and signal.
         *
         * Unsigned arithmetic is used because it's well-defined. This is
         * effectively the same as: signed_racer - signed_goal > 0.
         */
        return racer - goal < 0x80000000;
}

u32 nvgpu_semaphore_get_value(struct nvgpu_semaphore *s)
{
        return (u32)nvgpu_atomic_read(&s->value);
}

bool nvgpu_semaphore_is_released(struct nvgpu_semaphore *s)
{
        u32 sema_val = nvgpu_semaphore_read(s);
        u32 wait_payload = nvgpu_semaphore_get_value(s);

        return __nvgpu_semaphore_value_released(wait_payload, sema_val);
}

bool nvgpu_semaphore_is_acquired(struct nvgpu_semaphore *s)
{
        return !nvgpu_semaphore_is_released(s);
}

/*
 * Fast-forward the hw sema to its tracked max value.
 *
 * Return true if the sema wasn't at the max value and needed updating, false
 * otherwise.
 */
bool nvgpu_semaphore_reset(struct nvgpu_semaphore_int *hw_sema)
{
        u32 threshold = (u32)nvgpu_atomic_read(&hw_sema->next_value);
        u32 current_val = __nvgpu_semaphore_read(hw_sema);

        /*
         * If the semaphore has already reached the value we would write then
         * this is really just a NO-OP. However, the sema value shouldn't be
         * more than what we expect to be the max.
         */
        if (WARN_ON(__nvgpu_semaphore_value_released(threshold + 1U,
                                                     current_val)))
                return false;

        if (current_val == threshold)
                return false;

        nvgpu_mem_wr(hw_sema->ch->g, &hw_sema->location.pool->rw_mem,
                     hw_sema->location.offset, threshold);

        gpu_sema_verbose_dbg(hw_sema->ch->g,
                             "(c=%d) RESET %u -> %u",
                             hw_sema->ch->chid, current_val, threshold);

        return true;
}

/*
 * Update nvgpu-tracked shadow of the value in "hw_sema" and mark the threshold
 * value to "s" which represents the increment that the caller must write in a
 * pushbuf. The same nvgpu_semaphore will also represent an output fence; when
 * nvgpu_semaphore_is_released(s) == true, the gpu is done with this increment.
 */
void nvgpu_semaphore_prepare(struct nvgpu_semaphore *s,
                             struct nvgpu_semaphore_int *hw_sema)
{
        int next = nvgpu_atomic_add_return(1, &hw_sema->next_value);

        /* "s" should be an uninitialized sema. */
        WARN_ON(s->incremented);

        nvgpu_atomic_set(&s->value, next);
        s->incremented = true;

        gpu_sema_verbose_dbg(s->g, "INCR sema for c=%d (%u)",
                             hw_sema->ch->chid, next);
}
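
/*
 * Rough lifetime sketch (based only on the functions in this file): a channel
 * calls nvgpu_semaphore_alloc() to get a ref-counted semaphore backed by its
 * VM's pool, then nvgpu_semaphore_prepare() to reserve the next increment
 * before emitting the corresponding release in a pushbuf; waiters poll
 * nvgpu_semaphore_is_released() until the GPU writes that value, and the
 * final nvgpu_semaphore_put() drops the ref and triggers the internal free.
 */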