author		Alex Waterman <alexw@nvidia.com>	2016-04-27 15:27:36 -0400
committer	Terje Bergstrom <tbergstrom@nvidia.com>	2016-06-28 18:49:11 -0400
commit		dfd5ec53fcce4ebae27f78242e6b788350337095 (patch)
tree		073ea380b9ee4734391d381745f57600c3525be5 /drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
parent		b30990ea6db564e885d5aee7a1a5ea87a1e5e8ee (diff)
gpu: nvgpu: Revamp semaphore support
Revamp the nvgpu driver's semaphore support.
The original problem with nvgpu's semaphore support was that it
required a SW-based wait for every semaphore release. This was because
gk20a_channel_semaphore_wait_fd() created a new semaphore for every
fence it waited on; that semaphore was then released by SW when the
fence signaled. As a result, every release necessarily involved a
sync_fence_wait_async() call, which could block. The latency of this
SW wait was enough to cause massive degradation in performance.
To fix this, a fast path was implemented: when a fence passed to
gk20a_channel_semaphore_wait_fd() is backed by a GPU semaphore, a
semaphore acquire is used directly to block the GPU. No
sync_fence_wait_async() call is performed and no extra semaphore is
created.
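
In outline the fast path reduces to something like the sketch below.
Only the gk20a_semaphore_*() helpers come from this patch;
emit_sema_acquire() is a hypothetical stand-in for the channel's
pushbuffer helper that encodes the actual ACQUIRE methods:

    /* Sketch only: emit_sema_acquire() is illustrative, not a real
     * function added by this patch. */
    static void fast_path_wait(struct channel_gk20a *c,
                               struct gk20a_semaphore *s)
    {
        u64 va = gk20a_semaphore_gpu_ro_va(s);      /* global RO alias */
        u32 payload = gk20a_semaphore_get_value(s); /* value to wait for */

        /* The GPU stalls until the releasing channel writes a value
         * >= payload at va; no SW wait and no extra semaphore. */
        emit_sema_acquire(c, va, payload);
    }
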
To implement this fast path the semaphore memory had to be shared
between channels. Previously, since a new semaphore was created on
every pass through gk20a_channel_semaphore_wait_fd(), the address
space a semaphore was mapped into was irrelevant. With the fast path,
however, a semaphore may be released in one address space but acquired
in another.
Sharing the semaphore memory was done by making a fixed GPU mapping in
all channels. This mapping points to the semaphore memory (the
so-called semaphore sea). The global fixed mapping is read-only to
make sure no semaphores can be incremented (i.e. released) by a
malicious channel. Each channel then gets a RW mapping of its own
semaphore. This way a channel may only acquire other channels'
semaphores but may both acquire and release its own semaphore.
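
The new header encodes this split directly: the owner releases through
its private RW mapping while any waiter acquires through the global RO
alias. A two-line sketch using only the helpers declared in this patch:

    /* Owner (releasing) channel: private RW mapping of its own page. */
    u64 release_va = gk20a_semaphore_gpu_rw_va(s);

    /* Any waiting channel: global RO mapping, valid in every VM. */
    u64 acquire_va = gk20a_semaphore_gpu_ro_va(s);
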
The gk20a fence code was updated to allow introspection of GPU-backed
fences. This makes it possible to detect when the fast path can be
taken. If the fast path cannot be used (for example, when a fence is
sync-pt backed) the original slow path is still present; it is used
when the GPU needs to wait on an event from something which only
understands sync-pts.
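
Conceptually the wait path now branches as in the sketch below, where
semaphore_from_fence() is a hypothetical name for the new fence
introspection (the real accessor lives in the gk20a fence code, not in
this header):

    struct gk20a_semaphore *s = semaphore_from_fence(f);

    if (s)  /* GPU-backed: block in HW with a semaphore acquire. */
        fast_path_wait(c, s);
    else    /* sync-pt backed: fall back to the SW slow path. */
        slow_path_wait(c, f);
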
Bug 1732449
JIRA DNVGPU-12
Change-Id: Ic0fea74994da5819a771deac726bb0d47a33c2de
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/1133792
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h')
 drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h | 303 +++++++++++++++++------
 1 file changed, 250 insertions(+), 53 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
index 1f12e262..58081b56 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
@@ -15,17 +15,128 @@
 #define SEMAPHORE_GK20A_H
 
 #include <linux/kref.h>
-#include "gk20a_allocator.h"
+#include <linux/list.h>
+#include <linux/delay.h>
+
+#include "gk20a.h"
 #include "mm_gk20a.h"
+#include "channel_gk20a.h"
+
+/*
+ * Max number of channels that can be used is 512. This of course needs to be
+ * fixed to be dynamic but still fast.
+ */
+#define SEMAPHORE_POOL_COUNT 512
+#define SEMAPHORE_SIZE 16
+#define SEMAPHORE_SEA_GROWTH_RATE 32
+
+struct gk20a_semaphore_sea;
+
+/*
+ * Underlying semaphore data structure. This semaphore can be shared amongst
+ * other semaphore instances.
+ */
+struct gk20a_semaphore_int {
+	int idx; /* Semaphore index. */
+	u32 offset; /* Offset into the pool. */
+	atomic_t next_value; /* Next available value. */
+	u32 *value; /* Current value (access w/ readl()). */
+	u32 nr_incrs; /* Number of increments programmed. */
+	struct gk20a_semaphore_pool *p; /* Pool that owns this sema. */
+	struct channel_gk20a *ch; /* Channel that owns this sema. */
+	struct list_head hw_sema_list; /* List of HW semaphores. */
+};
+
+/*
+ * A semaphore which the rest of the driver actually uses. This consists of a
+ * pointer to a real semaphore and a value to wait for. This allows one physical
+ * semaphore to be shared among an essentially infinite number of submits.
+ */
+struct gk20a_semaphore {
+	struct gk20a_semaphore_int *hw_sema;
 
-/* A memory pool for holding semaphores. */
+	atomic_t value;
+	int incremented;
+
+	struct kref ref;
+};
+
+/*
+ * A semaphore pool. Each address space will own exactly one of these.
+ */
 struct gk20a_semaphore_pool {
-	struct mem_desc mem;
-	struct gk20a *g;
-	struct list_head maps;
-	struct mutex maps_mutex;
+	struct page *page; /* This pool's page of memory */
+	struct list_head pool_list_entry; /* Node for list of pools. */
+	void *cpu_va; /* CPU access to the pool. */
+	u64 gpu_va; /* GPU access to the pool. */
+	u64 gpu_va_ro; /* GPU access to the pool. */
+	int page_idx; /* Index into sea bitmap. */
+
+	struct list_head hw_semas; /* List of HW semas. */
+	DECLARE_BITMAP(semas_alloced, PAGE_SIZE / SEMAPHORE_SIZE);
+
+	struct gk20a_semaphore_sea *sema_sea; /* Sea that owns this pool. */
+
+	struct mutex pool_lock;
+
+	/*
+	 * This is the address spaces's personal RW table. Other channels will
+	 * ultimately map this page as RO.
+	 */
+	struct sg_table *rw_sg_table;
+
+	/*
+	 * This is to keep track of whether the pool has had its sg_table
+	 * updated during sea resizing.
+	 */
+	struct sg_table *ro_sg_table;
+
+	int mapped;
+
+	/*
+	 * Sometimes a channel can be released before other channels are
+	 * done waiting on it. This ref count ensures that the pool doesn't
+	 * go away until all semaphores using this pool are cleaned up first.
+	 */
 	struct kref ref;
-	struct gk20a_allocator alloc;
+};
+
+/*
+ * A sea of semaphores pools. Each pool is owned by a single VM. Since multiple
+ * channels can share a VM each channel gets it's own HW semaphore from the
+ * pool. Channels then allocate regular semaphores - basically just a value that
+ * signifies when a particular job is done.
+ */
+struct gk20a_semaphore_sea {
+	struct list_head pool_list; /* List of pools in this sea. */
+	struct gk20a *gk20a;
+
+	size_t size; /* Number of pages available. */
+	u64 gpu_va; /* GPU virtual address of sema sea. */
+	u64 map_size; /* Size of the mapping. */
+
+	/*
+	 * TODO:
+	 * List of pages that we use to back the pools. The number of pages
+	 * can grow dynamically since allocating 512 pages for all channels at
+	 * once would be a tremendous waste.
+	 */
+	int page_count; /* Pages allocated to pools. */
+
+	struct sg_table *ro_sg_table;
+	/*
+	struct page *pages[SEMAPHORE_POOL_COUNT];
+	*/
+
+	struct mem_desc sea_mem;
+
+	/*
+	 * Can't use a regular allocator here since the full range of pools are
+	 * not always allocated. Instead just use a bitmap.
+	 */
+	DECLARE_BITMAP(pools_alloced, SEMAPHORE_POOL_COUNT);
+
+	struct mutex sea_lock; /* Lock alloc/free calls. */
 };
 
 enum gk20a_mem_rw_flag {
@@ -34,64 +145,150 @@ enum gk20a_mem_rw_flag {
 	gk20a_mem_flag_write_only = 2,
 };
 
-/* A semaphore pool can be mapped to multiple GPU address spaces. */
-struct gk20a_semaphore_pool_map {
-	u64 gpu_va;
-	enum gk20a_mem_rw_flag rw_flag;
-	struct vm_gk20a *vm;
-	struct list_head list;
-};
+/*
+ * Semaphore sea functions.
+ */
+struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *gk20a);
+int gk20a_semaphore_sea_map(struct gk20a_semaphore_pool *sea,
+			    struct vm_gk20a *vm);
+void gk20a_semaphore_sea_unmap(struct gk20a_semaphore_pool *sea,
+			       struct vm_gk20a *vm);
+struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g);
+
+/*
+ * Semaphore pool functions.
+ */
+struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
+		struct gk20a_semaphore_sea *sea);
+int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *pool,
+			     struct vm_gk20a *vm);
+void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *pool,
+				struct vm_gk20a *vm);
+u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global);
+void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p);
+void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p);
+
+/*
+ * Semaphore functions.
+ */
+struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch);
+void gk20a_semaphore_put(struct gk20a_semaphore *s);
+void gk20a_semaphore_get(struct gk20a_semaphore *s);
+
+/*
+ * Return the address of a specific semaphore.
+ *
+ * Don't call this on a semaphore you don't own - the VA returned will make no
+ * sense in your specific channel's VM.
+ */
+static inline u64 gk20a_semaphore_gpu_rw_va(struct gk20a_semaphore *s)
+{
+	return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, false) +
+		s->hw_sema->offset;
+}
+
+/*
+ * Get the global RO address for the semaphore. Can be called on any semaphore
+ * regardless of whether you own it.
+ */
+static inline u64 gk20a_semaphore_gpu_ro_va(struct gk20a_semaphore *s)
+{
+	return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, true) +
+		s->hw_sema->offset;
+}
+
+static inline u64 gk20a_hw_sema_addr(struct gk20a_semaphore_int *hw_sema)
+{
+	return __gk20a_semaphore_pool_gpu_va(hw_sema->p, true) +
+		hw_sema->offset;
+}
+
+/*
+ * TODO: handle wrap around... Hmm, how to do this?
+ */
+static inline bool gk20a_semaphore_is_released(struct gk20a_semaphore *s)
+{
+	u32 sema_val = readl(s->hw_sema->value);
 
-/* A semaphore that lives inside a semaphore pool. */
-struct gk20a_semaphore {
-	struct gk20a_semaphore_pool *pool;
 	/*
-	 * value exists within the pool's memory at the specified offset.
-	 * 0=acquired, 1=released.
+	 * If the underlying semaphore value is greater than or equal to
+	 * the value of the semaphore then the semaphore has been signaled
+	 * (a.k.a. released).
 	 */
-	u32 offset; /* byte offset within pool */
-	struct kref ref;
-};
+	return sema_val >= atomic_read(&s->value);
+}
 
-/* Create a semaphore pool that can hold at most 'capacity' semaphores. */
-struct gk20a_semaphore_pool *
-gk20a_semaphore_pool_alloc(struct gk20a *, const char *unique_name,
-			   size_t capacity);
-void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *);
-int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *,
-			     struct vm_gk20a *,
-			     enum gk20a_mem_rw_flag);
-void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *,
-				struct vm_gk20a *);
-u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *,
-				struct vm_gk20a *);
-
-/* Allocate a semaphore from the semaphore pool. The newly allocated
- * semaphore will be in acquired state (value=0). */
-struct gk20a_semaphore *
-gk20a_semaphore_alloc(struct gk20a_semaphore_pool *);
-void gk20a_semaphore_put(struct gk20a_semaphore *);
-void gk20a_semaphore_get(struct gk20a_semaphore *);
-
-static inline u64 gk20a_semaphore_gpu_va(struct gk20a_semaphore *s,
-					 struct vm_gk20a *vm)
+static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s)
 {
-	return gk20a_semaphore_pool_gpu_va(s->pool, vm) + s->offset;
+	return !gk20a_semaphore_is_released(s);
 }
 
-static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s)
+/*
+ * Read the underlying value from a semaphore.
+ */
+static inline u32 gk20a_semaphore_read(struct gk20a_semaphore *s)
 {
-	u32 v = gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset);
+	return readl(s->hw_sema->value);
+}
 
-	/* When often block on value reaching a certain threshold. We must make
-	 * sure that if we get unblocked, we haven't read anything too early. */
-	smp_rmb();
-	return v == 0;
+static inline u32 gk20a_semaphore_get_value(struct gk20a_semaphore *s)
+{
+	return atomic_read(&s->value);
 }
 
+static inline u32 gk20a_semaphore_next_value(struct gk20a_semaphore *s)
+{
+	return atomic_read(&s->hw_sema->next_value);
+}
+
+/*
+ * Note - if you call this then any prior semaphores will also be released.
+ */
 static inline void gk20a_semaphore_release(struct gk20a_semaphore *s)
 {
-	smp_wmb();
-	gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 1);
+	u32 current_val;
+	u32 val = gk20a_semaphore_get_value(s);
+	int attempts = 0;
+
+	/*
+	 * Wait until the sema value is 1 less than the write value. That
+	 * way this function is essentially an increment.
+	 *
+	 * TODO: tune the wait a little better.
+	 */
+	while ((current_val = gk20a_semaphore_read(s)) < (val - 1)) {
+		msleep(100);
+		attempts += 1;
+		if (attempts > 100) {
+			WARN(1, "Stall on sema release!");
+			return;
+		}
+	}
+
+	/*
+	 * If the semaphore has already passed the value we would write then
+	 * this is really just a NO-OP.
+	 */
+	if (current_val >= val)
+		return;
+
+	writel(val, s->hw_sema->value);
+}
+
+/*
+ * Configure a software based increment on this semaphore. This is useful for
+ * when we want the GPU to wait on a SW event before processing a channel.
+ * Another way to describe this is when the GPU needs to wait on a SW pre-fence.
+ * The pre-fence signals SW which in turn calls gk20a_semaphore_release() which
+ * then allows the GPU to continue.
+ *
+ * Also used to prep a semaphore for an INCR by the GPU.
+ */
+static inline void gk20a_semaphore_incr(struct gk20a_semaphore *s)
+{
+	BUG_ON(s->incremented);
+
+	atomic_set(&s->value, atomic_add_return(1, &s->hw_sema->next_value));
+	s->incremented = 1;
 }
 #endif
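
For orientation, a minimal lifecycle of one of these semaphores - a
sketch using only the functions declared above plus standard kernel
helpers, with error handling omitted:

    struct gk20a_semaphore *s;

    s = gk20a_semaphore_alloc(ch);  /* backed by ch's pool page */
    gk20a_semaphore_incr(s);        /* pick the value this job signals */

    /* The GPU (or SW, via gk20a_semaphore_release()) eventually writes
     * the value; completion can then be polled without a SW wait: */
    while (!gk20a_semaphore_is_released(s))
        usleep_range(10, 20);

    gk20a_semaphore_put(s);         /* drop the reference when done */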