|
|
@@ -575,6 +575,395 @@ void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
   FREE(slab);
}

/*
 * Attempt to allocate the given number of backing pages. Fewer pages may be
 * allocated (depending on the fragmentation of existing backing buffers),
 * which will be reflected by a change to *pnum_pages.
 */
static struct amdgpu_sparse_backing *
sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages)
{
   struct amdgpu_sparse_backing *best_backing;
   unsigned best_idx;
   uint32_t best_num_pages;

   best_backing = NULL;
   best_idx = 0;
   best_num_pages = 0;

   /* This is a very simple and inefficient best-fit algorithm. */
   list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
      for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
         uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
         if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
             (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
            best_backing = backing;
            best_idx = idx;
            best_num_pages = cur_num_pages;
         }
      }
   }
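
   /*
    * Note on the loop above: the running best is nudged towards the requested
    * size from either side (grown while it is still smaller than the request,
    * shrunk while it is larger), but each candidate is only compared against
    * the current best, so the result depends on the order in which chunks are
    * visited. That is acceptable here: any chunk works, and a too-small
    * result is simply reported back through *pnum_pages.
    */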

   /* Allocate a new backing buffer if necessary. */
   if (!best_backing) {
      struct pb_buffer *buf;
      uint64_t size;
      uint32_t pages;

      best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
      if (!best_backing)
         return NULL;

      best_backing->max_chunks = 4;
      best_backing->chunks = CALLOC(best_backing->max_chunks,
                                    sizeof(*best_backing->chunks));
      if (!best_backing->chunks) {
         FREE(best_backing);
         return NULL;
      }

      assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE));
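
      /*
       * Size the new backing buffer: scale with the sparse buffer (1/16th of
       * its size), cap it at 8 MB, never exceed the amount that is still
       * unbacked, and allocate at least one sparse page.
       */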
      size = MIN3(bo->base.size / 16,
                  8 * 1024 * 1024,
                  bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
      size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);

      buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE,
                             bo->initial_domain,
                             bo->u.sparse.flags | RADEON_FLAG_HANDLE);
      if (!buf) {
         FREE(best_backing->chunks);
         FREE(best_backing);
         return NULL;
      }

      /* We might have gotten a bigger buffer than requested via caching. */
      pages = buf->size / RADEON_SPARSE_PAGE_SIZE;

      best_backing->bo = amdgpu_winsys_bo(buf);
      best_backing->num_chunks = 1;
      best_backing->chunks[0].begin = 0;
      best_backing->chunks[0].end = pages;

      list_add(&best_backing->list, &bo->u.sparse.backing);
      bo->u.sparse.num_backing_pages += pages;

      best_idx = 0;
      best_num_pages = pages;
   }

   *pnum_pages = MIN2(*pnum_pages, best_num_pages);
   *pstart_page = best_backing->chunks[best_idx].begin;
   best_backing->chunks[best_idx].begin += *pnum_pages;

   if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
      memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
              sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
      best_backing->num_chunks--;
   }

   return best_backing;
}

static void
sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo,
                           struct amdgpu_sparse_backing *backing)
{
   bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE;

   list_del(&backing->list);
   amdgpu_winsys_bo_reference(&backing->bo, NULL);
   FREE(backing->chunks);
   FREE(backing);
}

/*
 * Return a range of pages from the given backing buffer back into the
 * free structure.
 */
static bool
sparse_backing_free(struct amdgpu_winsys_bo *bo,
                    struct amdgpu_sparse_backing *backing,
                    uint32_t start_page, uint32_t num_pages)
{
   uint32_t end_page = start_page + num_pages;
   unsigned low = 0;
   unsigned high = backing->num_chunks;

   /* Find the first chunk with begin >= start_page. */
   while (low < high) {
      unsigned mid = low + (high - low) / 2;

      if (backing->chunks[mid].begin >= start_page)
         high = mid;
      else
         low = mid + 1;
   }

   assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
   assert(low == 0 || backing->chunks[low - 1].end <= start_page);
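
   /*
    * Insert [start_page, end_page) into the sorted chunk list. Three cases:
    * extend the previous chunk (possibly fusing it with the following one),
    * extend the following chunk downwards, or insert a new chunk, growing
    * the chunk array first if it is full.
    */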
   if (low > 0 && backing->chunks[low - 1].end == start_page) {
      backing->chunks[low - 1].end = end_page;

      if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
         backing->chunks[low - 1].end = backing->chunks[low].end;
         memmove(&backing->chunks[low], &backing->chunks[low + 1],
                 sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
         backing->num_chunks--;
      }
   } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
      backing->chunks[low].begin = start_page;
   } else {
      if (backing->num_chunks >= backing->max_chunks) {
         unsigned new_max_chunks = 2 * backing->max_chunks;
         struct amdgpu_sparse_backing_chunk *new_chunks =
            REALLOC(backing->chunks,
                    sizeof(*backing->chunks) * backing->max_chunks,
                    sizeof(*backing->chunks) * new_max_chunks);
         if (!new_chunks)
            return false;

         backing->max_chunks = new_max_chunks;
         backing->chunks = new_chunks;
      }

      memmove(&backing->chunks[low + 1], &backing->chunks[low],
              sizeof(*backing->chunks) * (backing->num_chunks - low));
      backing->chunks[low].begin = start_page;
      backing->chunks[low].end = end_page;
      backing->num_chunks++;
   }

   if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
       backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE)
      sparse_free_backing_buffer(bo, backing);

   return true;
}

static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   int r;

   assert(!bo->bo && bo->sparse);

   r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
                           (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
                           bo->va, 0, AMDGPU_VA_OP_CLEAR);
   if (r) {
      fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
   }

   while (!list_empty(&bo->u.sparse.backing)) {
      struct amdgpu_sparse_backing *dummy = NULL;
      sparse_free_backing_buffer(bo,
                                 container_of(bo->u.sparse.backing.next,
                                              dummy, list));
   }

   amdgpu_va_range_free(bo->u.sparse.va_handle);
   mtx_destroy(&bo->u.sparse.commit_lock);
   FREE(bo->u.sparse.commitments);
   FREE(bo);
}

static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = {
   amdgpu_bo_sparse_destroy
   /* other functions are never called */
};

static struct pb_buffer *
amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
                        enum radeon_bo_domain domain,
                        enum radeon_bo_flag flags)
{
   struct amdgpu_winsys_bo *bo;
   uint64_t map_size;
   uint64_t va_gap_size;
   int r;

   /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
    * that exceed this limit. This is not really a restriction: we don't have
    * that much virtual address space anyway.
    */
   if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
      return NULL;

   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   if (!bo)
      return NULL;

   pipe_reference_init(&bo->base.reference, 1);
   bo->base.alignment = RADEON_SPARSE_PAGE_SIZE;
   bo->base.size = size;
   bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl;
   bo->ws = ws;
   bo->initial_domain = domain;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
   bo->sparse = true;
   bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE;

   bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
   bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages,
                                     sizeof(*bo->u.sparse.commitments));
   if (!bo->u.sparse.commitments)
      goto error_alloc_commitments;

   mtx_init(&bo->u.sparse.commit_lock, mtx_plain);
   LIST_INITHEAD(&bo->u.sparse.backing);

   /* For simplicity, we always map a multiple of the page size. */
   map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
   va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                             map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
                             0, &bo->va, &bo->u.sparse.va_handle, 0);
   if (r)
      goto error_va_alloc;
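
   /*
    * Map the whole virtual range as PRT without backing memory, so that
    * accesses to not-yet-committed pages are handled by the PRT mechanism
    * rather than faulting; committed ranges are later overridden with real
    * mappings via AMDGPU_VA_OP_REPLACE in amdgpu_bo_sparse_commit().
    */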
   r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va,
                           AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
   if (r)
      goto error_va_map;

   return &bo->base;

error_va_map:
   amdgpu_va_range_free(bo->u.sparse.va_handle);
error_va_alloc:
   mtx_destroy(&bo->u.sparse.commit_lock);
   FREE(bo->u.sparse.commitments);
error_alloc_commitments:
   FREE(bo);
   return NULL;
}

static bool
amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
                        bool commit)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
   struct amdgpu_sparse_commitment *comm;
   uint32_t va_page, end_va_page;
   bool ok = true;
   int r;

   assert(bo->sparse);
   assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
   assert(offset <= bo->base.size);
   assert(size <= bo->base.size - offset);
   assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size);

   comm = bo->u.sparse.commitments;
   va_page = offset / RADEON_SPARSE_PAGE_SIZE;
   end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
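
   /*
    * Committing: walk the affected page range, find maximal spans of pages
    * without backing memory, and map each span to chunks of backing memory
    * obtained from sparse_backing_alloc().
    * Decommitting: remap the whole range back to PRT-only in one operation,
    * then return the per-page backing ranges to their backing buffers.
    */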
   mtx_lock(&bo->u.sparse.commit_lock);

   if (commit) {
      while (va_page < end_va_page) {
         uint32_t span_va_page;

         /* Skip pages that are already committed. */
         if (comm[va_page].backing) {
            va_page++;
            continue;
         }

         /* Determine length of uncommitted span. */
         span_va_page = va_page;
         while (va_page < end_va_page && !comm[va_page].backing)
            va_page++;

         /* Fill the uncommitted span with chunks of backing memory. */
         while (span_va_page < va_page) {
            struct amdgpu_sparse_backing *backing;
            uint32_t backing_start, backing_size;

            backing_size = va_page - span_va_page;
            backing = sparse_backing_alloc(bo, &backing_start, &backing_size);
            if (!backing) {
               ok = false;
               goto out;
            }

            r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo,
                                    (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
                                    (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
                                    bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
                                    AMDGPU_VM_PAGE_READABLE |
                                    AMDGPU_VM_PAGE_WRITEABLE |
                                    AMDGPU_VM_PAGE_EXECUTABLE,
                                    AMDGPU_VA_OP_REPLACE);
            if (r) {
               ok = sparse_backing_free(bo, backing, backing_start, backing_size);
               assert(ok && "sufficient memory should already be allocated");

               ok = false;
               goto out;
            }

            while (backing_size) {
               comm[span_va_page].backing = backing;
               comm[span_va_page].page = backing_start;
               span_va_page++;
               backing_start++;
               backing_size--;
            }
         }
      }
   } else {
      r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
                              (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
                              bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
                              AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
      if (r) {
         ok = false;
         goto out;
      }
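
      /*
       * The VA range is PRT-only again at this point; the loop below is pure
       * bookkeeping: it walks the commitment array, coalesces runs of pages
       * that map to consecutive pages of the same backing buffer, and returns
       * them via sparse_backing_free().
       */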
      while (va_page < end_va_page) {
         struct amdgpu_sparse_backing *backing;
         uint32_t backing_start;
         uint32_t span_pages;

         /* Skip pages that are already uncommitted. */
         if (!comm[va_page].backing) {
            va_page++;
            continue;
         }

         /* Group contiguous spans of pages. */
         backing = comm[va_page].backing;
         backing_start = comm[va_page].page;
         comm[va_page].backing = NULL;

         span_pages = 1;
         va_page++;

         while (va_page < end_va_page &&
                comm[va_page].backing == backing &&
                comm[va_page].page == backing_start + span_pages) {
            comm[va_page].backing = NULL;
            va_page++;
            span_pages++;
         }

         if (!sparse_backing_free(bo, backing, backing_start, span_pages)) {
            /* Couldn't allocate tracking data structures, so we have to leak */
            fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
            ok = false;
         }
      }
   }
out:

   mtx_unlock(&bo->u.sparse.commit_lock);

   return ok;
}
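
/*
 * Illustrative usage sketch (not part of this patch): via the buffer_commit
 * hook that is wired up below as ws->base.buffer_commit, a driver would back
 * a page-aligned range of a sparse buffer with
 *
 *    ws->buffer_commit(buf, offset, size, true);
 *
 * and release it again with commit = false. offset must be a multiple of
 * RADEON_SPARSE_PAGE_SIZE, and so must size unless the range runs to the end
 * of the buffer (see the asserts in amdgpu_bo_sparse_commit above).
 */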

static unsigned eg_tile_split(unsigned tile_split)
{
   switch (tile_split) {
@@ -696,7 +1085,7 @@ amdgpu_bo_create(struct radeon_winsys *rws,
   unsigned usage = 0, pb_cache_bucket;

   /* Sub-allocate small buffers from slabs. */
-   if (!(flags & RADEON_FLAG_HANDLE) &&
+   if (!(flags & (RADEON_FLAG_HANDLE | RADEON_FLAG_SPARSE)) &&
       size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
       alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
      struct pb_slab_entry *entry;
@@ -742,6 +1131,15 @@ amdgpu_bo_create(struct radeon_winsys *rws,
   }
no_slab:

   if (flags & RADEON_FLAG_SPARSE) {
      assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
      assert(!(flags & RADEON_FLAG_CPU_ACCESS));

      flags |= RADEON_FLAG_NO_CPU_ACCESS;

      return amdgpu_bo_sparse_create(ws, size, domain, flags);
   }

   /* This flag is irrelevant for the cache. */
   flags &= ~RADEON_FLAG_HANDLE;

@@ -1003,6 +1401,7 @@ void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
   ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
   ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
   ws->base.buffer_get_handle = amdgpu_bo_get_handle;
   ws->base.buffer_commit = amdgpu_bo_sparse_commit;
   ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
   ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
}