Use local buffers when the global bo list is enabled, even if we don't use local buffers in general. It turns out that, even though the kernel's performance here is not the best, it still handles this better than our own list does. We still have to keep the radv bo list for buffers that are shared externally. This improves Talos on the lowest quality setting (i.e. as CPU-bound as possible) by ~10% if the global bo list is enabled.
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
tags/19.1-branchpoint
| @@ -3189,8 +3189,12 @@ static VkResult radv_alloc_memory(struct radv_device *device, | |||
| if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE) | |||
| flags |= RADEON_FLAG_GTT_WC; | |||
| if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes)) | |||
| if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes)) { | |||
| flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING; | |||
| if (device->use_global_bo_list) { | |||
| flags |= RADEON_FLAG_PREFER_LOCAL_BO; | |||
| } | |||
| } | |||
| mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment, | |||
| domain, flags, priority); | |||
| @@ -58,6 +58,7 @@ enum radeon_bo_flag { /* bitfield */ | |||
| RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 6), | |||
| RADEON_FLAG_READ_ONLY = (1 << 7), | |||
| RADEON_FLAG_32BIT = (1 << 8), | |||
| RADEON_FLAG_PREFER_LOCAL_BO = (1 << 9), | |||
| }; | |||
| enum radeon_bo_usage { /* bitfield */ | |||
| @@ -368,7 +368,8 @@ radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws, | |||
| if (!(flags & RADEON_FLAG_IMPLICIT_SYNC) && ws->info.drm_minor >= 22) | |||
| request.flags |= AMDGPU_GEM_CREATE_EXPLICIT_SYNC; | |||
| if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && | |||
| ws->info.has_local_buffers && ws->use_local_bos) { | |||
| ws->info.has_local_buffers && | |||
| (ws->use_local_bos || (flags & RADEON_FLAG_PREFER_LOCAL_BO))) { | |||
| bo->base.is_local = true; | |||
| request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID; | |||
| } | |||