浏览代码

aco: Lower to CSSA

Converting to 'Conventional SSA Form' ensures correctness w.r.t. spilling of phi nodes.
Previously, it was possible that phi operands have intersecting live-ranges, and thus,
couldn't get spilled to the same spilling slot. For this reason, ACO tried to avoid to
spill phis, even if it was beneficial.
This patch implements a conversion pass which is currently only called if spilling is necessary.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
tags/19.3-branchpoint
Daniel Schürmann 5 年前
父节点
当前提交
0b8216b2cd
共有 4 个文件被更改,包括 268 次插入41 次删除
  1. 11
    8
      src/amd/compiler/aco_builder_h.py
  2. 240
    0
      src/amd/compiler/aco_lower_to_cssa.cpp
  3. 16
    33
      src/amd/compiler/aco_spill.cpp
  4. 1
    0
      src/amd/compiler/meson.build

+ 11
- 8
src/amd/compiler/aco_builder_h.py 查看文件

@@ -115,10 +115,8 @@ public:

Program *program;
bool use_iterator;
union {
bool forwards; //when use_iterator == true
bool start; //when use_iterator == false
};
bool start; // only when use_iterator == false

std::vector<aco_ptr<Instruction>> *instructions;
std::vector<aco_ptr<Instruction>>::iterator it;

@@ -148,13 +146,19 @@ public:
instructions = instrs;
}

void reset(std::vector<aco_ptr<Instruction>> *instrs, std::vector<aco_ptr<Instruction>>::iterator instr_it) {
use_iterator = true;
start = false;
instructions = instrs;
it = instr_it;
}

Result insert(aco_ptr<Instruction> instr) {
Instruction *instr_ptr = instr.get();
if (instructions) {
if (use_iterator) {
it = instructions->emplace(it, std::move(instr));
if (forwards)
it = std::next(it);
it = std::next(it);
} else if (!start) {
instructions->emplace_back(std::move(instr));
} else {
@@ -168,8 +172,7 @@ public:
if (instructions) {
if (use_iterator) {
it = instructions->emplace(it, aco_ptr<Instruction>(instr));
if (forwards)
it = std::next(it);
it = std::next(it);
} else if (!start) {
instructions->emplace_back(aco_ptr<Instruction>(instr));
} else {

+ 240
- 0
src/amd/compiler/aco_lower_to_cssa.cpp 查看文件

@@ -0,0 +1,240 @@
/*
* Copyright © 2019 Valve Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
*/

#include <map>
#include "aco_ir.h"
#include "aco_builder.h"

/*
* Implements an algorithm to lower to Concentional SSA Form (CSSA).
* After "Revisiting Out-of-SSA Translation for Correctness, CodeQuality, and Efficiency"
* by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon,
*
* By lowering the IR to CSSA, the insertion of parallelcopies is separated from
* the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
* The algorithm tries to find beneficial insertion points by checking if a basic block
* is empty and if the variable already has a new definition in a dominating block.
*/


namespace aco {
namespace {

typedef std::map<uint32_t, std::vector<std::pair<Definition, Operand>>> phi_info;

struct cssa_ctx {
Program* program;
live& live_vars;
phi_info logical_phi_info;
phi_info linear_phi_info;

cssa_ctx(Program* program, live& live_vars) : program(program), live_vars(live_vars) {}
};

unsigned get_lca(cssa_ctx& ctx, unsigned x, unsigned y, bool is_logical)
{
while (x != y) {
if (x > y) {
x = is_logical ? ctx.program->blocks[x].logical_idom : ctx.program->blocks[x].linear_idom;
} else {
y = is_logical ? ctx.program->blocks[y].logical_idom : ctx.program->blocks[y].linear_idom;
}
}
return x;
}

bool collect_phi_info(cssa_ctx& ctx)
{
bool progress = false;
for (Block& block : ctx.program->blocks) {
for (aco_ptr<Instruction>& phi : block.instructions) {
bool is_logical;
if (phi->opcode == aco_opcode::p_phi)
is_logical = true;
else if (phi->opcode == aco_opcode::p_linear_phi)
is_logical = false;
else
break;

/* no CSSA for the exec mask as we don't spill it anyway */
if (phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec)
continue;
std::vector<unsigned>& preds = is_logical ? block.logical_preds : block.linear_preds;

/* collect definition's block per Operand */
std::vector<unsigned> def_points(phi->operands.size());
for (unsigned i = 0; i < phi->operands.size(); i++) {
Operand& op = phi->operands[i];
if (op.isUndefined()) {
def_points[i] = preds[i];
} else if (op.isConstant()) {
/* in theory, we could insert the definition there... */
def_points[i] = 0;
} else {
assert(op.isTemp());
unsigned pred = preds[i];
do {
def_points[i] = pred;
pred = is_logical ?
ctx.program->blocks[pred].logical_idom :
ctx.program->blocks[pred].linear_idom;
} while (def_points[i] != pred &&
ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end());
}
}

/* check live-range intersections */
for (unsigned i = 0; i < phi->operands.size(); i++) {
Operand op = phi->operands[i];
if (op.isUndefined())
continue;
/* check if the operand comes from the exec mask of a predecessor */
if (op.isTemp() && op.getTemp() == ctx.program->blocks[preds[i]].live_out_exec)
op.setFixed(exec);

/* calculate the earliest block where we can insert a copy if needed */
bool intersects = false;
unsigned upper_bound = preds[i];
while (def_points[i] < upper_bound) {
unsigned new_ub = is_logical ?
ctx.program->blocks[upper_bound].logical_idom :
ctx.program->blocks[upper_bound].linear_idom;

for (unsigned j = 0; j < phi->operands.size(); j++) {
/* same operands cannot intersect */
if (phi->operands[j].isTemp() && op.getTemp() == phi->operands[j].getTemp())
continue;
/* live-ranges intersect if any other definition point is dominated by the current definition */
intersects |= def_points[j] == new_ub;
}
if (intersects)
break;
else
upper_bound = new_ub;
}

if (!intersects) {
/* constants and live-through definitions can get a copy
* at their definition point if there is no other intersection */
if (op.isConstant() || !op.isKill() || op.regClass().type() != phi->definitions[0].regClass().type()) {
upper_bound = def_points[i];
} else {
continue;
}
}

progress = true;
unsigned insert_block = preds[i];

/* if the predecessor block is empty, try to insert at the dominator */
bool is_empty = (is_logical && ctx.program->blocks[insert_block].instructions.size() == 3) ||
ctx.program->blocks[insert_block].instructions.size() == 1;
if (is_empty) {
unsigned idom = is_logical ?
ctx.program->blocks[insert_block].logical_idom :
ctx.program->blocks[insert_block].linear_idom;
if (idom > upper_bound)
insert_block = idom;
}

/* if other operands share the same value, try to insert at LCA */
std::vector<unsigned> indices = {i};
for (unsigned j = i + 1; j < phi->operands.size(); j++) {
if ((phi->operands[j].isTemp() && op.isTemp() && phi->operands[j].getTemp() == op.getTemp()) ||
(phi->operands[j].isConstant() && op.isConstant() && phi->operands[j].constantValue() == op.constantValue())) {
unsigned lca = get_lca(ctx, insert_block, preds[j], is_logical);
if (lca > upper_bound) {
insert_block = lca;
indices.push_back(j);
}
}
}

/* create new temporary and rename operands */
Temp new_tmp = Temp{ctx.program->allocateId(), phi->definitions[0].regClass()};
if (is_logical)
ctx.logical_phi_info[insert_block].emplace_back(Definition(new_tmp), op);
else
ctx.linear_phi_info[insert_block].emplace_back(Definition(new_tmp), op);
for (unsigned index : indices) {
phi->operands[index] = Operand(new_tmp);
phi->operands[index].setKill(true);
def_points[index] = insert_block;
}
}
}
}
return progress;
}

void insert_parallelcopies(cssa_ctx& ctx)
{
/* insert the parallelcopies from logical phis before p_logical_end */
for (auto&& entry : ctx.logical_phi_info) {
Block& block = ctx.program->blocks[entry.first];
unsigned idx = block.instructions.size() - 1;
while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
assert(idx > 0);
idx--;
}

Builder bld(ctx.program);
bld.reset(&block.instructions, std::next(block.instructions.begin(), idx));
for (std::pair<Definition, Operand>& pair : entry.second)
bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
}

/* insert parallelcopies for the linear phis at the end of blocks just before the branch */
for (auto&& entry : ctx.linear_phi_info) {
Block& block = ctx.program->blocks[entry.first];
std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
--it;
assert((*it)->format == Format::PSEUDO_BRANCH);

Builder bld(ctx.program);
bld.reset(&block.instructions, it);
for (std::pair<Definition, Operand>& pair : entry.second)
bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
}
}

} /* end namespace */


void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options)
{
cssa_ctx ctx = {program, live_vars};
/* collect information about all interfering phi operands */
bool progress = collect_phi_info(ctx);

if (!progress)
return;

insert_parallelcopies(ctx);

/* update live variable information */
live_vars = live_var_analysis(program, options);
}
}


+ 16
- 33
src/amd/compiler/aco_spill.cpp 查看文件

@@ -541,9 +541,10 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id
bool spill = true;

for (unsigned i = 0; i < phi->operands.size(); i++) {
if (!phi->operands[i].isTemp())
spill = false;
else if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end())
if (phi->operands[i].isUndefined())
continue;
assert(phi->operands[i].isTemp());
if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end())
spill = false;
else
partial_spills.insert(phi->definitions[0].getTemp());
@@ -720,43 +721,23 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];

for (unsigned i = 0; i < phi->operands.size(); i++) {
unsigned pred_idx = preds[i];

/* we have to spill constants to the same memory address */
if (phi->operands[i].isConstant()) {
uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass());
for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
ctx.interferences[def_spill_id].second.emplace(pair.second);
ctx.interferences[pair.second].second.emplace(def_spill_id);
}
ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{def_spill_id, spill_id});

aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
spill->operands[0] = phi->operands[i];
spill->operands[1] = Operand(spill_id);
Block& pred = ctx.program->blocks[pred_idx];
unsigned idx = pred.instructions.size();
do {
assert(idx != 0);
idx--;
} while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
pred.instructions.insert(it, std::move(spill));
continue;
}
if (!phi->operands[i].isTemp())
if (phi->operands[i].isUndefined())
continue;

unsigned pred_idx = preds[i];
assert(phi->operands[i].isTemp() && phi->operands[i].isKill());
Temp var = phi->operands[i].getTemp();

/* build interferences between the phi def and all spilled variables at the predecessor blocks */
for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
if (phi->operands[i].getTemp() == pair.first)
if (var == pair.first)
continue;
ctx.interferences[def_spill_id].second.emplace(pair.second);
ctx.interferences[pair.second].second.emplace(def_spill_id);
}

/* variable is already spilled at predecessor */
std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp());
/* check if variable is already spilled at predecessor */
std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(var);
if (spilled != ctx.spills_exit[pred_idx].end()) {
if (spilled->second != def_spill_id)
ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{def_spill_id, spilled->second});
@@ -764,7 +745,6 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
}

/* rename if necessary */
Temp var = phi->operands[i].getTemp();
std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
if (rename_it != ctx.renames[pred_idx].end()) {
var = rename_it->second;
@@ -813,7 +793,7 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
continue;
}

/* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */ // FIXME: lower_to_cssa()
/* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */
if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end())
continue;

@@ -1567,6 +1547,9 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt
if (program->num_waves >= 6)
return;

/* lower to CSSA before spilling to ensure correctness w.r.t. phis */
lower_to_cssa(program, live_vars, options);

/* else, we check if we can improve things a bit */

/* calculate target register demand */

+ 1
- 0
src/amd/compiler/meson.build 查看文件

@@ -69,6 +69,7 @@ libaco_files = files(
'aco_register_allocation.cpp',
'aco_live_var_analysis.cpp',
'aco_lower_bool_phis.cpp',
'aco_lower_to_cssa.cpp',
'aco_lower_to_hw_instr.cpp',
'aco_optimizer.cpp',
'aco_opt_value_numbering.cpp',

正在加载...
取消
保存