besched: Change sched_foreach_from(sched_next(x), y) to sched_foreach_after(x, y).
[libfirm] / ir / be / beprefalloc.c
1 /*
2  * This file is part of libFirm.
3  * Copyright (C) 2012 University of Karlsruhe.
4  */
5
6 /**
7  * @file
8  * @brief       Preference Guided Register Assignment
9  * @author      Matthias Braun
10  * @date        14.2.2009
11  *
12  * The idea is to allocate registers in 2 passes:
13  * 1. A first pass to determine "preferred" registers for live-ranges. This
14  *    calculates for each register and each live-range a value indicating
15  *    the usefulness. (You can roughly think of the value as the negative
16  *    cost of the copies needed when the value ends up in that specific register...)
17  *
18  * 2. Walk the blocks and assign registers in a greedy fashion, preferring
19  *    registers with high preference values. When register constraints are not met,
20  *    add copies and split live-ranges.
21  *
22  * TODO:
23  *  - make use of free registers in the permute_values code
24  */
25 #include "config.h"
26
27 #include <float.h>
28 #include <stdbool.h>
29 #include <math.h>
30 #include "lpp.h"
31
32 #include "error.h"
33 #include "execfreq.h"
34 #include "ircons.h"
35 #include "irdom.h"
36 #include "iredges_t.h"
37 #include "irgraph_t.h"
38 #include "irgwalk.h"
39 #include "irnode_t.h"
40 #include "irprintf.h"
41 #include "irdump.h"
42 #include "irtools.h"
43 #include "util.h"
44 #include "obst.h"
45 #include "raw_bitset.h"
46 #include "unionfind.h"
47 #include "pdeq.h"
48 #include "hungarian.h"
49 #include "statev.h"
50 #include "beabi.h"
51 #include "bechordal_t.h"
52 #include "be.h"
53 #include "beirg.h"
54 #include "belive_t.h"
55 #include "bemodule.h"
56 #include "benode.h"
57 #include "bera.h"
58 #include "besched.h"
59 #include "bespill.h"
60 #include "bespillutil.h"
61 #include "beverify.h"
62 #include "beutil.h"
63 #include "bestack.h"
64
65 #define USE_FACTOR                     1.0f
66 #define DEF_FACTOR                     1.0f
67 #define NEIGHBOR_FACTOR                0.2f
68 #define AFF_SHOULD_BE_SAME             0.5f
69 #define AFF_PHI                        1.0f
70 #define SPLIT_DELTA                    1.0f
71 #define MAX_OPTIMISTIC_SPLIT_RECURSION 0
72
73 DEBUG_ONLY(static firm_dbg_module_t *dbg = NULL;)
74
75 static struct obstack               obst;
76 static ir_graph                    *irg;
77 static const arch_register_class_t *cls;
78 static be_lv_t                     *lv;
79 static unsigned                     n_regs;
80 static unsigned                    *normal_regs;
81 static int                         *congruence_classes;
82 static ir_node                    **block_order;
83 static size_t                       n_block_order;
84
85 /** currently active assignments (while processing a basic block);
86  * maps registers to values (their current copies) */
87 static ir_node **assignments;
88
89 /**
90  * allocation information: last_uses, register preferences
91  * the information is per firm-node.
92  */
93 struct allocation_info_t {
94         unsigned  last_uses[2];   /**< bitset indicating last uses (input pos) */
95         ir_node  *current_value;  /**< copy of the value that should be used */
96         ir_node  *original_value; /**< for copies point to original value */
97         float     prefs[];        /**< register preferences */
98 };
99 typedef struct allocation_info_t allocation_info_t;
100
101 /** helper datastructure used when sorting register preferences */
102 struct reg_pref_t {
103         unsigned num;
104         float    pref;
105 };
106 typedef struct reg_pref_t reg_pref_t;
107
108 /** per basic-block information */
109 struct block_info_t {
110         bool     processed;       /**< indicate whether block is processed */
111         ir_node *assignments[];   /**< register assignments at end of block */
112 };
113 typedef struct block_info_t block_info_t;
114
115 /**
116  * Get the allocation info for a node.
117  * The info is allocated on the first visit of a node.
118  */
119 static allocation_info_t *get_allocation_info(ir_node *node)
120 {
121         allocation_info_t *info = (allocation_info_t*)get_irn_link(node);
122         if (info == NULL) {
123                 info = OALLOCFZ(&obst, allocation_info_t, prefs, n_regs);
124                 info->current_value  = node;
125                 info->original_value = node;
126                 set_irn_link(node, info);
127         }
128
129         return info;
130 }
131
132 static allocation_info_t *try_get_allocation_info(const ir_node *node)
133 {
134         return (allocation_info_t*) get_irn_link(node);
135 }
136
137 /**
138  * Get allocation information for a basic block
139  */
140 static block_info_t *get_block_info(ir_node *block)
141 {
142         block_info_t *info = (block_info_t*)get_irn_link(block);
143
144         assert(is_Block(block));
145         if (info == NULL) {
146                 info = OALLOCFZ(&obst, block_info_t, assignments, n_regs);
147                 set_irn_link(block, info);
148         }
149
150         return info;
151 }
152
153 /**
154  * Link the allocation info of a node to a copy.
155  * Afterwards, both nodes use the same allocation info.
156  * Copy must not have an allocation info assigned yet.
157  *
158  * @param copy   the node that gets the allocation info assigned
159  * @param value  the original node
160  */
161 static void mark_as_copy_of(ir_node *copy, ir_node *value)
162 {
163         allocation_info_t *info      = get_allocation_info(value);
164         allocation_info_t *copy_info = get_allocation_info(copy);
165
166         /* find original value */
167         ir_node *original = info->original_value;
168         if (original != value) {
169                 info = get_allocation_info(original);
170         }
171
172         assert(info->original_value == original);
173         info->current_value = copy;
174
175         /* the copy should not be linked to something else yet */
176         assert(copy_info->original_value == copy);
177         copy_info->original_value = original;
178
179         /* copy over allocation preferences */
180         memcpy(copy_info->prefs, info->prefs, n_regs * sizeof(copy_info->prefs[0]));
181 }
182
183 /**
184  * Calculate the penalties for every register on a node and its live neighbors.
185  *
186  * @param live_nodes  the set of live nodes at the current position, may be NULL
187  * @param penalty     the penalty to subtract from the preferences
188  * @param limited     a raw bitset containing the limited set for the node
189  * @param node        the node
190  */
191 static void give_penalties_for_limits(const ir_nodeset_t *live_nodes,
192                                       float penalty, const unsigned* limited,
193                                       ir_node *node)
194 {
195         allocation_info_t *info = get_allocation_info(node);
196
197         /* give penalty for all forbidden regs */
198         for (unsigned r = 0; r < n_regs; ++r) {
199                 if (rbitset_is_set(limited, r))
200                         continue;
201
202                 info->prefs[r] -= penalty;
203         }
204
205         /* all other live values should get a penalty for allowed regs */
206         if (live_nodes == NULL)
207                 return;
208
209         penalty   *= NEIGHBOR_FACTOR;
210         size_t n_allowed = rbitset_popcount(limited, n_regs);
211         if (n_allowed > 1) {
212                 /* only create a very weak penalty if multiple regs are allowed */
213                 penalty = (penalty * 0.8f) / n_allowed;
214         }
215         foreach_ir_nodeset(live_nodes, neighbor, iter) {
216                 allocation_info_t *neighbor_info;
217
218                 /* TODO: if op is used on multiple inputs we might not do a
219                  * continue here */
220                 if (neighbor == node)
221                         continue;
222
223                 neighbor_info = get_allocation_info(neighbor);
224                 for (unsigned r = 0; r < n_regs; ++r) {
225                         if (!rbitset_is_set(limited, r))
226                                 continue;
227
228                         neighbor_info->prefs[r] -= penalty;
229                 }
230         }
231 }
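
To make the arithmetic concrete, here is a tiny self-contained model of the
two penalty loops above; plain C arrays stand in for libFirm's raw bitsets
and node sets, and the weights are invented for the example:

#include <stdio.h>

#define N_REGS          4
#define NEIGHBOR_FACTOR 0.2f

int main(void)
{
	/* a use that only allows r0 (the "limited" bitset) */
	const int limited[N_REGS]  = { 1, 0, 0, 0 };
	float     prefs[N_REGS]    = { 0 };  /* the constrained value */
	float     neighbor[N_REGS] = { 0 };  /* one live neighbor */
	float     penalty          = 2.0f;   /* weight * USE_FACTOR */

	/* the constrained value is penalized on every forbidden register */
	for (int r = 0; r < N_REGS; ++r) {
		if (!limited[r])
			prefs[r] -= penalty;
	}

	/* live neighbors get a weaker penalty on the allowed registers so
	 * they tend to move out of the way; with a single allowed register
	 * the extra 0.8/n_allowed damping from above does not apply */
	penalty *= NEIGHBOR_FACTOR;
	for (int r = 0; r < N_REGS; ++r) {
		if (limited[r])
			neighbor[r] -= penalty;
	}

	for (int r = 0; r < N_REGS; ++r)
		printf("r%d: value %+.2f, neighbor %+.2f\n",
		       r, prefs[r], neighbor[r]);
	return 0;
}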
232
233 /**
234  * Calculate the preferences of a definition for the current register class.
235  * If the definition uses a limited set of registers, penalize the forbidden
236  * registers on the node and the allowed registers on its live neighbors.
237  *
238  * @param live_nodes  the set of live nodes at the current node
239  * @param weight      the weight
240  * @param node        the current node
241  */
242 static void check_defs(ir_nodeset_t const *const live_nodes, float const weight, ir_node *const node, arch_register_req_t const *const req)
243 {
244         if (arch_register_req_is(req, limited)) {
245                 const unsigned *limited = req->limited;
246                 float           penalty = weight * DEF_FACTOR;
247                 give_penalties_for_limits(live_nodes, penalty, limited, node);
248         }
249
250         if (arch_register_req_is(req, should_be_same)) {
251                 ir_node           *insn  = skip_Proj(node);
252                 allocation_info_t *info  = get_allocation_info(node);
253                 int                arity = get_irn_arity(insn);
254
255                 float factor = 1.0f / rbitset_popcount(&req->other_same, arity);
256                 for (int i = 0; i < arity; ++i) {
257                         if (!rbitset_is_set(&req->other_same, i))
258                                 continue;
259
260                         ir_node *op = get_irn_n(insn, i);
261
262                         /* if the value at the should_be_same input doesn't die at the
263                          * node, then there is no use in propagating the constraints
264                          * (since a copy will emerge anyway) */
265                         if (ir_nodeset_contains(live_nodes, op))
266                                 continue;
267
268                         allocation_info_t *op_info = get_allocation_info(op);
269                         for (unsigned r = 0; r < n_regs; ++r) {
270                                 op_info->prefs[r] += info->prefs[r] * factor;
271                         }
272                 }
273         }
274 }
275
276 /**
277  * Walker: Runs on a block and calculates the preferences for every
278  * node and every register of the considered register class.
279  */
280 static void analyze_block(ir_node *block, void *data)
281 {
282         float        weight = (float)get_block_execfreq(block);
283         ir_nodeset_t live_nodes;
284         (void) data;
285
286         ir_nodeset_init(&live_nodes);
287         be_liveness_end_of_block(lv, cls, block, &live_nodes);
288
289         sched_foreach_reverse(block, node) {
290                 if (is_Phi(node))
291                         break;
292
293                 be_foreach_definition(node, cls, value, req,
294                         check_defs(&live_nodes, weight, value, req);
295                 );
296
297                 /* mark last uses */
298                 int arity = get_irn_arity(node);
299
300                 /* the allocation info currently uses a fixed-size bitset (last_uses)
301                    to mark the last-used inputs, so we fail for a node with more
302                    inputs than that bitset has bits. */
303                 allocation_info_t *info = get_allocation_info(node);
304                 if (arity >= (int) sizeof(info->last_uses) * 8) {
305                         panic("Node with more than %d inputs not supported yet",
306                                         (int) sizeof(info->last_uses) * 8);
307                 }
308
309                 for (int i = 0; i < arity; ++i) {
310                         ir_node                   *op  = get_irn_n(node, i);
311                         const arch_register_req_t *req = arch_get_irn_register_req(op);
312                         if (req->cls != cls)
313                                 continue;
314
315                         /* last usage of a value? */
316                         if (!ir_nodeset_contains(&live_nodes, op)) {
317                                 rbitset_set(info->last_uses, i);
318                         }
319                 }
320
321                 be_liveness_transfer(cls, node, &live_nodes);
322
323                 /* update weights based on usage constraints */
324                 be_foreach_use(node, cls, req, op, op_req,
325                         if (!arch_register_req_is(req, limited))
326                                 continue;
327
328                         give_penalties_for_limits(&live_nodes, weight * USE_FACTOR, req->limited, op);
329                 );
330         }
331
332         ir_nodeset_destroy(&live_nodes);
333 }
334
335 static void congruence_def(ir_nodeset_t *const live_nodes, ir_node const *const node, arch_register_req_t const *const req)
336 {
337         /* should be same constraint? */
338         if (arch_register_req_is(req, should_be_same)) {
339                 const ir_node *insn     = skip_Proj_const(node);
340                 int            arity    = get_irn_arity(insn);
341                 unsigned       node_idx = get_irn_idx(node);
342                 node_idx = uf_find(congruence_classes, node_idx);
343
344                 for (int i = 0; i < arity; ++i) {
345                         if (!rbitset_is_set(&req->other_same, i))
346                                 continue;
347
348                         ir_node *op     = get_irn_n(insn, i);
349                         int      op_idx = get_irn_idx(op);
350                         op_idx = uf_find(congruence_classes, op_idx);
351
352                         /* do we interfere with the value */
353                         bool interferes = false;
354                         foreach_ir_nodeset(live_nodes, live, iter) {
355                                 int lv_idx = get_irn_idx(live);
356                                 lv_idx     = uf_find(congruence_classes, lv_idx);
357                                 if (lv_idx == op_idx) {
358                                         interferes = true;
359                                         break;
360                                 }
361                         }
362                         /* don't put in same affinity class if we interfere */
363                         if (interferes)
364                                 continue;
365
366                         uf_union(congruence_classes, node_idx, op_idx);
367                         DB((dbg, LEVEL_3, "Merge %+F and %+F congruence classes\n",
368                             node, op));
369                         /* one should_be_same is enough... */
370                         break;
371                 }
372         }
373 }
374
375 static void create_congruence_class(ir_node *block, void *data)
376 {
377         ir_nodeset_t live_nodes;
378
379         (void) data;
380         ir_nodeset_init(&live_nodes);
381         be_liveness_end_of_block(lv, cls, block, &live_nodes);
382
383         /* check should be same constraints */
384         ir_node *last_phi = NULL;
385         sched_foreach_reverse(block, node) {
386                 if (is_Phi(node)) {
387                         last_phi = node;
388                         break;
389                 }
390
391                 be_foreach_definition(node, cls, value, req,
392                         congruence_def(&live_nodes, value, req);
393                 );
394                 be_liveness_transfer(cls, node, &live_nodes);
395         }
396         if (!last_phi) {
397                 ir_nodeset_destroy(&live_nodes);
398                 return;
399         }
400
401         /* check phi congruence classes */
402         sched_foreach_reverse_from(last_phi, phi) {
403                 assert(is_Phi(phi));
404
405                 if (!arch_irn_consider_in_reg_alloc(cls, phi))
406                         continue;
407
408                 int node_idx = get_irn_idx(phi);
409                 node_idx = uf_find(congruence_classes, node_idx);
410
411                 int arity = get_irn_arity(phi);
412                 for (int i = 0; i < arity; ++i) {
413                         ir_node *op     = get_Phi_pred(phi, i);
414                         int      op_idx = get_irn_idx(op);
415                         op_idx = uf_find(congruence_classes, op_idx);
416
417                         /* do we interfere with the value */
418                         bool interferes = false;
419                         foreach_ir_nodeset(&live_nodes, live, iter) {
420                                 int lv_idx = get_irn_idx(live);
421                                 lv_idx     = uf_find(congruence_classes, lv_idx);
422                                 if (lv_idx == op_idx) {
423                                         interferes = true;
424                                         break;
425                                 }
426                         }
427                         /* don't put in same affinity class if we interfere */
428                         if (interferes)
429                                 continue;
430                         /* any other phi has the same input? */
431                         sched_foreach(block, other_phi) {
432                                 ir_node *oop;
433                                 int      oop_idx;
434                                 if (!is_Phi(other_phi))
435                                         break;
436                                 if (!arch_irn_consider_in_reg_alloc(cls, other_phi))
437                                         continue;
438                                 oop = get_Phi_pred(other_phi, i);
439                                 if (oop == op)
440                                         continue;
441                                 oop_idx = get_irn_idx(oop);
442                                 oop_idx = uf_find(congruence_classes, oop_idx);
443                                 if (oop_idx == op_idx) {
444                                         interferes = true;
445                                         break;
446                                 }
447                         }
448                         if (interferes)
449                                 continue;
450
451                         /* merge the 2 congruence classes and sum up their preferences */
452                         int old_node_idx = node_idx;
453                         node_idx = uf_union(congruence_classes, node_idx, op_idx);
454                         DB((dbg, LEVEL_3, "Merge %+F and %+F congruence classes\n",
455                             phi, op));
456
457                         old_node_idx = node_idx == old_node_idx ? op_idx : old_node_idx;
458                         allocation_info_t *head_info
459                                 = get_allocation_info(get_idx_irn(irg, node_idx));
460                         allocation_info_t *other_info
461                                 = get_allocation_info(get_idx_irn(irg, old_node_idx));
462                         for (unsigned r = 0; r < n_regs; ++r) {
463                                 head_info->prefs[r] += other_info->prefs[r];
464                         }
465                 }
466         }
467         ir_nodeset_destroy(&live_nodes);
468 }
469
470 static void set_congruence_prefs(ir_node *node, void *data)
471 {
472         (void) data;
473         unsigned node_idx = get_irn_idx(node);
474         unsigned node_set = uf_find(congruence_classes, node_idx);
475
476         /* head of congruence class or not in any class */
477         if (node_set == node_idx)
478                 return;
479
480         if (!arch_irn_consider_in_reg_alloc(cls, node))
481                 return;
482
483         ir_node *head = get_idx_irn(irg, node_set);
484         allocation_info_t *head_info = get_allocation_info(head);
485         allocation_info_t *info      = get_allocation_info(node);
486
487         memcpy(info->prefs, head_info->prefs, n_regs * sizeof(info->prefs[0]));
488 }
489
490 static void combine_congruence_classes(void)
491 {
492         size_t n = get_irg_last_idx(irg);
493         congruence_classes = XMALLOCN(int, n);
494         uf_init(congruence_classes, n);
495
496         /* create congruence classes */
497         irg_block_walk_graph(irg, create_congruence_class, NULL, NULL);
498         /* merge preferences */
499         irg_walk_graph(irg, set_congruence_prefs, NULL, NULL);
500         free(congruence_classes);
501 }
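
For reference, congruence_classes is libFirm's array-based union-find from
unionfind.h. A minimal sketch of the three calls as they are used above;
the node indices are made up, and the snippet assumes libFirm's own headers:

#include <stdlib.h>
#include "unionfind.h"

static int same_class_demo(void)
{
	int  n       = 16;  /* get_irg_last_idx(irg) in the real code */
	int *classes = malloc(n * sizeof(*classes));
	uf_init(classes, n);  /* initially every index is its own class */

	/* merge the classes of the (made-up) node indices 3 and 7 */
	int a = uf_find(classes, 3);
	int b = uf_find(classes, 7);
	if (a != b)
		uf_union(classes, a, b);

	/* both indices now report the same representative */
	int same = uf_find(classes, 3) == uf_find(classes, 7);
	free(classes);
	return same;
}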
502
503
504
505 /**
506  * Assign register reg to the given node.
507  *
508  * @param node  the node
509  * @param reg   the register
510  */
511 static void use_reg(ir_node *node, const arch_register_t *reg, unsigned width)
512 {
513         unsigned r = reg->index;
514         for (unsigned r0 = r; r0 < r + width; ++r0)
515                 assignments[r0] = node;
516         arch_set_irn_register(node, reg);
517 }
518
519 static void free_reg_of_value(ir_node *node)
520 {
521         if (!arch_irn_consider_in_reg_alloc(cls, node))
522                 return;
523
524         const arch_register_t     *reg = arch_get_irn_register(node);
525         const arch_register_req_t *req = arch_get_irn_register_req(node);
526         unsigned                   r   = reg->index;
527         /* assignments[r0] may be NULL if a value is used at two inputs,
528          * so it gets freed twice. */
529         for (unsigned r0 = r; r0 < r + req->width; ++r0) {
530                 assert(assignments[r0] == node || assignments[r0] == NULL);
531                 assignments[r0] = NULL;
532         }
533 }
534
535 /**
536  * Compare two register preferences in decreasing order.
537  */
538 static int compare_reg_pref(const void *e1, const void *e2)
539 {
540         const reg_pref_t *rp1 = (const reg_pref_t*) e1;
541         const reg_pref_t *rp2 = (const reg_pref_t*) e2;
542         if (rp1->pref < rp2->pref)
543                 return 1;
544         if (rp1->pref > rp2->pref)
545                 return -1;
546         return 0;
547 }
548
549 static void fill_sort_candidates(reg_pref_t *regprefs,
550                                  const allocation_info_t *info)
551 {
552         for (unsigned r = 0; r < n_regs; ++r) {
553                 float pref = info->prefs[r];
554                 regprefs[r].num  = r;
555                 regprefs[r].pref = pref;
556         }
557         /* TODO: use a stable sort here to avoid unnecessary register jumping */
558         qsort(regprefs, n_regs, sizeof(regprefs[0]), compare_reg_pref);
559 }
560
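/**
 * Try to free the register occupied by @p to_split by moving that value to
 * another register via a Copy inserted before @p before. The split is only
 * done when the expected gain (pref_delta plus the preference for the new
 * register) beats a threshold derived from the block's execution frequency;
 * if no register is free, a recursive split of an occupant may be attempted
 * (bounded by MAX_OPTIMISTIC_SPLIT_RECURSION).
 *
 * @return true if @p to_split was moved and its old register is free now
 */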
561 static bool try_optimistic_split(ir_node *to_split, ir_node *before,
562                                  float pref, float pref_delta,
563                                  unsigned *forbidden_regs, int recursion)
564 {
565         (void) pref;
566         unsigned           r = 0;
567         allocation_info_t *info = get_allocation_info(to_split);
568         float              delta = 0;
569
570         /* stupid hack: don't optimistically split dont_spill nodes...
571          * (so we don't split away the values produced because of
572          *  must_be_different constraints) */
573         ir_node *original_insn = skip_Proj(info->original_value);
574         if (arch_get_irn_flags(original_insn) & arch_irn_flags_dont_spill)
575                 return false;
576
577         const arch_register_t *from_reg        = arch_get_irn_register(to_split);
578         unsigned               from_r          = from_reg->index;
579         ir_node               *block           = get_nodes_block(before);
580         float split_threshold = (float)get_block_execfreq(block) * SPLIT_DELTA;
581
582         if (pref_delta < split_threshold*0.5)
583                 return false;
584
585         /* find the best free position where we could move to */
586         reg_pref_t *prefs = ALLOCAN(reg_pref_t, n_regs);
587         fill_sort_candidates(prefs, info);
588         unsigned i;
589         for (i = 0; i < n_regs; ++i) {
590                 /* we need a normal register which is not an output register
591                    and different from the current register of to_split */
592                 r = prefs[i].num;
593                 if (!rbitset_is_set(normal_regs, r))
594                         continue;
595                 if (rbitset_is_set(forbidden_regs, r))
596                         continue;
597                 if (r == from_r)
598                         continue;
599
600                 /* is the split worth it? */
601                 delta = pref_delta + prefs[i].pref;
602                 if (delta < split_threshold) {
603                         DB((dbg, LEVEL_3, "Not doing optimistic split of %+F (depth %d), win %f too low\n",
604                                 to_split, recursion, delta));
605                         return false;
606                 }
607
608                 /* if the register is free then we can do the split */
609                 if (assignments[r] == NULL)
610                         break;
611
612                 /* otherwise we might try recursively calling optimistic_split */
613                 if (recursion+1 > MAX_OPTIMISTIC_SPLIT_RECURSION)
614                         continue;
615
616                 float apref        = prefs[i].pref;
617                 float apref_delta  = i+1 < n_regs ? apref - prefs[i+1].pref : 0;
618                 apref_delta += pref_delta - split_threshold;
619
620                 /* our source register isn't a useful destination for recursive
621                    splits */
622                 bool old_source_state = rbitset_is_set(forbidden_regs, from_r);
623                 rbitset_set(forbidden_regs, from_r);
624                 /* try recursive split */
625                 bool res = try_optimistic_split(assignments[r], before, apref,
626                                               apref_delta, forbidden_regs, recursion+1);
627                 /* restore the forbidden state of our source register */
628                 if (old_source_state) {
629                         rbitset_set(forbidden_regs, from_r);
630                 } else {
631                         rbitset_clear(forbidden_regs, from_r);
632                 }
633
634                 if (res)
635                         break;
636         }
637         if (i >= n_regs)
638                 return false;
639
640         const arch_register_t *reg   = arch_register_for_index(cls, r);
641         ir_node               *copy  = be_new_Copy(block, to_split);
642         unsigned               width = 1;
643         mark_as_copy_of(copy, to_split);
644         /* hacky, but correct here */
645         if (assignments[from_reg->index] == to_split)
646                 free_reg_of_value(to_split);
647         use_reg(copy, reg, width);
648         sched_add_before(before, copy);
649
650         DB((dbg, LEVEL_3,
651             "Optimistic live-range split %+F move %+F(%s) -> %s before %+F (win %f, depth %d)\n",
652             copy, to_split, from_reg->name, reg->name, before, delta, recursion));
653         return true;
654 }
655
656 /**
657  * Determine and assign a register for node @p node
658  */
659 static void assign_reg(ir_node const *const block, ir_node *const node, arch_register_req_t const *const req, unsigned *const forbidden_regs)
660 {
661         assert(!is_Phi(node));
662         /* preassigned register? */
663         arch_register_t const *final_reg = arch_get_irn_register(node);
664         unsigned         const width     = req->width;
665         if (final_reg != NULL) {
666                 DB((dbg, LEVEL_2, "Preassignment %+F -> %s\n", node, final_reg->name));
667                 use_reg(node, final_reg, width);
668                 return;
669         }
670
671         /* ignore reqs must be preassigned */
672         assert(!arch_register_req_is(req, ignore));
673
674         /* give should_be_same bonuses */
675         allocation_info_t *info    = get_allocation_info(node);
676         ir_node           *in_node = skip_Proj(node);
677         if (arch_register_req_is(req, should_be_same)) {
678                 float weight = (float)get_block_execfreq(block);
679                 int   arity  = get_irn_arity(in_node);
680
681                 assert(arity <= (int) sizeof(req->other_same) * 8);
682                 for (int i = 0; i < arity; ++i) {
683                         if (!rbitset_is_set(&req->other_same, i))
684                                 continue;
685
686                         ir_node               *in        = get_irn_n(in_node, i);
687                         const arch_register_t *reg       = arch_get_irn_register(in);
688                         unsigned               reg_index = reg->index;
689
690                         /* if the value didn't die here then we should not propagate the
691                          * should_be_same info */
692                         if (assignments[reg_index] == in)
693                                 continue;
694
695                         info->prefs[reg_index] += weight * AFF_SHOULD_BE_SAME;
696                 }
697         }
698
699         /* create list of register candidates and sort by their preference */
700         DB((dbg, LEVEL_2, "Candidates for %+F:", node));
701         reg_pref_t *reg_prefs = ALLOCAN(reg_pref_t, n_regs);
702         fill_sort_candidates(reg_prefs, info);
703         for (unsigned r = 0; r < n_regs; ++r) {
704                 unsigned num = reg_prefs[r].num;
705                 if (!rbitset_is_set(normal_regs, num))
706                         continue;
707                 const arch_register_t *reg = arch_register_for_index(cls, num);
708                 DB((dbg, LEVEL_2, " %s(%f)", reg->name, reg_prefs[r].pref));
709         }
710         DB((dbg, LEVEL_2, "\n"));
711
712         const unsigned *allowed_regs = normal_regs;
713         if (arch_register_req_is(req, limited)) {
714                 allowed_regs = req->limited;
715         }
716
717         unsigned final_reg_index = 0;
718         unsigned r;
719         for (r = 0; r < n_regs; ++r) {
720                 final_reg_index = reg_prefs[r].num;
721                 if (!rbitset_is_set(allowed_regs, final_reg_index))
722                         continue;
723                 /* alignment constraint? */
724                 if (width > 1) {
725                         if (arch_register_req_is(req, aligned) && (final_reg_index % width) != 0)
726                                 continue;
727                         bool fine = true;
728                         for (unsigned r0 = final_reg_index+1; r0 < final_reg_index+width; ++r0) {
729                                 if (assignments[r0] != NULL)
730                                         fine = false;
731                         }
732                         /* TODO: attempt optimistic split here */
733                         if (!fine)
734                                 continue;
735                 }
736
737                 if (assignments[final_reg_index] == NULL)
738                         break;
739                 float    pref   = reg_prefs[r].pref;
740                 float    delta  = r+1 < n_regs ? pref - reg_prefs[r+1].pref : 0;
741                 ir_node *before = skip_Proj(node);
742                 bool     res
743                         = try_optimistic_split(assignments[final_reg_index], before, pref,
744                                                delta, forbidden_regs, 0);
745                 if (res)
746                         break;
747         }
748         if (r >= n_regs) {
749                 /* the common reason to hit this panic is that one of your nodes is
750                  * not register-pressure faithful */
751                 panic("No register left for %+F\n", node);
752         }
753
754         final_reg = arch_register_for_index(cls, final_reg_index);
755         DB((dbg, LEVEL_2, "Assign %+F -> %s\n", node, final_reg->name));
756         use_reg(node, final_reg, width);
757 }
758
759 /**
760  * Add a permutation in front of a node and change the assignments
761  * due to this permutation.
762  *
763  * To understand this imagine a permutation like this:
764  *
765  * 1 -> 2
766  * 2 -> 3
767  * 3 -> 1, 5
768  * 4 -> 6
769  * 5
770  * 6
771  * 7 -> 7
772  *
773  * First we count how many destinations a single value has. At the same time
774  * we can be sure that each destination register has at most 1 source register
775  * (it can have 0 which means we don't care what value is in it).
776  * We ignore all fulfilled permutations (like 7->7).
777  * In a first pass we create as many copy instructions as possible, as they
778  * are generally cheaper than exchanges. We do this by counting into how many
779  * destinations a register has to be copied (in the example it's 2 for register
780  * 3, or 1 for the registers 1,2,4 and 7).
781  * We can then create a copy into every destination register when the usecount
782  * of that register is 0 (= no one else needs the value in the register).
783  *
784  * After this step we should only have cycles left. We implement a cyclic
785  * permutation of n registers with n-1 transpositions.
786  *
787  * @param live_nodes   the set of live nodes, updated due to live range split
788  * @param before       the node before we add the permutation
789  * @param permutation  the permutation array indices are the destination
790  *                     registers, the values in the array are the source
791  *                     registers.
792  */
793 static void permute_values(ir_nodeset_t *live_nodes, ir_node *before,
794                            unsigned *permutation)
795 {
796         unsigned *n_used = ALLOCANZ(unsigned, n_regs);
797
798         /* determine how often each source register needs to be read */
799         for (unsigned r = 0; r < n_regs; ++r) {
800                 unsigned  old_reg = permutation[r];
801                 ir_node  *value;
802
803                 value = assignments[old_reg];
804                 if (value == NULL) {
805                         /* nothing to do here, reg is not live. Mark it as fixpoint
806                          * so we ignore it in the next steps */
807                         permutation[r] = r;
808                         continue;
809                 }
810
811                 ++n_used[old_reg];
812         }
813
814         ir_node *block = get_nodes_block(before);
815
816         /* step1: create copies where immediately possible */
817         for (unsigned r = 0; r < n_regs; /* empty */) {
818                 unsigned old_r = permutation[r];
819
820                 /* - no need to do anything for fixed points.
821                    - we can't copy if the value in the dest reg is still needed */
822                 if (old_r == r || n_used[r] > 0) {
823                         ++r;
824                         continue;
825                 }
826
827                 /* create a copy */
828                 ir_node *src  = assignments[old_r];
829                 ir_node *copy = be_new_Copy(block, src);
830                 sched_add_before(before, copy);
831                 const arch_register_t *reg = arch_register_for_index(cls, r);
832                 DB((dbg, LEVEL_2, "Copy %+F (from %+F, before %+F) -> %s\n",
833                     copy, src, before, reg->name));
834                 mark_as_copy_of(copy, src);
835                 unsigned width = 1; /* TODO */
836                 use_reg(copy, reg, width);
837
838                 if (live_nodes != NULL) {
839                         ir_nodeset_insert(live_nodes, copy);
840                 }
841
842                 /* old register has 1 user less, permutation is resolved */
843                 assert(arch_get_irn_register(src)->index == old_r);
844                 permutation[r] = r;
845
846                 assert(n_used[old_r] > 0);
847                 --n_used[old_r];
848                 if (n_used[old_r] == 0) {
849                         if (live_nodes != NULL) {
850                                 ir_nodeset_remove(live_nodes, src);
851                         }
852                         free_reg_of_value(src);
853                 }
854
855                 /* advance or jump back (if this copy enabled another copy) */
856                 if (old_r < r && n_used[old_r] == 0) {
857                         r = old_r;
858                 } else {
859                         ++r;
860                 }
861         }
862
863         /* at this point we only have "cycles" left which we have to resolve with
864          * perm instructions
865          * TODO: if we have free registers left, then we should really use copy
866          * instructions for any cycle longer than 2 registers...
867          * (this is probably architecture dependent, there might be archs where
868          *  copies are preferable even for 2-cycles) */
869
870         /* create perms with the rest */
871         for (unsigned r = 0; r < n_regs; /* empty */) {
872                 unsigned old_r = permutation[r];
873
874                 if (old_r == r) {
875                         ++r;
876                         continue;
877                 }
878
879                 /* we shouldn't have copies from one value to multiple destinations left */
880                 assert(n_used[old_r] == 1);
881
882                 /* exchange old_r and r2; after that old_r is a fixed point */
883                 unsigned r2 = permutation[old_r];
884
885                 ir_node *in[2] = { assignments[r2], assignments[old_r] };
886                 ir_node *perm = be_new_Perm(cls, block, 2, in);
887                 sched_add_before(before, perm);
888                 DB((dbg, LEVEL_2, "Perm %+F (perm %+F,%+F, before %+F)\n",
889                     perm, in[0], in[1], before));
890
891                 unsigned width = 1; /* TODO */
892
893                 ir_node *proj0 = new_r_Proj(perm, get_irn_mode(in[0]), 0);
894                 mark_as_copy_of(proj0, in[0]);
895                 const arch_register_t *reg0 = arch_register_for_index(cls, old_r);
896                 use_reg(proj0, reg0, width);
897
898                 ir_node *proj1 = new_r_Proj(perm, get_irn_mode(in[1]), 1);
899                 mark_as_copy_of(proj1, in[1]);
900                 const arch_register_t *reg1 = arch_register_for_index(cls, r2);
901                 use_reg(proj1, reg1, width);
902
903                 /* 1 value is now in the correct register */
904                 permutation[old_r] = old_r;
905                 /* the source of r changed to r2 */
906                 permutation[r] = r2;
907
908                 /* if we have reached a fixpoint update data structures */
909                 if (live_nodes != NULL) {
910                         ir_nodeset_remove(live_nodes, in[0]);
911                         ir_nodeset_remove(live_nodes, in[1]);
912                         ir_nodeset_remove(live_nodes, proj0);
913                         ir_nodeset_insert(live_nodes, proj1);
914                 }
915         }
916
917 #ifndef NDEBUG
918         /* now we should only have fixpoints left */
919         for (unsigned r = 0; r < n_regs; ++r) {
920                 assert(permutation[r] == r);
921         }
922 #endif
923 }
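
The interplay of the two phases is easiest to see on a toy model. The
following self-contained sketch mirrors the copy/swap logic above on plain
arrays (a char stands for a live value, '\0' for a free register) and runs
the example permutation from the function comment, printing "copy"/"swap"
where the real code builds Copy and Perm nodes:

#include <assert.h>
#include <stdio.h>

#define N 8

static void resolve(char *regs, unsigned *perm)
{
	unsigned n_used[N] = { 0 };

	/* count the readers of every source register; dead sources become
	 * fixpoints right away */
	for (unsigned r = 0; r < N; ++r) {
		if (regs[perm[r]] == '\0')
			perm[r] = r;
		else
			++n_used[perm[r]];
	}

	/* step 1: copies, wherever the destination's old content is dead */
	for (unsigned r = 0; r < N; /* empty */) {
		unsigned old_r = perm[r];
		if (old_r == r || n_used[r] > 0) {
			++r;
			continue;
		}
		printf("copy r%u -> r%u\n", old_r, r);
		regs[r] = regs[old_r];
		perm[r] = r;
		if (--n_used[old_r] == 0)
			regs[old_r] = '\0';  /* free_reg_of_value */
		/* jump back if this copy enabled another one */
		if (old_r < r && n_used[old_r] == 0)
			r = old_r;
		else
			++r;
	}

	/* step 2: only cycles are left; break them with swaps (Perms) */
	for (unsigned r = 0; r < N; /* empty */) {
		unsigned old_r = perm[r];
		if (old_r == r) {
			++r;
			continue;
		}
		assert(n_used[old_r] == 1);
		unsigned r2 = perm[old_r];
		printf("swap r%u <-> r%u\n", old_r, r2);
		char tmp    = regs[old_r];
		regs[old_r] = regs[r2];
		regs[r2]    = tmp;
		perm[old_r] = old_r;  /* one register fixed per swap */
		perm[r]     = r2;
	}

	for (unsigned r = 0; r < N; ++r)
		assert(perm[r] == r);
}

int main(void)
{
	/* 1 -> 2, 2 -> 3, 3 -> 1 and 5, 4 -> 6, 7 -> 7 (perm[dest] = src) */
	char     regs[N] = { 0, 'a', 'b', 'c', 'd', 0, 0, 'e' };
	unsigned perm[N] = { 0, 3, 1, 2, 4, 3, 4, 7 };
	resolve(regs, perm);
	return 0;
}

This prints two copies (r3 -> r5, r4 -> r6) followed by two swaps that
resolve the 1 -> 2 -> 3 -> 1 cycle, matching the n-1 transpositions
mentioned above.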
924
925 /**
926  * Free regs for values last used.
927  *
928  * @param live_nodes   set of live nodes, will be updated
929  * @param node         the node to consider
930  */
931 static void free_last_uses(ir_nodeset_t *live_nodes, ir_node *node)
932 {
933         allocation_info_t *info      = get_allocation_info(node);
934         const unsigned    *last_uses = info->last_uses;
935         int                arity     = get_irn_arity(node);
936
937         for (int i = 0; i < arity; ++i) {
938                 /* check if one operand is the last use */
939                 if (!rbitset_is_set(last_uses, i))
940                         continue;
941
942                 ir_node *op = get_irn_n(node, i);
943                 free_reg_of_value(op);
944                 ir_nodeset_remove(live_nodes, op);
945         }
946 }
947
948 /**
949  * change inputs of a node to the current value (copies/perms)
950  */
951 static void rewire_inputs(ir_node *node)
952 {
953         int arity = get_irn_arity(node);
954         for (int i = 0; i < arity; ++i) {
955                 ir_node           *op = get_irn_n(node, i);
956                 allocation_info_t *info = try_get_allocation_info(op);
957
958                 if (info == NULL)
959                         continue;
960
961                 info = get_allocation_info(info->original_value);
962                 if (info->current_value != op) {
963                         set_irn_n(node, i, info->current_value);
964                 }
965         }
966 }
967
968 /**
969  * Create a bitset of registers occupied by values living through an
970  * instruction.
971  */
972 static void determine_live_through_regs(unsigned *bitset, ir_node *node)
973 {
974         const allocation_info_t *info = get_allocation_info(node);
975
976         /* mark all used registers as potentially live-through */
977         for (unsigned r = 0; r < n_regs; ++r) {
978                 if (assignments[r] == NULL)
979                         continue;
980                 if (!rbitset_is_set(normal_regs, r))
981                         continue;
982
983                 rbitset_set(bitset, r);
984         }
985
986         /* remove registers of values dying at the instruction */
987         int arity = get_irn_arity(node);
988         for (int i = 0; i < arity; ++i) {
989                 if (!rbitset_is_set(info->last_uses, i))
990                         continue;
991
992                 ir_node               *op  = get_irn_n(node, i);
993                 const arch_register_t *reg = arch_get_irn_register(op);
994                 rbitset_clear(bitset, reg->index);
995         }
996 }
997
998 static void solve_lpp(ir_nodeset_t *live_nodes, ir_node *node,
999                       unsigned *forbidden_regs, unsigned *live_through_regs)
1000 {
1001         unsigned *forbidden_edges = rbitset_malloc(n_regs * n_regs);
1002         int      *lpp_vars        = XMALLOCNZ(int, n_regs*n_regs);
1003
1004         lpp_t *lpp = lpp_new("prefalloc", lpp_minimize);
1005         //lpp_set_time_limit(lpp, 20);
1006         lpp_set_log(lpp, stdout);
1007
1008         /* mark some edges as forbidden */
1009         be_foreach_use(node, cls, req, op, op_req,
1010                 if (!arch_register_req_is(req, limited))
1011                         continue;
1012
1013                 const unsigned        *limited     = req->limited;
1014                 const arch_register_t *reg         = arch_get_irn_register(op);
1015                 unsigned               current_reg = reg->index;
1016                 for (unsigned r = 0; r < n_regs; ++r) {
1017                         if (rbitset_is_set(limited, r))
1018                                 continue;
1019
1020                         rbitset_set(forbidden_edges, current_reg*n_regs + r);
1021                 }
1022         );
1023
1024         /* add all combinations, except for not allowed ones */
1025         for (unsigned l = 0; l < n_regs; ++l) {
1026                 if (!rbitset_is_set(normal_regs, l)) {
1027                         char name[15];
1028                         snprintf(name, sizeof(name), "%u_to_%u", l, l);
1029                         lpp_vars[l*n_regs+l] = lpp_add_var(lpp, name, lpp_binary, 1);
1030                         continue;
1031                 }
1032
1033                 for (unsigned r = 0; r < n_regs; ++r) {
1034                         if (!rbitset_is_set(normal_regs, r))
1035                                 continue;
1036                         if (rbitset_is_set(forbidden_edges, l*n_regs + r))
1037                                 continue;
1038                         /* livethrough values may not use constrained output registers */
1039                         if (rbitset_is_set(live_through_regs, l)
1040                             && rbitset_is_set(forbidden_regs, r))
1041                                 continue;
1042
1043                         char name[15];
1044                         snprintf(name, sizeof(name), "%u_to_%u", l, r);
1045
1046                         double costs = l==r ? 9 : 8;
1047                         lpp_vars[l*n_regs+r]
1048                                 = lpp_add_var(lpp, name, lpp_binary, costs);
1049                         assert(lpp_vars[l*n_regs+r] > 0);
1050                 }
1051         }
1052         /* add constraints */
1053         for (unsigned l = 0; l < n_regs; ++l) {
1054                 /* only 1 destination per register */
1055                 int constraint = -1;
1056                 for (unsigned r = 0; r < n_regs; ++r) {
1057                         int var = lpp_vars[l*n_regs+r];
1058                         if (var == 0)
1059                                 continue;
1060                         if (constraint < 0) {
1061                                 char name[64];
1062                                 snprintf(name, sizeof(name), "%u_to_dest", l);
1063                                 constraint = lpp_add_cst(lpp, name, lpp_equal, 1);
1064                         }
1065                         lpp_set_factor_fast(lpp, constraint, var, 1);
1066                 }
1067                 /* each destination used by at most 1 value */
1068                 constraint = -1;
1069                 for (unsigned r = 0; r < n_regs; ++r) {
1070                         int var = lpp_vars[r*n_regs+l];
1071                         if (var == 0)
1072                                 continue;
1073                         if (constraint < 0) {
1074                                 char name[64];
1075                                 snprintf(name, sizeof(name), "one_to_%u", l);
1076                                 constraint = lpp_add_cst(lpp, name, lpp_less_equal, 1);
1077                         }
1078                         lpp_set_factor_fast(lpp, constraint, var, 1);
1079                 }
1080         }
1081
1082         lpp_dump_plain(lpp, fopen("lppdump.txt", "w"));
1083
1084         /* solve lpp */
1085         lpp_solve(lpp, be_options.ilp_server, be_options.ilp_solver);
1086         if (!lpp_is_sol_valid(lpp))
1087                 panic("ilp solution not valid!");
1088
1089         unsigned *assignment = ALLOCAN(unsigned, n_regs);
1090         for (unsigned l = 0; l < n_regs; ++l) {
1091                 unsigned dest_reg = (unsigned)-1;
1092                 for (unsigned r = 0; r < n_regs; ++r) {
1093                         int var = lpp_vars[l*n_regs+r];
1094                         if (var == 0)
1095                                 continue;
1096                         double val = lpp_get_var_sol(lpp, var);
1097                         if (val == 1) {
1098                                 assert(dest_reg == (unsigned)-1);
1099                                 dest_reg = r;
1100                         }
1101                 }
1102                 assert(dest_reg != (unsigned)-1);
1103                 assignment[dest_reg] = l;
1104         }
1105
1106         fprintf(stderr, "Assignment: ");
1107         for (unsigned l = 0; l < n_regs; ++l) {
1108                 fprintf(stderr, "%u ", assignment[l]);
1109         }
1110         fprintf(stderr, "\n");
1111         fflush(stdout);
1112         permute_values(live_nodes, node, assignment);
1113         lpp_free(lpp);
1114 }
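
Written out, the program built above is a small assignment problem: with a
binary variable x(l,r) for every allowed move of live source register l to
destination register r, the constraints are

	sum over r of x(l,r)  = 1   for every live source l   (exactly one destination)
	sum over l of x(l,r) <= 1   for every destination r   (taken at most once)

Forbidden moves simply get no variable, and the values read back through
lpp_get_var_sol form exactly the destination-to-source permutation array
that is handed to permute_values.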
1115
1116 static bool is_aligned(unsigned num, unsigned alignment)
1117 {
1118         unsigned mask = alignment-1;
1119         assert(is_po2(alignment));
1120         return (num&mask) == 0;
1121 }
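
For example, is_aligned(6, 2) is true (6 & 1 == 0), while is_aligned(6, 4)
is false (6 & 3 == 2).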
1122
1123 /**
1124  * Enforce constraints at a node by live range splits.
1125  *
1126  * @param  live_nodes  the set of live nodes, might be changed
1127  * @param  node        the current node
1128  */
1129 static void enforce_constraints(ir_nodeset_t *live_nodes, ir_node *node,
1130                                 unsigned *forbidden_regs)
1131 {
1132         /* see if any use constraints are not met and whether double-width
1133          * values are involved */
1134         bool double_width = false;
1135         bool good = true;
1136         be_foreach_use(node, cls, req, op, op_req,
1137                 /* are there any limitations for the current operand? */
1138                 if (req->width > 1)
1139                         double_width = true;
1140                 const arch_register_t *reg       = arch_get_irn_register(op);
1141                 unsigned               reg_index = reg->index;
1142                 if (arch_register_req_is(req, aligned)) {
1143                         if (!is_aligned(reg_index, req->width)) {
1144                                 good = false;
1145                                 continue;
1146                         }
1147                 }
1148                 if (!arch_register_req_is(req, limited))
1149                         continue;
1150
1151                 const unsigned *limited = req->limited;
1152                 if (!rbitset_is_set(limited, reg_index)) {
1153                         /* found an assignment outside the limited set */
1154                         good = false;
1155                         continue;
1156                 }
1157         );
1158
1159         /* is any of the live-throughs using a constrained output register? */
1160         unsigned *live_through_regs = NULL;
1161         be_foreach_definition(node, cls, value, req,
1162                 (void)value;
1163                 if (req->width > 1)
1164                         double_width = true;
1165                 if (!arch_register_req_is(req, limited))
1166                         continue;
1167                 if (live_through_regs == NULL) {
1168                         live_through_regs = rbitset_alloca(n_regs);
1169                         determine_live_through_regs(live_through_regs, node);
1170                 }
1171                 rbitset_or(forbidden_regs, req->limited, n_regs);
1172                 if (rbitsets_have_common(req->limited, live_through_regs, n_regs))
1173                         good = false;
1174         );
1175
1176         if (good)
1177                 return;
1178
1179         /* create these arrays if we haven't yet */
1180         if (live_through_regs == NULL) {
1181                 live_through_regs = rbitset_alloca(n_regs);
1182         }
1183
1184         if (double_width) {
1185                 /* only the ILP variant can solve this yet */
1186                 solve_lpp(live_nodes, node, forbidden_regs, live_through_regs);
1187                 return;
1188         }
1189
1190         /* at this point we have to construct a bipartite matching problem to see
1191          * which values should go to which registers
1192          * Note: We're building the matrix in "reverse" - source registers are
1193          *       right, destinations left because this will produce the solution
1194          *       in the format required for permute_values.
1195          */
1196         hungarian_problem_t *bp
1197                 = hungarian_new(n_regs, n_regs, HUNGARIAN_MATCH_PERFECT);
1198
1199         /* add all combinations, then remove not allowed ones */
1200         for (unsigned l = 0; l < n_regs; ++l) {
1201                 if (!rbitset_is_set(normal_regs, l)) {
1202                         hungarian_add(bp, l, l, 1);
1203                         continue;
1204                 }
1205
1206                 for (unsigned r = 0; r < n_regs; ++r) {
1207                         if (!rbitset_is_set(normal_regs, r))
1208                                 continue;
1209                         /* livethrough values may not use constrained output registers */
1210                         if (rbitset_is_set(live_through_regs, l)
1211                                         && rbitset_is_set(forbidden_regs, r))
1212                                 continue;
1213
1214                         hungarian_add(bp, r, l, l == r ? 9 : 8);
1215                 }
1216         }
1217
1218         be_foreach_use(node, cls, req, op, op_req,
1219                 if (!arch_register_req_is(req, limited))
1220                         continue;
1221
1222                 const unsigned        *limited     = req->limited;
1223                 const arch_register_t *reg         = arch_get_irn_register(op);
1224                 unsigned               current_reg = reg->index;
1225                 for (unsigned r = 0; r < n_regs; ++r) {
1226                         if (rbitset_is_set(limited, r))
1227                                 continue;
1228                         hungarian_remove(bp, r, current_reg);
1229                 }
1230         );
1231
1232         //hungarian_print_cost_matrix(bp, 1);
1233         hungarian_prepare_cost_matrix(bp, HUNGARIAN_MODE_MAXIMIZE_UTIL);
1234
1235         unsigned *assignment = ALLOCAN(unsigned, n_regs);
1236         int res = hungarian_solve(bp, assignment, NULL, 0);
1237         assert(res == 0);
1238
1239         hungarian_free(bp);
1240
1241         permute_values(live_nodes, node, assignment);
1242 }
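
For reference, here is a toy run of the same matching interface with three
registers. The calls mirror the ones above; the costs and the forbidden
pair are invented (the value sitting in register 2 may not stay there):

#include <assert.h>
#include "hungarian.h"

static void toy_matching(void)
{
	unsigned             assignment[3];
	hungarian_problem_t *bp = hungarian_new(3, 3, HUNGARIAN_MATCH_PERFECT);

	/* rows are destinations, columns are sources; staying put pays 9,
	 * moving pays 8 */
	for (unsigned l = 0; l < 3; ++l) {
		for (unsigned r = 0; r < 3; ++r)
			hungarian_add(bp, r, l, l == r ? 9 : 8);
	}
	/* forbid keeping the value of register 2 in register 2 */
	hungarian_remove(bp, 2, 2);

	hungarian_prepare_cost_matrix(bp, HUNGARIAN_MODE_MAXIMIZE_UTIL);
	int res = hungarian_solve(bp, assignment, NULL, 0);
	assert(res == 0);
	hungarian_free(bp);

	/* assignment[dest] == source, ready for permute_values() */
}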
1243
1244 /** test whether @p value is a copy of (or identical to) @p test_value */
1245 static bool is_copy_of(ir_node *value, ir_node *test_value)
1246 {
1247         if (value == test_value)
1248                 return true;
1249
1250         allocation_info_t *info      = get_allocation_info(value);
1251         allocation_info_t *test_info = get_allocation_info(test_value);
1252         return test_info->original_value == info->original_value;
1253 }
1254
1255 /**
1256  * find a value in the end-assignment of a basic block
1257  * @returns the index into the assignment array if found
1258  *          -1 if not found
1259  */
1260 static int find_value_in_block_info(block_info_t *info, ir_node *value)
1261 {
1262         ir_node  **end_assignments = info->assignments;
1263         for (unsigned r = 0; r < n_regs; ++r) {
1264                 ir_node *a_value = end_assignments[r];
1265
1266                 if (a_value == NULL)
1267                         continue;
1268                 if (is_copy_of(a_value, value))
1269                         return (int) r;
1270         }
1271
1272         return -1;
1273 }
1274
1275 /**
1276  * Create the necessary permutations at the end of a basic block to fulfill
1277  * the register assignment for phi-nodes in the next block
1278  */
1279 static void add_phi_permutations(ir_node *block, int p)
1280 {
1281         ir_node      *pred      = get_Block_cfgpred_block(block, p);
1282         block_info_t *pred_info = get_block_info(pred);
1283
1284         /* predecessor not processed yet? nothing to do */
1285         if (!pred_info->processed)
1286                 return;
1287
1288         unsigned *permutation = ALLOCAN(unsigned, n_regs);
1289         for (unsigned r = 0; r < n_regs; ++r) {
1290                 permutation[r] = r;
1291         }
1292
1293         /* check phi nodes */
1294         bool     need_permutation = false;
1295         ir_node *phi              = sched_first(block);
1296         for ( ; is_Phi(phi); phi = sched_next(phi)) {
1297                 if (!arch_irn_consider_in_reg_alloc(cls, phi))
1298                         continue;
1299
1300                 ir_node *phi_pred = get_Phi_pred(phi, p);
1301                 int      a        = find_value_in_block_info(pred_info, phi_pred);
1302                 assert(a >= 0);
1303
1304                 const arch_register_t *reg  = arch_get_irn_register(phi);
1305                 int                    regn = reg->index;
1306                 /* same register? nothing to do */
1307                 if (regn == a)
1308                         continue;
1309
1310                 ir_node               *op     = pred_info->assignments[a];
1311                 const arch_register_t *op_reg = arch_get_irn_register(op);
1312                 /* Virtual registers are ok, too. */
1313                 if (op_reg->type & arch_register_type_virtual)
1314                         continue;
1315
1316                 permutation[regn] = a;
1317                 need_permutation  = true;
1318         }
1319
1320         if (need_permutation) {
1321                 /* permute values at end of predecessor */
1322                 ir_node **old_assignments = assignments;
1323                 assignments     = pred_info->assignments;
1324                 permute_values(NULL, be_get_end_of_block_insertion_point(pred),
1325                                permutation);
1326                 assignments = old_assignments;
1327         }
1328
1329         /* change phi nodes to use the copied values */
1330         phi = sched_first(block);
1331         for ( ; is_Phi(phi); phi = sched_next(phi)) {
1332                 if (!arch_irn_consider_in_reg_alloc(cls, phi))
1333                         continue;
1334
1335                 /* we have permuted all values into the correct registers so we can
1336                    simply query which value occupies the phi's register in the
1337                    predecessor */
1338                 int      a  = arch_get_irn_register(phi)->index;
1339                 ir_node *op = pred_info->assignments[a];
1340                 set_Phi_pred(phi, p, op);
1341         }
1342 }
1343
1344 /**
1345  * Set preferences for a phi's register based on the registers used for
1346  * the phi inputs.
1347  */
1348 static void adapt_phi_prefs(ir_node *phi)
1349 {
1350         ir_node           *block = get_nodes_block(phi);
1351         allocation_info_t *info  = get_allocation_info(phi);
1352
1353         int arity = get_irn_arity(phi);
1354         for (int i = 0; i < arity; ++i) {
1355                 ir_node               *op  = get_irn_n(phi, i);
1356                 const arch_register_t *reg = arch_get_irn_register(op);
1357
1358                 if (reg == NULL)
1359                         continue;
1360                 /* we only give the bonus if the predecessor already has registers
1361                  * assigned; otherwise we would only see a dummy value and any
1362                  * conclusions about its register would be useless */
1363                 ir_node      *pred_block      = get_Block_cfgpred_block(block, i);
1364                 block_info_t *pred_block_info = get_block_info(pred_block);
1365                 if (!pred_block_info->processed)
1366                         continue;
1367
1368                 /* give bonus for already assigned register */
1369                 float weight = (float)get_block_execfreq(pred_block);
1370                 info->prefs[reg->index] += weight * AFF_PHI;
1371         }
1372 }
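
/* A small worked example (hypothetical numbers): with AFF_PHI == 1.0, a phi
 * whose two processed predecessors have execution frequencies 4.0 and 1.0
 * and deliver their inputs in r1 and r3 gains prefs[r1] += 4.0 and
 * prefs[r3] += 1.0, so the matching in assign_phi_registers() favours r1. */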
1373
1374 /**
1375  * After a phi has been assigned a register, propagate that preference
1376  * to the phi inputs.
1377  */
1378 static void propagate_phi_register(ir_node *phi, unsigned assigned_r)
1379 {
1380         ir_node *block = get_nodes_block(phi);
1381
1382         int arity = get_irn_arity(phi);
1383         for (int i = 0; i < arity; ++i) {
1384                 ir_node           *op         = get_Phi_pred(phi, i);
1385                 allocation_info_t *info       = get_allocation_info(op);
1386                 ir_node           *pred_block = get_Block_cfgpred_block(block, i);
1387                 float              weight
1388                         = (float)get_block_execfreq(pred_block) * AFF_PHI;
1389
1390                 if (info->prefs[assigned_r] >= weight)
1391                         continue;
1392
1393                 /* promote the preferred register */
1394                 for (unsigned r = 0; r < n_regs; ++r) {
1395                         if (info->prefs[r] > -weight) {
1396                                 info->prefs[r] = -weight;
1397                         }
1398                 }
1399                 info->prefs[assigned_r] = weight;
1400
1401                 if (is_Phi(op))
1402                         propagate_phi_register(op, assigned_r);
1403         }
1404 }
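
/* Note that the early continue above also terminates the recursion through
 * phi cycles: once an input's preference for assigned_r has been raised to
 * `weight`, revisiting it via another phi edge of the same or lower weight
 * bails out immediately. */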
1405
1406 static void assign_phi_registers(ir_node *block)
1407 {
1408         /* count phi nodes */
1409         int n_phis = 0;
1410         sched_foreach(block, node) {
1411                 if (!is_Phi(node))
1412                         break;
1413                 if (!arch_irn_consider_in_reg_alloc(cls, node))
1414                         continue;
1415                 ++n_phis;
1416         }
1417
1418         if (n_phis == 0)
1419                 return;
1420
1421         /* build a bipartite matching problem for all phi nodes */
1422         hungarian_problem_t *bp
1423                 = hungarian_new(n_phis, n_regs, HUNGARIAN_MATCH_PERFECT);
1424         int n = 0;
1425         sched_foreach(block, node) {
1426                 if (!is_Phi(node))
1427                         break;
1428                 if (!arch_irn_consider_in_reg_alloc(cls, node))
1429                         continue;
1430
1431                 /* give bonuses for predecessor colorings */
1432                 adapt_phi_prefs(node);
1433                 /* add the phi's preference costs to the bipartite matching problem */
1434                 allocation_info_t *info = get_allocation_info(node);
1435                 DB((dbg, LEVEL_3, "Prefs for %+F: ", node));
1436                 for (unsigned r = 0; r < n_regs; ++r) {
1437                         if (!rbitset_is_set(normal_regs, r))
1438                                 continue;
1439
1440                         float costs = info->prefs[r];
1441                         costs = costs < 0 ? -logf(-costs+1) : logf(costs+1);
1442                         costs *= 100;
1443                         costs += 10000;
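                        /* the signed log compresses the preference's dynamic
                         * range and the +10000 offset keeps every entry
                         * positive after the int cast (our reading of the
                         * code; presumably so the matching operates on
                         * nonnegative costs) */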
1444                         hungarian_add(bp, n, r, (int)costs);
1445                         DB((dbg, LEVEL_3, " %s(%f)", arch_register_for_index(cls, r)->name,
1446                                                 info->prefs[r]));
1447                 }
1448                 DB((dbg, LEVEL_3, "\n"));
1449                 ++n;
1450         }
1451
1452         //hungarian_print_cost_matrix(bp, 7);
1453         hungarian_prepare_cost_matrix(bp, HUNGARIAN_MODE_MAXIMIZE_UTIL);
1454
1455         unsigned *assignment = ALLOCAN(unsigned, n_regs);
1456         int       res        = hungarian_solve(bp, assignment, NULL, 0);
1457         assert(res == 0);
1458
1459         /* apply results */
1460         n = 0;
1461         sched_foreach(block, node) {
1462                 if (!is_Phi(node))
1463                         break;
1464                 if (!arch_irn_consider_in_reg_alloc(cls, node))
1465                         continue;
1466                 const arch_register_req_t *req
1467                         = arch_get_irn_register_req(node);
1468
1469                 unsigned r = assignment[n++];
1470                 assert(rbitset_is_set(normal_regs, r));
1471                 const arch_register_t *reg = arch_register_for_index(cls, r);
1472                 DB((dbg, LEVEL_2, "Assign %+F -> %s\n", node, reg->name));
1473                 use_reg(node, reg, req->width);
1474
1475                 /* adapt preferences for phi inputs */
1476                 propagate_phi_register(node, r);
1477         }
1478 }
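
/* Sketch of the matching built above (hypothetical 2-phi/3-register case):
 * rows are phis, columns registers, entries the transformed costs; a perfect
 * matching maximizing utility then hands each phi a distinct register:
 *
 *             r0      r1      r2
 *   phi0   ~10110  ~10000  ~10069
 *   phi1   ~10000  ~10110  ~10000
 *
 * -> phi0 receives r0, phi1 receives r1. */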
1479
1480 static arch_register_req_t *allocate_reg_req(ir_graph *irg)
1481 {
1482         struct obstack *obst = be_get_be_obst(irg);
1483         arch_register_req_t *req = OALLOCZ(obst, arch_register_req_t);
1484         return req;
1485 }
1486
1487 /**
1488  * Walker: assign registers to all nodes of a block that
1489  * need registers from the currently considered register class.
1490  */
1491 static void allocate_coalesce_block(ir_node *block, void *data)
1492 {
1493         (void) data;
1494         DB((dbg, LEVEL_2, "* Block %+F\n", block));
1495
1496         /* clear assignments */
1497         block_info_t *block_info  = get_block_info(block);
1498         assignments = block_info->assignments;
1499
1500         ir_nodeset_t live_nodes;
1501         ir_nodeset_init(&live_nodes);
1502
1503         /* gather regalloc infos of predecessor blocks */
1504         int            n_preds          = get_Block_n_cfgpreds(block);
1505         block_info_t **pred_block_infos = ALLOCAN(block_info_t*, n_preds);
1506         for (int i = 0; i < n_preds; ++i) {
1507                 ir_node      *pred      = get_Block_cfgpred_block(block, i);
1508                 block_info_t *pred_info = get_block_info(pred);
1509                 pred_block_infos[i]     = pred_info;
1510         }
1511
1512         ir_node **phi_ins = ALLOCAN(ir_node*, n_preds);
1513
1514         /* collect live-in nodes and preassigned values */
1515         be_lv_foreach(lv, block, be_lv_state_in, node) {
1516                 const arch_register_req_t *req = arch_get_irn_register_req(node);
1517                 if (req->cls != cls)
1518                         continue;
1519
1520                 if (arch_register_req_is(req, ignore)) {
1521                         allocation_info_t *info = get_allocation_info(node);
1522                         info->current_value = node;
1523
1524                         const arch_register_t *reg = arch_get_irn_register(node);
1525                         assert(reg != NULL); /* ignore values must be preassigned */
1526                         use_reg(node, reg, req->width);
1527                         continue;
1528                 }
1529
1530                 /* check all predecessors for this value; if it is not the same
1531                    everywhere (or still unknown) then we have to construct a phi
1532                    (we collect the potential phi inputs here) */
1533                 bool need_phi = false;
1534                 for (int p = 0; p < n_preds; ++p) {
1535                         block_info_t *pred_info = pred_block_infos[p];
1536
1537                         if (!pred_info->processed) {
1538                                 /* use node for now; it will be fixed up later */
1539                                 phi_ins[p] = node;
1540                                 need_phi   = true;
1541                         } else {
1542                                 int a = find_value_in_block_info(pred_info, node);
1543
1544                                 /* must live out of predecessor */
1545                                 assert(a >= 0);
1546                                 phi_ins[p] = pred_info->assignments[a];
1547                                 /* different value from last time? then we need a phi */
1548                                 if (p > 0 && phi_ins[p-1] != phi_ins[p]) {
1549                                         need_phi = true;
1550                                 }
1551                         }
1552                 }
1553
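                /* example (hypothetical values): with two processed
                 * predecessors whose end states hold different copies of the
                 * value, phi_ins becomes { copy1, copy2 }; the mismatch sets
                 * need_phi, so a phi is constructed below */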
1554                 if (need_phi) {
1555                         ir_mode *mode = get_irn_mode(node);
1556                         const arch_register_req_t *phi_req = cls->class_req;
1557                         if (req->width > 1) {
1558                                 arch_register_req_t *new_req = allocate_reg_req(irg);
1559                                 new_req->cls   = cls;
1560                                 new_req->type  = req->type & arch_register_req_type_aligned;
1561                                 new_req->width = req->width;
1562                                 phi_req = new_req;
1563                         }
1564                         ir_node *phi  = be_new_Phi(block, n_preds, phi_ins, mode,
1565                                                    phi_req);
1566
1567                         DB((dbg, LEVEL_3, "Create Phi %+F (for %+F) -", phi, node));
1568 #ifdef DEBUG_libfirm
1569                         for (int pi = 0; pi < n_preds; ++pi) {
1570                                 DB((dbg, LEVEL_3, " %+F", phi_ins[pi]));
1571                         }
1572                         DB((dbg, LEVEL_3, "\n"));
1573 #endif
1574                         mark_as_copy_of(phi, node);
1575                         sched_add_after(block, phi);
1576
1577                         node = phi;
1578                 } else {
1579                         allocation_info_t *info = get_allocation_info(node);
1580                         info->current_value = phi_ins[0];
1581
1582                         /* Grab one of the inputs we constructed (this might not be the
1583                          * same as "node", as we could see the same copy of the value in
1584                          * all predecessors) */
1585                         node = phi_ins[0];
1586                 }
1587
1588                 /* if the node already has a register assigned use it */
1589                 const arch_register_t *reg = arch_get_irn_register(node);
1590                 if (reg != NULL) {
1591                         use_reg(node, reg, req->width);
1592                 }
1593
1594                 /* remember that this node is live at the beginning of the block */
1595                 ir_nodeset_insert(&live_nodes, node);
1596         }
1597
1598         /** Collects registers which must not be used for optimistic splits. */
1599         unsigned *const forbidden_regs = rbitset_alloca(n_regs);
1600
1601         /* handle phis... */
1602         assign_phi_registers(block);
1603
1604         /* all live-ins must have a register */
1605 #ifndef NDEBUG
1606         foreach_ir_nodeset(&live_nodes, node, iter) {
1607                 const arch_register_t *reg = arch_get_irn_register(node);
1608                 assert(reg != NULL);
1609         }
1610 #endif
1611
1612         /* assign instructions in the block */
1613         sched_foreach(block, node) {
1614                 /* phis are already assigned */
1615                 if (is_Phi(node))
1616                         continue;
1617
1618                 rewire_inputs(node);
1619
1620                 /* enforce use constraints */
1621                 rbitset_clear_all(forbidden_regs, n_regs);
1622                 enforce_constraints(&live_nodes, node, forbidden_regs);
1623
1624                 rewire_inputs(node);
1625
1626                 /* registers occupied by the inputs must not be used for optimistic splits */
1627                 be_foreach_use(node, cls, in_req, op, op_req,
1628                         const arch_register_t *reg = arch_get_irn_register(op);
1629                         rbitset_set(forbidden_regs, reg->index);
1630                 );
1631
1632                 /* free registers of values last used at this instruction */
1633                 free_last_uses(&live_nodes, node);
1634
1635                 /* assign output registers */
1636                 be_foreach_definition_(node, cls, value, req,
1637                         assign_reg(block, value, req, forbidden_regs);
1638                 );
1639         }
1640
1641         ir_nodeset_destroy(&live_nodes);
1642         assignments = NULL;
1643
1644         block_info->processed = true;
1645
1646         /* permute values at end of predecessor blocks in case of phi-nodes */
1647         if (n_preds > 1) {
1648                 for (int p = 0; p < n_preds; ++p) {
1649                         add_phi_permutations(block, p);
1650                 }
1651         }
1652
1653         /* if we have exactly 1 successor then we might be able to produce phi
1654            copies now */
1655         if (get_irn_n_edges_kind(block, EDGE_KIND_BLOCK) == 1) {
1656                 const ir_edge_t *edge
1657                         = get_irn_out_edge_first_kind(block, EDGE_KIND_BLOCK);
1658                 ir_node      *succ      = get_edge_src_irn(edge);
1659                 int           p         = get_edge_src_pos(edge);
1660                 block_info_t *succ_info = get_block_info(succ);
1661
1662                 if (succ_info->processed) {
1663                         add_phi_permutations(succ, p);
1664                 }
1665         }
1666 }
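
/* Summary of allocate_coalesce_block(): seed the assignment state from the
 * live-ins (constructing phis where predecessors disagree), match this
 * block's phis to registers, walk the schedule enforcing constraints,
 * freeing dying values and assigning definitions, and finally patch up phi
 * permutations on the already-processed control-flow edges. */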
1667
1668 typedef struct block_costs_t block_costs_t;
1669 struct block_costs_t {
1670         float costs;   /**< costs of the block */
1671         int   dfs_num; /**< depth first search number (to detect backedges) */
1672 };
1673
1674 static int cmp_block_costs(const void *d1, const void *d2)
1675 {
1676         const ir_node       * const *block1 = (const ir_node**)d1;
1677         const ir_node       * const *block2 = (const ir_node**)d2;
1678         const block_costs_t *info1  = (const block_costs_t*)get_irn_link(*block1);
1679         const block_costs_t *info2  = (const block_costs_t*)get_irn_link(*block2);
1680         return QSORT_CMP(info2->costs, info1->costs);
1681 }
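
/* Note: passing info2 before info1 to QSORT_CMP makes qsort() order the
 * blocks by descending costs, so the most expensive blocks come first in
 * determine_block_order(). */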
1682
1683 static void determine_block_order(void)
1684 {
1685         ir_node **blocklist = be_get_cfgpostorder(irg);
1686         size_t    n_blocks  = ARR_LEN(blocklist);
1687         int       dfs_num   = 0;
1688         pdeq     *worklist  = new_pdeq();
1689         ir_node **order     = XMALLOCN(ir_node*, n_blocks);
1690         size_t    order_p   = 0;
1691
1692         /* clear block links... */
1693         for (size_t p = 0; p < n_blocks; ++p) {
1694                 ir_node *block = blocklist[p];
1695                 set_irn_link(block, NULL);
1696         }
1697
1698         /* walk blocks in reverse postorder; the costs for each block are the
1699          * sum of the costs of its predecessors (excluding the costs on
1700          * backedges, which we can't determine) */
1701         for (size_t p = n_blocks; p > 0;) {
1702                 block_costs_t *cost_info;
1703                 ir_node *block = blocklist[--p];
1704
1705                 float execfreq   = (float)get_block_execfreq(block);
1706                 float costs      = execfreq;
1707                 int   n_cfgpreds = get_Block_n_cfgpreds(block);
1708                 for (int p2 = 0; p2 < n_cfgpreds; ++p2) {
1709                         ir_node       *pred_block = get_Block_cfgpred_block(block, p2);
1710                         block_costs_t *pred_costs = (block_costs_t*)get_irn_link(pred_block);
1711                         /* we don't have any info for backedges */
1712                         if (pred_costs == NULL)
1713                                 continue;
1714                         costs += pred_costs->costs;
1715                 }
1716
1717                 cost_info          = OALLOCZ(&obst, block_costs_t);
1718                 cost_info->costs   = costs;
1719                 cost_info->dfs_num = dfs_num++;
1720                 set_irn_link(block, cost_info);
1721         }
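
        /* hypothetical example: in a diamond CFG A -> {B, C} -> D with
         * execution frequencies A=1, B=0.5, C=0.5, D=1, the accumulated costs
         * are A=1, B=1.5, C=1.5 and D=1+1.5+1.5=4, so D sorts first below */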
1722
1723         /* sort array by block costs */
1724         qsort(blocklist, n_blocks, sizeof(blocklist[0]), cmp_block_costs);
1725
1726         ir_reserve_resources(irg, IR_RESOURCE_BLOCK_VISITED);
1727         inc_irg_block_visited(irg);
1728
1729         for (size_t p = 0; p < n_blocks; ++p) {
1730                 ir_node *block = blocklist[p];
1731                 if (Block_block_visited(block))
1732                         continue;
1733
1734                 /* repeatedly add the predecessor with the highest costs to the
1735                  * worklist (without following backedges) */
1736                 do {
1737                         block_costs_t *info       = (block_costs_t*)get_irn_link(block);
1738                         ir_node       *best_pred  = NULL;
1739                         float          best_costs = -1;
1740                         int            n_cfgpred  = get_Block_n_cfgpreds(block);
1741
1742                         pdeq_putr(worklist, block);
1743                         mark_Block_block_visited(block);
1744                         for (int i = 0; i < n_cfgpred; ++i) {
1745                                 ir_node       *pred_block = get_Block_cfgpred_block(block, i);
1746                                 block_costs_t *pred_info  = (block_costs_t*)get_irn_link(pred_block);
1747
1748                                 /* ignore backedges */
1749                                 if (pred_info->dfs_num > info->dfs_num)
1750                                         continue;
1751
1752                                 if (pred_info->costs > best_costs) {
1753                                         best_costs = pred_info->costs;
1754                                         best_pred  = pred_block;
1755                                 }
1756                         }
1757                         block = best_pred;
1758                 } while (block != NULL && !Block_block_visited(block));
1759
1760                 /* now put all nodes in the worklist in our final order */
1761                 while (!pdeq_empty(worklist)) {
1762                         ir_node *pblock = (ir_node*)pdeq_getr(worklist);
1763                         assert(order_p < n_blocks);
1764                         order[order_p++] = pblock;
1765                 }
1766         }
1767         assert(order_p == n_blocks);
1768         del_pdeq(worklist);
1769
1770         ir_free_resources(irg, IR_RESOURCE_BLOCK_VISITED);
1771
1772         DEL_ARR_F(blocklist);
1773
1774         obstack_free(&obst, NULL);
1775         obstack_init(&obst);
1776
1777         block_order   = order;
1778         n_block_order = n_blocks;
1779 }
1780
1781 static void free_block_order(void)
1782 {
1783         xfree(block_order);
1784 }
1785
1786 /**
1787  * Run the register allocator for the current register class.
1788  */
1789 static void be_pref_alloc_cls(void)
1790 {
1791         be_assure_live_sets(irg);
1792         lv = be_get_irg_liveness(irg);
1793
1794         ir_reserve_resources(irg, IR_RESOURCE_IRN_LINK);
1795
1796         DB((dbg, LEVEL_2, "=== Allocating registers of %s ===\n", cls->name));
1797
1798         be_clear_links(irg);
1799
1800         irg_block_walk_graph(irg, NULL, analyze_block, NULL);
1801         combine_congruence_classes();
1802
1803         for (size_t i = 0; i < n_block_order; ++i) {
1804                 ir_node *block = block_order[i];
1805                 allocate_coalesce_block(block, NULL);
1806         }
1807
1808         ir_free_resources(irg, IR_RESOURCE_IRN_LINK);
1809 }
1810
1811 static void dump(int mask, ir_graph *irg, const char *suffix)
1812 {
1813         if (be_options.dump_flags & mask)
1814                 dump_ir_graph(irg, suffix);
1815 }
1816
1817 /**
1818  * Run the spiller on the current graph.
1819  */
1820 static void spill(void)
1821 {
1822         /* make sure all nodes show their real register pressure */
1823         be_timer_push(T_RA_CONSTR);
1824         be_pre_spill_prepare_constr(irg, cls);
1825         be_timer_pop(T_RA_CONSTR);
1826
1827         dump(DUMP_RA, irg, "spillprepare");
1828
1829         /* spill */
1830         be_timer_push(T_RA_SPILL);
1831         be_do_spill(irg, cls);
1832         be_timer_pop(T_RA_SPILL);
1833
1834         be_timer_push(T_RA_SPILL_APPLY);
1835         check_for_memory_operands(irg);
1836         be_timer_pop(T_RA_SPILL_APPLY);
1837
1838         dump(DUMP_RA, irg, "spill");
1839 }
1840
1841 /**
1842  * The pref register allocator for a whole procedure.
1843  */
1844 static void be_pref_alloc(ir_graph *new_irg)
1845 {
1846         obstack_init(&obst);
1847
1848         irg = new_irg;
1849
1850         /* determine a good coloring order */
1851         determine_block_order();
1852
1853         const arch_env_t *arch_env = be_get_irg_arch_env(new_irg);
1854         int               n_cls    = arch_env->n_register_classes;
1855         for (int c = 0; c < n_cls; ++c) {
1856                 cls = &arch_env->register_classes[c];
1857                 if (arch_register_class_flags(cls) & arch_register_class_flag_manual_ra)
1858                         continue;
1859
1860                 stat_ev_ctx_push_str("regcls", cls->name);
1861
1862                 n_regs      = arch_register_class_n_regs(cls);
1863                 normal_regs = rbitset_malloc(n_regs);
1864                 be_set_allocatable_regs(irg, cls, normal_regs);
1865
1866                 spill();
1867
1868                 /* verify schedule and register pressure */
1869                 be_timer_push(T_VERIFY);
1870                 if (be_options.verify_option == BE_VERIFY_WARN) {
1871                         be_verify_schedule(irg);
1872                         be_verify_register_pressure(irg, cls);
1873                 } else if (be_options.verify_option == BE_VERIFY_ASSERT) {
1874                         assert(be_verify_schedule(irg) && "Schedule verification failed");
1875                         assert(be_verify_register_pressure(irg, cls)
1876                                 && "Register pressure verification failed");
1877                 }
1878                 be_timer_pop(T_VERIFY);
1879
1880                 be_timer_push(T_RA_COLOR);
1881                 be_pref_alloc_cls();
1882                 be_timer_pop(T_RA_COLOR);
1883
1884                 /* we most probably constructed new Phis, so liveness info is
1885                  * invalid now */
1886                 be_invalidate_live_sets(irg);
1887                 free(normal_regs);
1888
1889                 stat_ev_ctx_pop("regcls");
1890         }
1891
1892         free_block_order();
1893
1894         be_timer_push(T_RA_SPILL_APPLY);
1895         be_abi_fix_stack_nodes(irg);
1896         be_timer_pop(T_RA_SPILL_APPLY);
1897
1898         be_timer_push(T_VERIFY);
1899         if (be_options.verify_option == BE_VERIFY_WARN) {
1900                 be_verify_register_allocation(irg);
1901         } else if (be_options.verify_option == BE_VERIFY_ASSERT) {
1902                 assert(be_verify_register_allocation(irg)
1903                        && "Register allocation invalid");
1904         }
1905         be_timer_pop(T_VERIFY);
1906
1907         obstack_free(&obst, NULL);
1908 }
1909
1910 BE_REGISTER_MODULE_CONSTRUCTOR(be_init_pref_alloc)
1911 void be_init_pref_alloc(void)
1912 {
1913         static be_ra_t be_ra_pref = { be_pref_alloc };
1914         be_register_allocator("pref", &be_ra_pref);
1915         FIRM_DBG_REGISTER(dbg, "firm.be.prefalloc");
1916 }