beifg: Factorise code to count interference components.
ir/be/beprefalloc.c
1 /*
2  * This file is part of libFirm.
3  * Copyright (C) 2012 University of Karlsruhe.
4  */
5
6 /**
7  * @file
8  * @brief       Preference Guided Register Assignment
9  * @author      Matthias Braun
10  * @date        14.2.2009
11  *
12  * The idea is to allocate registers in 2 passes:
13  * 1. A first pass to determine "preferred" registers for live-ranges. For
14  *    each live-range and each register this calculates a value indicating
15  *    how useful that register is. (Roughly, the value is the negative of
16  *    the copy costs incurred when the value sits in that register...)
17  *
18  * 2. Walk the blocks and assign registers in a greedy fashion, preferring
19  *    registers with high preference values. When register constraints are
20  *    not met, add copies and split live-ranges.
21  *
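 *    For instance (an illustrative sketch): if keeping a value in register
 *    r0 would force a copy with cost w at one of its uses, pass 1 lowers
 *    that live-range's preference for r0 by w; pass 2 then greedily picks
 *    the register with the highest remaining preference.
 *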
22  * TODO:
23  *  - make use of free registers in the permute_values code
24  */
25 #include "config.h"
26
27 #include <float.h>
28 #include <stdbool.h>
29 #include <math.h>
30 #include "lpp.h"
31
32 #include "error.h"
33 #include "execfreq.h"
34 #include "ircons.h"
35 #include "irdom.h"
36 #include "iredges_t.h"
37 #include "irgraph_t.h"
38 #include "irgwalk.h"
39 #include "irnode_t.h"
40 #include "irprintf.h"
41 #include "irdump.h"
42 #include "irtools.h"
43 #include "util.h"
44 #include "obst.h"
45 #include "raw_bitset.h"
46 #include "unionfind.h"
47 #include "pdeq.h"
48 #include "hungarian.h"
49 #include "statev.h"
50 #include "beabi.h"
51 #include "bechordal_t.h"
52 #include "be.h"
53 #include "beirg.h"
54 #include "belive_t.h"
55 #include "bemodule.h"
56 #include "benode.h"
57 #include "bera.h"
58 #include "besched.h"
59 #include "bespill.h"
60 #include "bespillutil.h"
61 #include "beverify.h"
62 #include "beutil.h"
63 #include "bestack.h"
64
65 #define USE_FACTOR                     1.0f
66 #define DEF_FACTOR                     1.0f
67 #define NEIGHBOR_FACTOR                0.2f
68 #define AFF_SHOULD_BE_SAME             0.5f
69 #define AFF_PHI                        1.0f
70 #define SPLIT_DELTA                    1.0f
71 #define MAX_OPTIMISTIC_SPLIT_RECURSION 0
72
73 DEBUG_ONLY(static firm_dbg_module_t *dbg = NULL;)
74
75 static struct obstack               obst;
76 static ir_graph                    *irg;
77 static const arch_register_class_t *cls;
78 static be_lv_t                     *lv;
79 static unsigned                     n_regs;
80 static unsigned                    *normal_regs;
81 static int                         *congruence_classes;
82 static ir_node                    **block_order;
83 static size_t                       n_block_order;
84
85 /** currently active assignments (while processing a basic block)
86  * maps registers to values (their current copies) */
87 static ir_node **assignments;
88
89 /**
90  * allocation information: last_uses, register preferences
91  * the information is per firm-node.
92  */
93 struct allocation_info_t {
94         unsigned  last_uses[2];   /**< bitset indicating last uses (input pos) */
95         ir_node  *current_value;  /**< copy of the value that should be used */
96         ir_node  *original_value; /**< for copies, points to the original value */
97         float     prefs[];        /**< register preferences */
98 };
99 typedef struct allocation_info_t allocation_info_t;
100
101 /** helper datastructure used when sorting register preferences */
102 struct reg_pref_t {
103         unsigned num;
104         float    pref;
105 };
106 typedef struct reg_pref_t reg_pref_t;
107
108 /** per basic-block information */
109 struct block_info_t {
110         bool     processed;       /**< indicate whether block is processed */
111         ir_node *assignments[];   /**< register assignments at end of block */
112 };
113 typedef struct block_info_t block_info_t;
114
115 /**
116  * Get the allocation info for a node.
117  * The info is allocated on the first visit of a node.
118  */
119 static allocation_info_t *get_allocation_info(ir_node *node)
120 {
121         allocation_info_t *info = (allocation_info_t*)get_irn_link(node);
122         if (info == NULL) {
123                 info = OALLOCFZ(&obst, allocation_info_t, prefs, n_regs);
124                 info->current_value  = node;
125                 info->original_value = node;
126                 set_irn_link(node, info);
127         }
128
129         return info;
130 }
131
132 static allocation_info_t *try_get_allocation_info(const ir_node *node)
133 {
134         return (allocation_info_t*) get_irn_link(node);
135 }
136
137 /**
138  * Get allocation information for a basic block
139  */
140 static block_info_t *get_block_info(ir_node *block)
141 {
142         block_info_t *info = (block_info_t*)get_irn_link(block);
143
144         assert(is_Block(block));
145         if (info == NULL) {
146                 info = OALLOCFZ(&obst, block_info_t, assignments, n_regs);
147                 set_irn_link(block, info);
148         }
149
150         return info;
151 }
152
153 /**
154  * Link the allocation info of a node to a copy.
155  * Afterwards, both nodes use the same allocation info.
156  * The copy must not have allocation info assigned yet.
157  *
158  * @param copy   the node that gets the allocation info assigned
159  * @param value  the original node
160  */
161 static void mark_as_copy_of(ir_node *copy, ir_node *value)
162 {
163         allocation_info_t *info      = get_allocation_info(value);
164         allocation_info_t *copy_info = get_allocation_info(copy);
165
166         /* find original value */
167         ir_node *original = info->original_value;
168         if (original != value) {
169                 info = get_allocation_info(original);
170         }
171
172         assert(info->original_value == original);
173         info->current_value = copy;
174
175         /* the copy should not be linked to something else yet */
176         assert(copy_info->original_value == copy);
177         copy_info->original_value = original;
178
179         /* copy over allocation preferences */
180         memcpy(copy_info->prefs, info->prefs, n_regs * sizeof(copy_info->prefs[0]));
181 }
182
183 /**
184  * Calculate the penalties for every register on a node and its live neighbors.
185  *
186  * @param live_nodes  the set of live nodes at the current position, may be NULL
187  * @param penalty     the penalty to subtract from the preferences
188  * @param limited     a raw bitset containing the limited set for the node
189  * @param node        the node
190  */
191 static void give_penalties_for_limits(const ir_nodeset_t *live_nodes,
192                                       float penalty, const unsigned* limited,
193                                       ir_node *node)
194 {
195         allocation_info_t *info = get_allocation_info(node);
196
197         /* give penalty for all forbidden regs */
198         for (unsigned r = 0; r < n_regs; ++r) {
199                 if (rbitset_is_set(limited, r))
200                         continue;
201
202                 info->prefs[r] -= penalty;
203         }
204
205         /* all other live values should get a penalty for allowed regs */
206         if (live_nodes == NULL)
207                 return;
208
209         penalty   *= NEIGHBOR_FACTOR;
210         size_t n_allowed = rbitset_popcount(limited, n_regs);
211         if (n_allowed > 1) {
212                 /* only create a very weak penalty if multiple regs are allowed */
213                 penalty = (penalty * 0.8f) / n_allowed;
214         }
215         foreach_ir_nodeset(live_nodes, neighbor, iter) {
216                 allocation_info_t *neighbor_info;
217
218                 /* TODO: if op is used on multiple inputs we might not do a
219                  * continue here */
220                 if (neighbor == node)
221                         continue;
222
223                 neighbor_info = get_allocation_info(neighbor);
224                 for (unsigned r = 0; r < n_regs; ++r) {
225                         if (!rbitset_is_set(limited, r))
226                                 continue;
227
228                         neighbor_info->prefs[r] -= penalty;
229                 }
230         }
231 }
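
/* A worked example with illustrative numbers: with n_regs = 4 and a node
 * limited to {r2} with penalty 1.0, the node itself loses 1.0 on r0, r1
 * and r3. Each live neighbor is discouraged from r2 by the scaled penalty
 * 1.0 * NEIGHBOR_FACTOR = 0.2. If the limited set were {r2, r3} instead,
 * the neighbor penalty would weaken to (0.2 * 0.8f) / 2 = 0.08 on each of
 * r2 and r3. */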
232
233 /**
234  * Calculate the preferences of a definition for the current register class.
235  * If the definition uses a limited set of registers, penalize the other
236  * registers on the node and the limited registers on its live neighbors.
237  *
238  * @param live_nodes  the set of live nodes at the current node
239  * @param weight      the weight (the block's execution frequency)
240  * @param node        the current node
241  */
242 static void check_defs(ir_nodeset_t const *const live_nodes, float const weight, ir_node *const node, arch_register_req_t const *const req)
243 {
244         if (arch_register_req_is(req, limited)) {
245                 const unsigned *limited = req->limited;
246                 float           penalty = weight * DEF_FACTOR;
247                 give_penalties_for_limits(live_nodes, penalty, limited, node);
248         }
249
250         if (arch_register_req_is(req, should_be_same)) {
251                 ir_node           *insn  = skip_Proj(node);
252                 allocation_info_t *info  = get_allocation_info(node);
253                 int                arity = get_irn_arity(insn);
254
255                 float factor = 1.0f / rbitset_popcount(&req->other_same, arity);
256                 for (int i = 0; i < arity; ++i) {
257                         if (!rbitset_is_set(&req->other_same, i))
258                                 continue;
259
260                         ir_node *op = get_irn_n(insn, i);
261
262                         /* if the value at the should_be_same input doesn't die at
263                          * the node, there is no use in propagating the constraints
264                          * (since a copy will emerge anyway) */
265                         if (ir_nodeset_contains(live_nodes, op))
266                                 continue;
267
268                         allocation_info_t *op_info = get_allocation_info(op);
269                         for (unsigned r = 0; r < n_regs; ++r) {
270                                 op_info->prefs[r] += info->prefs[r] * factor;
271                         }
272                 }
273         }
274 }
275
276 /**
277  * Walker: Runs on a block and calculates the preferences for every
278  * node and every register from the considered register class.
279  */
280 static void analyze_block(ir_node *block, void *data)
281 {
282         float        weight = (float)get_block_execfreq(block);
283         ir_nodeset_t live_nodes;
284         (void) data;
285
286         ir_nodeset_init(&live_nodes);
287         be_liveness_end_of_block(lv, cls, block, &live_nodes);
288
289         sched_foreach_reverse(block, node) {
290                 if (is_Phi(node))
291                         break;
292
293                 be_foreach_definition(node, cls, value, req,
294                         check_defs(&live_nodes, weight, value, req);
295                 );
296
297                 /* mark last uses */
298                 int arity = get_irn_arity(node);
299
300                 /* the allocation info currently uses a fixed-size bitset to
301                    mark the last-used inputs, so we fail for nodes with more
302                    inputs than bits in last_uses */
303                 allocation_info_t *info = get_allocation_info(node);
304                 if (arity >= (int) sizeof(info->last_uses) * 8) {
305                         panic("Node with more than %d inputs not supported yet",
306                                         (int) sizeof(info->last_uses) * 8);
307                 }
308
309                 for (int i = 0; i < arity; ++i) {
310                         ir_node                   *op  = get_irn_n(node, i);
311                         const arch_register_req_t *req = arch_get_irn_register_req(op);
312                         if (req->cls != cls)
313                                 continue;
314
315                         /* last usage of a value? */
316                         if (!ir_nodeset_contains(&live_nodes, op)) {
317                                 rbitset_set(info->last_uses, i);
318                         }
319                 }
320
321                 be_liveness_transfer(cls, node, &live_nodes);
322
323                 /* update weights based on usage constraints */
324                 be_foreach_use(node, cls, req, op, op_req,
325                         if (!arch_register_req_is(req, limited))
326                                 continue;
327
328                         give_penalties_for_limits(&live_nodes, weight * USE_FACTOR, req->limited, op);
329                 );
330         }
331
332         ir_nodeset_destroy(&live_nodes);
333 }
334
335 static void congruence_def(ir_nodeset_t *const live_nodes, ir_node const *const node, arch_register_req_t const *const req)
336 {
337         /* should be same constraint? */
338         if (arch_register_req_is(req, should_be_same)) {
339                 const ir_node *insn     = skip_Proj_const(node);
340                 int            arity    = get_irn_arity(insn);
341                 unsigned       node_idx = get_irn_idx(node);
342                 node_idx = uf_find(congruence_classes, node_idx);
343
344                 for (int i = 0; i < arity; ++i) {
345                         if (!rbitset_is_set(&req->other_same, i))
346                                 continue;
347
348                         ir_node *op     = get_irn_n(insn, i);
349                         int      op_idx = get_irn_idx(op);
350                         op_idx = uf_find(congruence_classes, op_idx);
351
352                         /* do we interfere with the value */
353                         bool interferes = false;
354                         foreach_ir_nodeset(live_nodes, live, iter) {
355                                 int lv_idx = get_irn_idx(live);
356                                 lv_idx     = uf_find(congruence_classes, lv_idx);
357                                 if (lv_idx == op_idx) {
358                                         interferes = true;
359                                         break;
360                                 }
361                         }
362                         /* don't put in same affinity class if we interfere */
363                         if (interferes)
364                                 continue;
365
366                         uf_union(congruence_classes, node_idx, op_idx);
367                         DB((dbg, LEVEL_3, "Merge %+F and %+F congruence classes\n",
368                             node, op));
369                         /* one should_be_same is enough... */
370                         break;
371                 }
372         }
373 }
374
375 static void create_congruence_class(ir_node *block, void *data)
376 {
377         ir_nodeset_t live_nodes;
378
379         (void) data;
380         ir_nodeset_init(&live_nodes);
381         be_liveness_end_of_block(lv, cls, block, &live_nodes);
382
383         /* check should be same constraints */
384         sched_foreach_reverse(block, node) {
385                 if (is_Phi(node))
386                         break;
387
388                 be_foreach_definition(node, cls, value, req,
389                         congruence_def(&live_nodes, value, req);
390                 );
391                 be_liveness_transfer(cls, node, &live_nodes);
392         }
393
394         /* check phi congruence classes */
395         sched_foreach(block, phi) {
396                 if (!is_Phi(phi))
397                         break;
398
399                 if (!arch_irn_consider_in_reg_alloc(cls, phi))
400                         continue;
401
402                 int node_idx = get_irn_idx(phi);
403                 node_idx = uf_find(congruence_classes, node_idx);
404
405                 int arity = get_irn_arity(phi);
406                 for (int i = 0; i < arity; ++i) {
407                         ir_node *op     = get_Phi_pred(phi, i);
408                         int      op_idx = get_irn_idx(op);
409                         op_idx = uf_find(congruence_classes, op_idx);
410
411                         /* do we interfere with the value */
412                         bool interferes = false;
413                         foreach_ir_nodeset(&live_nodes, live, iter) {
414                                 int lv_idx = get_irn_idx(live);
415                                 lv_idx     = uf_find(congruence_classes, lv_idx);
416                                 if (lv_idx == op_idx) {
417                                         interferes = true;
418                                         break;
419                                 }
420                         }
421                         /* don't put in same affinity class if we interfere */
422                         if (interferes)
423                                 continue;
424                         /* does any other phi have the same input? */
425                         sched_foreach(block, phi) {
426                                 ir_node *oop;
427                                 int      oop_idx;
428                                 if (!is_Phi(phi))
429                                         break;
430                                 if (!arch_irn_consider_in_reg_alloc(cls, phi))
431                                         continue;
432                                 oop = get_Phi_pred(phi, i);
433                                 if (oop == op)
434                                         continue;
435                                 oop_idx = get_irn_idx(oop);
436                                 oop_idx = uf_find(congruence_classes, oop_idx);
437                                 if (oop_idx == op_idx) {
438                                         interferes = true;
439                                         break;
440                                 }
441                         }
442                         if (interferes)
443                                 continue;
444
445                         /* merge the 2 congruence classes and sum up their preferences */
446                         int old_node_idx = node_idx;
447                         node_idx = uf_union(congruence_classes, node_idx, op_idx);
448                         DB((dbg, LEVEL_3, "Merge %+F and %+F congruence classes\n",
449                             phi, op));
450
451                         old_node_idx = node_idx == old_node_idx ? op_idx : old_node_idx;
452                         allocation_info_t *head_info
453                                 = get_allocation_info(get_idx_irn(irg, node_idx));
454                         allocation_info_t *other_info
455                                 = get_allocation_info(get_idx_irn(irg, old_node_idx));
456                         for (unsigned r = 0; r < n_regs; ++r) {
457                                 head_info->prefs[r] += other_info->prefs[r];
458                         }
459                 }
460         }
461         ir_nodeset_destroy(&live_nodes);
462 }
463
464 static void set_congruence_prefs(ir_node *node, void *data)
465 {
466         (void) data;
467         unsigned node_idx = get_irn_idx(node);
468         unsigned node_set = uf_find(congruence_classes, node_idx);
469
470         /* head of congruence class or not in any class */
471         if (node_set == node_idx)
472                 return;
473
474         if (!arch_irn_consider_in_reg_alloc(cls, node))
475                 return;
476
477         ir_node *head = get_idx_irn(irg, node_set);
478         allocation_info_t *head_info = get_allocation_info(head);
479         allocation_info_t *info      = get_allocation_info(node);
480
481         memcpy(info->prefs, head_info->prefs, n_regs * sizeof(info->prefs[0]));
482 }
483
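/**
 * Build congruence classes over node indices: values connected by
 * should_be_same constraints or by non-interfering phi/phi-argument
 * relations are unioned via union-find, and the register preferences of
 * each class are combined so that the whole class favors the same
 * register.
 */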
484 static void combine_congruence_classes(void)
485 {
486         size_t n = get_irg_last_idx(irg);
487         congruence_classes = XMALLOCN(int, n);
488         uf_init(congruence_classes, n);
489
490         /* create congruence classes */
491         irg_block_walk_graph(irg, create_congruence_class, NULL, NULL);
492         /* merge preferences */
493         irg_walk_graph(irg, set_congruence_prefs, NULL, NULL);
494         free(congruence_classes);
495 }
496
497
498
499 /**
500  * Assign register reg to the given node.
501  *
502  * @param node  the node
503  * @param reg   the register
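 * @param width the number of consecutive registers the value occupies
 *              (> 1 for multi-register values)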
504  */
505 static void use_reg(ir_node *node, const arch_register_t *reg, unsigned width)
506 {
507         unsigned r = reg->index;
508         for (unsigned r0 = r; r0 < r + width; ++r0)
509                 assignments[r0] = node;
510         arch_set_irn_register(node, reg);
511 }
512
513 static void free_reg_of_value(ir_node *node)
514 {
515         if (!arch_irn_consider_in_reg_alloc(cls, node))
516                 return;
517
518         const arch_register_t     *reg = arch_get_irn_register(node);
519         const arch_register_req_t *req = arch_get_irn_register_req(node);
520         unsigned                   r   = reg->index;
521         /* assignments[r0] may be NULL if a value is used at 2 inputs,
522          * so it gets freed twice. */
523         for (unsigned r0 = r; r0 < r + req->width; ++r0) {
524                 assert(assignments[r0] == node || assignments[r0] == NULL);
525                 assignments[r0] = NULL;
526         }
527 }
528
529 /**
530  * Compare two register preferences in decreasing order.
531  */
532 static int compare_reg_pref(const void *e1, const void *e2)
533 {
534         const reg_pref_t *rp1 = (const reg_pref_t*) e1;
535         const reg_pref_t *rp2 = (const reg_pref_t*) e2;
536         if (rp1->pref < rp2->pref)
537                 return 1;
538         if (rp1->pref > rp2->pref)
539                 return -1;
540         return 0;
541 }
542
543 static void fill_sort_candidates(reg_pref_t *regprefs,
544                                  const allocation_info_t *info)
545 {
546         for (unsigned r = 0; r < n_regs; ++r) {
547                 float pref = info->prefs[r];
548                 regprefs[r].num  = r;
549                 regprefs[r].pref = pref;
550         }
551         /* TODO: use a stable sort here to avoid unnecessary register jumping */
552         qsort(regprefs, n_regs, sizeof(regprefs[0]), compare_reg_pref);
553 }
554
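/**
 * Try to free a register by splitting the live-range of @p to_split:
 * insert a copy before @p before and move the value to a better-suited
 * free register. The split is only performed if the estimated gain
 * (pref_delta plus the preference for the new register) exceeds a
 * threshold derived from the block's execution frequency; if a candidate
 * register is occupied, a recursive split of its current holder may be
 * attempted.
 */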
555 static bool try_optimistic_split(ir_node *to_split, ir_node *before,
556                                  float pref, float pref_delta,
557                                  unsigned *forbidden_regs, int recursion)
558 {
559         (void) pref;
560         unsigned           r = 0;
561         allocation_info_t *info = get_allocation_info(to_split);
562         float              delta = 0;
563
564         /* stupid hack: don't optimistically split dont_spill nodes...
565          * (so we don't split away the values produced because of
566          *  must_be_different constraints) */
567         ir_node *original_insn = skip_Proj(info->original_value);
568         if (arch_get_irn_flags(original_insn) & arch_irn_flags_dont_spill)
569                 return false;
570
571         const arch_register_t *from_reg        = arch_get_irn_register(to_split);
572         unsigned               from_r          = from_reg->index;
573         ir_node               *block           = get_nodes_block(before);
574         float split_threshold = (float)get_block_execfreq(block) * SPLIT_DELTA;
575
576         if (pref_delta < split_threshold*0.5)
577                 return false;
578
579         /* find the best free position where we could move to */
580         reg_pref_t *prefs = ALLOCAN(reg_pref_t, n_regs);
581         fill_sort_candidates(prefs, info);
582         unsigned i;
583         for (i = 0; i < n_regs; ++i) {
584                 /* we need a normal register which is not an output register
585                    and is different from the current register of to_split */
586                 r = prefs[i].num;
587                 if (!rbitset_is_set(normal_regs, r))
588                         continue;
589                 if (rbitset_is_set(forbidden_regs, r))
590                         continue;
591                 if (r == from_r)
592                         continue;
593
594                 /* is the split worth it? */
595                 delta = pref_delta + prefs[i].pref;
596                 if (delta < split_threshold) {
597                         DB((dbg, LEVEL_3, "Not doing optimistic split of %+F (depth %d), win %f too low\n",
598                                 to_split, recursion, delta));
599                         return false;
600                 }
601
602                 /* if the register is free then we can do the split */
603                 if (assignments[r] == NULL)
604                         break;
605
606                 /* otherwise we might try recursively calling optimistic_split */
607                 if (recursion+1 > MAX_OPTIMISTIC_SPLIT_RECURSION)
608                         continue;
609
610                 float apref        = prefs[i].pref;
611                 float apref_delta  = i+1 < n_regs ? apref - prefs[i+1].pref : 0;
612                 apref_delta += pref_delta - split_threshold;
613
614                 /* our source register isn't a useful destination for recursive
615                    splits */
616                 bool old_source_state = rbitset_is_set(forbidden_regs, from_r);
617                 rbitset_set(forbidden_regs, from_r);
618                 /* try recursive split */
619                 bool res = try_optimistic_split(assignments[r], before, apref,
620                                               apref_delta, forbidden_regs, recursion+1);
621                 /* restore our destination */
622                 if (old_source_state) {
623                         rbitset_set(forbidden_regs, from_r);
624                 } else {
625                         rbitset_clear(forbidden_regs, from_r);
626                 }
627
628                 if (res)
629                         break;
630         }
631         if (i >= n_regs)
632                 return false;
633
634         const arch_register_t *reg   = arch_register_for_index(cls, r);
635         ir_node               *copy  = be_new_Copy(block, to_split);
636         unsigned               width = 1;
637         mark_as_copy_of(copy, to_split);
638         /* hacky, but correct here */
639         if (assignments[from_reg->index] == to_split)
640                 free_reg_of_value(to_split);
641         use_reg(copy, reg, width);
642         sched_add_before(before, copy);
643
644         DB((dbg, LEVEL_3,
645             "Optimistic live-range split %+F move %+F(%s) -> %s before %+F (win %f, depth %d)\n",
646             copy, to_split, from_reg->name, reg->name, before, delta, recursion));
647         return true;
648 }
649
650 /**
651  * Determine and assign a register for node @p node
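 *
 * Honors preassignments, applies should_be_same bonuses, walks the
 * candidate registers sorted by preference and falls back to an
 * optimistic live-range split when all preferred registers are occupied.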
652  */
653 static void assign_reg(ir_node const *const block, ir_node *const node, arch_register_req_t const *const req, unsigned *const forbidden_regs)
654 {
655         assert(!is_Phi(node));
656         /* preassigned register? */
657         arch_register_t const *final_reg = arch_get_irn_register(node);
658         unsigned         const width     = req->width;
659         if (final_reg != NULL) {
660                 DB((dbg, LEVEL_2, "Preassignment %+F -> %s\n", node, final_reg->name));
661                 use_reg(node, final_reg, width);
662                 return;
663         }
664
665         /* ignore reqs must be preassigned */
666         assert(!arch_register_req_is(req, ignore));
667
668         /* give should_be_same bonuses */
669         allocation_info_t *info    = get_allocation_info(node);
670         ir_node           *in_node = skip_Proj(node);
671         if (arch_register_req_is(req, should_be_same)) {
672                 float weight = (float)get_block_execfreq(block);
673                 int   arity  = get_irn_arity(in_node);
674
675                 assert(arity <= (int) sizeof(req->other_same) * 8);
676                 for (int i = 0; i < arity; ++i) {
677                         if (!rbitset_is_set(&req->other_same, i))
678                                 continue;
679
680                         ir_node               *in        = get_irn_n(in_node, i);
681                         const arch_register_t *reg       = arch_get_irn_register(in);
682                         unsigned               reg_index = reg->index;
683
684                         /* if the value didn't die here then we should not propagate the
685                          * should_be_same info */
686                         if (assignments[reg_index] == in)
687                                 continue;
688
689                         info->prefs[reg_index] += weight * AFF_SHOULD_BE_SAME;
690                 }
691         }
692
693         /* create list of register candidates and sort by their preference */
694         DB((dbg, LEVEL_2, "Candidates for %+F:", node));
695         reg_pref_t *reg_prefs = ALLOCAN(reg_pref_t, n_regs);
696         fill_sort_candidates(reg_prefs, info);
697         for (unsigned r = 0; r < n_regs; ++r) {
698                 unsigned num = reg_prefs[r].num;
699                 if (!rbitset_is_set(normal_regs, num))
700                         continue;
701                 const arch_register_t *reg = arch_register_for_index(cls, num);
702                 DB((dbg, LEVEL_2, " %s(%f)", reg->name, reg_prefs[r].pref));
703         }
704         DB((dbg, LEVEL_2, "\n"));
705
706         const unsigned *allowed_regs = normal_regs;
707         if (arch_register_req_is(req, limited)) {
708                 allowed_regs = req->limited;
709         }
710
711         unsigned final_reg_index = 0;
712         unsigned r;
713         for (r = 0; r < n_regs; ++r) {
714                 final_reg_index = reg_prefs[r].num;
715                 if (!rbitset_is_set(allowed_regs, final_reg_index))
716                         continue;
717                 /* alignment constraint? */
718                 if (width > 1) {
719                         if (arch_register_req_is(req, aligned) && (final_reg_index % width) != 0)
720                                 continue;
721                         bool fine = true;
722                         for (unsigned r0 = r+1; r0 < r+width; ++r0) {
723                                 if (assignments[r0] != NULL)
724                                         fine = false;
725                         }
726                         /* TODO: attempt optimistic split here */
727                         if (!fine)
728                                 continue;
729                 }
730
731                 if (assignments[final_reg_index] == NULL)
732                         break;
733                 float    pref   = reg_prefs[r].pref;
734                 float    delta  = r+1 < n_regs ? pref - reg_prefs[r+1].pref : 0;
735                 ir_node *before = skip_Proj(node);
736                 bool     res
737                         = try_optimistic_split(assignments[final_reg_index], before, pref,
738                                                delta, forbidden_regs, 0);
739                 if (res)
740                         break;
741         }
742         if (r >= n_regs) {
743         /* the common reason to hit this panic is that one of your nodes is
744          * not register-pressure faithful */
745                 panic("No register left for %+F\n", node);
746         }
747
748         final_reg = arch_register_for_index(cls, final_reg_index);
749         DB((dbg, LEVEL_2, "Assign %+F -> %s\n", node, final_reg->name));
750         use_reg(node, final_reg, width);
751 }
752
753 /**
754  * Add a permutation in front of a node and change the assignments
755  * due to this permutation.
756  *
757  * To understand this imagine a permutation like this:
758  *
759  * 1 -> 2
760  * 2 -> 3
761  * 3 -> 1, 5
762  * 4 -> 6
763  * 5
764  * 6
765  * 7 -> 7
766  *
767  * First we count how many destinations a single value has. At the same time
768  * we can be sure that each destination register has at most 1 source register
769  * (it can have 0 which means we don't care what value is in it).
770  * We ignore all fulfilled permutations (like 7->7).
771  * In a first pass we create as many copy instructions as possible, as they
772  * are generally cheaper than exchanges. We do this by counting into how many
773  * destinations a register has to be copied (in the example it's 2 for
774  * register 3, and 1 for the registers 1, 2, 4 and 7).
775  * We can then create a copy into every destination register whose usecount
776  * is 0 (= no one else needs the value in that register).
777  *
778  * After this step we should only have cycles left. We implement a cyclic
779  * permutation of n registers with n-1 transpositions.
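 *
 * An illustrative trace of the example above: destinations 5 and 6 are not
 * needed as sources (usecount 0), so 3 -> 5 and 4 -> 6 are resolved with
 * plain copies first. The remaining cycle 1 -> 2 -> 3 -> 1 is then
 * implemented with 2 transpositions: first exchange registers 2 and 3,
 * then registers 1 and 2.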
780  *
781  * @param live_nodes   the set of live nodes, updated due to live range split
782  * @param before       the node before we add the permutation
783  * @param permutation  the permutation array indices are the destination
784  *                     registers, the values in the array are the source
785  *                     registers.
786  */
787 static void permute_values(ir_nodeset_t *live_nodes, ir_node *before,
788                            unsigned *permutation)
789 {
790         unsigned *n_used = ALLOCANZ(unsigned, n_regs);
791
792         /* determine how often each source register needs to be read */
793         for (unsigned r = 0; r < n_regs; ++r) {
794                 unsigned  old_reg = permutation[r];
795                 ir_node  *value;
796
797                 value = assignments[old_reg];
798                 if (value == NULL) {
799                         /* nothing to do here, reg is not live. Mark it as fixpoint
800                          * so we ignore it in the next steps */
801                         permutation[r] = r;
802                         continue;
803                 }
804
805                 ++n_used[old_reg];
806         }
807
808         ir_node *block = get_nodes_block(before);
809
810         /* step1: create copies where immediately possible */
811         for (unsigned r = 0; r < n_regs; /* empty */) {
812                 unsigned old_r = permutation[r];
813
814                 /* - no need to do anything for fixed points.
815                    - we can't copy if the value in the dest reg is still needed */
816                 if (old_r == r || n_used[r] > 0) {
817                         ++r;
818                         continue;
819                 }
820
821                 /* create a copy */
822                 ir_node *src  = assignments[old_r];
823                 ir_node *copy = be_new_Copy(block, src);
824                 sched_add_before(before, copy);
825                 const arch_register_t *reg = arch_register_for_index(cls, r);
826                 DB((dbg, LEVEL_2, "Copy %+F (from %+F, before %+F) -> %s\n",
827                     copy, src, before, reg->name));
828                 mark_as_copy_of(copy, src);
829                 unsigned width = 1; /* TODO */
830                 use_reg(copy, reg, width);
831
832                 if (live_nodes != NULL) {
833                         ir_nodeset_insert(live_nodes, copy);
834                 }
835
836                 /* old register has 1 user less, permutation is resolved */
837                 assert(arch_get_irn_register(src)->index == old_r);
838                 permutation[r] = r;
839
840                 assert(n_used[old_r] > 0);
841                 --n_used[old_r];
842                 if (n_used[old_r] == 0) {
843                         if (live_nodes != NULL) {
844                                 ir_nodeset_remove(live_nodes, src);
845                         }
846                         free_reg_of_value(src);
847                 }
848
849                 /* advance or jump back (if this copy enabled another copy) */
850                 if (old_r < r && n_used[old_r] == 0) {
851                         r = old_r;
852                 } else {
853                         ++r;
854                 }
855         }
856
857         /* at this point we only have "cycles" left which we have to resolve with
858          * perm instructions
859          * TODO: if we have free registers left, then we should really use copy
860          * instructions for any cycle longer than 2 registers...
861          * (this is probably architecture dependent, there might be archs where
862          *  copies are preferable even for 2-cycles) */
863
864         /* create perms with the rest */
865         for (unsigned r = 0; r < n_regs; /* empty */) {
866                 unsigned old_r = permutation[r];
867
868                 if (old_r == r) {
869                         ++r;
870                         continue;
871                 }
872
873                 /* we shouldn't have copies from one value to multiple destinations left */
874                 assert(n_used[old_r] == 1);
875
876                 /* exchange old_r and r2; after that old_r is a fixed point */
877                 unsigned r2 = permutation[old_r];
878
879                 ir_node *in[2] = { assignments[r2], assignments[old_r] };
880                 ir_node *perm = be_new_Perm(cls, block, 2, in);
881                 sched_add_before(before, perm);
882                 DB((dbg, LEVEL_2, "Perm %+F (perm %+F,%+F, before %+F)\n",
883                     perm, in[0], in[1], before));
884
885                 unsigned width = 1; /* TODO */
886
887                 ir_node *proj0 = new_r_Proj(perm, get_irn_mode(in[0]), 0);
888                 mark_as_copy_of(proj0, in[0]);
889                 const arch_register_t *reg0 = arch_register_for_index(cls, old_r);
890                 use_reg(proj0, reg0, width);
891
892                 ir_node *proj1 = new_r_Proj(perm, get_irn_mode(in[1]), 1);
893                 mark_as_copy_of(proj1, in[1]);
894                 const arch_register_t *reg1 = arch_register_for_index(cls, r2);
895                 use_reg(proj1, reg1, width);
896
897                 /* 1 value is now in the correct register */
898                 permutation[old_r] = old_r;
899                 /* the source of r changed to r2 */
900                 permutation[r] = r2;
901
902                 /* if we have reached a fixpoint update data structures */
903                 if (live_nodes != NULL) {
904                         ir_nodeset_remove(live_nodes, in[0]);
905                         ir_nodeset_remove(live_nodes, in[1]);
906                         ir_nodeset_remove(live_nodes, proj0);
907                         ir_nodeset_insert(live_nodes, proj1);
908                 }
909         }
910
911 #ifndef NDEBUG
912         /* now we should only have fixpoints left */
913         for (unsigned r = 0; r < n_regs; ++r) {
914                 assert(permutation[r] == r);
915         }
916 #endif
917 }
918
919 /**
920  * Free regs for values last used.
921  *
922  * @param live_nodes   set of live nodes, will be updated
923  * @param node         the node to consider
924  */
925 static void free_last_uses(ir_nodeset_t *live_nodes, ir_node *node)
926 {
927         allocation_info_t *info      = get_allocation_info(node);
928         const unsigned    *last_uses = info->last_uses;
929         int                arity     = get_irn_arity(node);
930
931         for (int i = 0; i < arity; ++i) {
932                 /* check if one operand is the last use */
933                 if (!rbitset_is_set(last_uses, i))
934                         continue;
935
936                 ir_node *op = get_irn_n(node, i);
937                 free_reg_of_value(op);
938                 ir_nodeset_remove(live_nodes, op);
939         }
940 }
941
942 /**
943  * Change the inputs of a node to the current values (copies/perms)
944  */
945 static void rewire_inputs(ir_node *node)
946 {
947         int arity = get_irn_arity(node);
948         for (int i = 0; i < arity; ++i) {
949                 ir_node           *op = get_irn_n(node, i);
950                 allocation_info_t *info = try_get_allocation_info(op);
951
952                 if (info == NULL)
953                         continue;
954
955                 info = get_allocation_info(info->original_value);
956                 if (info->current_value != op) {
957                         set_irn_n(node, i, info->current_value);
958                 }
959         }
960 }
961
962 /**
963  * Create a bitset of registers occupied by values living through an
964  * instruction.
965  */
966 static void determine_live_through_regs(unsigned *bitset, ir_node *node)
967 {
968         const allocation_info_t *info = get_allocation_info(node);
969
970         /* mark all used registers as potentially live-through */
971         for (unsigned r = 0; r < n_regs; ++r) {
972                 if (assignments[r] == NULL)
973                         continue;
974                 if (!rbitset_is_set(normal_regs, r))
975                         continue;
976
977                 rbitset_set(bitset, r);
978         }
979
980         /* remove registers of values dying at the instruction */
981         int arity = get_irn_arity(node);
982         for (int i = 0; i < arity; ++i) {
983                 if (!rbitset_is_set(info->last_uses, i))
984                         continue;
985
986                 ir_node               *op  = get_irn_n(node, i);
987                 const arch_register_t *reg = arch_get_irn_register(op);
988                 rbitset_clear(bitset, reg->index);
989         }
990 }
991
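/**
 * Fallback for constraints involving multi-register (double-width)
 * values: formulate the register permutation as an ILP. There is one
 * binary variable per admissible move of the value in register l to
 * register r; each occupied source register must pick exactly one
 * destination and each destination may be picked at most once. The
 * solution is decoded into a permutation array and applied with
 * permute_values().
 */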
992 static void solve_lpp(ir_nodeset_t *live_nodes, ir_node *node,
993                       unsigned *forbidden_regs, unsigned *live_through_regs)
994 {
995         unsigned *forbidden_edges = rbitset_malloc(n_regs * n_regs);
996         int      *lpp_vars        = XMALLOCNZ(int, n_regs*n_regs);
997
998         lpp_t *lpp = lpp_new("prefalloc", lpp_minimize);
999         //lpp_set_time_limit(lpp, 20);
1000         lpp_set_log(lpp, stdout);
1001
1002         /* mark some edges as forbidden */
1003         be_foreach_use(node, cls, req, op, op_req,
1004                 if (!arch_register_req_is(req, limited))
1005                         continue;
1006
1007                 const unsigned        *limited     = req->limited;
1008                 const arch_register_t *reg         = arch_get_irn_register(op);
1009                 unsigned               current_reg = reg->index;
1010                 for (unsigned r = 0; r < n_regs; ++r) {
1011                         if (rbitset_is_set(limited, r))
1012                                 continue;
1013
1014                         rbitset_set(forbidden_edges, current_reg*n_regs + r);
1015                 }
1016         );
1017
1018         /* add all combinations, except for not allowed ones */
1019         for (unsigned l = 0; l < n_regs; ++l) {
1020                 if (!rbitset_is_set(normal_regs, l)) {
1021                         char name[15];
1022                         snprintf(name, sizeof(name), "%u_to_%u", l, l);
1023                         lpp_vars[l*n_regs+l] = lpp_add_var(lpp, name, lpp_binary, 1);
1024                         continue;
1025                 }
1026
1027                 for (unsigned r = 0; r < n_regs; ++r) {
1028                         if (!rbitset_is_set(normal_regs, r))
1029                                 continue;
1030                         if (rbitset_is_set(forbidden_edges, l*n_regs + r))
1031                                 continue;
1032                         /* livethrough values may not use constrained output registers */
1033                         if (rbitset_is_set(live_through_regs, l)
1034                             && rbitset_is_set(forbidden_regs, r))
1035                                 continue;
1036
1037                         char name[15];
1038                         snprintf(name, sizeof(name), "%u_to_%u", l, r);
1039
1040                         double costs = l==r ? 9 : 8;
1041                         lpp_vars[l*n_regs+r]
1042                                 = lpp_add_var(lpp, name, lpp_binary, costs);
1043                         assert(lpp_vars[l*n_regs+r] > 0);
1044                 }
1045         }
1046         /* add constraints */
1047         for (unsigned l = 0; l < n_regs; ++l) {
1048                 /* only 1 destination per register */
1049                 int constraint = -1;
1050                 for (unsigned r = 0; r < n_regs; ++r) {
1051                         int var = lpp_vars[l*n_regs+r];
1052                         if (var == 0)
1053                                 continue;
1054                         if (constraint < 0) {
1055                                 char name[64];
1056                                 snprintf(name, sizeof(name), "%u_to_dest", l);
1057                                 constraint = lpp_add_cst(lpp, name, lpp_equal, 1);
1058                         }
1059                         lpp_set_factor_fast(lpp, constraint, var, 1);
1060                 }
1061                 /* each destination used by at most 1 value */
1062                 constraint = -1;
1063                 for (unsigned r = 0; r < n_regs; ++r) {
1064                         int var = lpp_vars[r*n_regs+l];
1065                         if (var == 0)
1066                                 continue;
1067                         if (constraint < 0) {
1068                                 char name[64];
1069                                 snprintf(name, sizeof(name), "one_to_%u", l);
1070                                 constraint = lpp_add_cst(lpp, name, lpp_less_equal, 1);
1071                         }
1072                         lpp_set_factor_fast(lpp, constraint, var, 1);
1073                 }
1074         }
1075
1076         lpp_dump_plain(lpp, fopen("lppdump.txt", "w"));
1077
1078         /* solve lpp */
1079         lpp_solve(lpp, be_options.ilp_server, be_options.ilp_solver);
1080         if (!lpp_is_sol_valid(lpp))
1081                 panic("ilp solution not valid!");
1082
1083         unsigned *assignment = ALLOCAN(unsigned, n_regs);
1084         for (unsigned l = 0; l < n_regs; ++l) {
1085                 unsigned dest_reg = (unsigned)-1;
1086                 for (unsigned r = 0; r < n_regs; ++r) {
1087                         int var = lpp_vars[l*n_regs+r];
1088                         if (var == 0)
1089                                 continue;
1090                         double val = lpp_get_var_sol(lpp, var);
1091                         if (val == 1) {
1092                                 assert(dest_reg == (unsigned)-1);
1093                                 dest_reg = r;
1094                         }
1095                 }
1096                 assert(dest_reg != (unsigned)-1);
1097                 assignment[dest_reg] = l;
1098         }
1099
1100         fprintf(stderr, "Assignment: ");
1101         for (unsigned l = 0; l < n_regs; ++l) {
1102                 fprintf(stderr, "%u ", assignment[l]);
1103         }
1104         fprintf(stderr, "\n");
1105         fflush(stdout);
1106         permute_values(live_nodes, node, assignment);
1107         lpp_free(lpp);
1108 }
1109
1110 static bool is_aligned(unsigned num, unsigned alignment)
1111 {
1112         unsigned mask = alignment-1;
1113         assert(is_po2(alignment));
1114         return (num&mask) == 0;
1115 }
1116
1117 /**
1118  * Enforce constraints at a node by live range splits.
1119  *
1120  * @param  live_nodes  the set of live nodes, might be changed
1121  * @param  node        the current node
1122  */
1123 static void enforce_constraints(ir_nodeset_t *live_nodes, ir_node *node,
1124                                 unsigned *forbidden_regs)
1125 {
1126         /* see if any use constraints are not met and whether double-width
1127          * values are involved */
1128         bool double_width = false;
1129         bool good = true;
1130         be_foreach_use(node, cls, req, op, op_req,
1131                 /* are there any limitations for the i'th operand? */
1132                 if (req->width > 1)
1133                         double_width = true;
1134                 const arch_register_t *reg       = arch_get_irn_register(op);
1135                 unsigned               reg_index = reg->index;
1136                 if (arch_register_req_is(req, aligned)) {
1137                         if (!is_aligned(reg_index, req->width)) {
1138                                 good = false;
1139                                 continue;
1140                         }
1141                 }
1142                 if (!arch_register_req_is(req, limited))
1143                         continue;
1144
1145                 const unsigned *limited = req->limited;
1146                 if (!rbitset_is_set(limited, reg_index)) {
1147                         /* found an assignment outside the limited set */
1148                         good = false;
1149                         continue;
1150                 }
1151         );
1152
1153         /* is any of the live-throughs using a constrained output register? */
1154         unsigned *live_through_regs = NULL;
1155         be_foreach_definition(node, cls, value, req,
1156                 (void)value;
1157                 if (req->width > 1)
1158                         double_width = true;
1159                 if (!arch_register_req_is(req, limited))
1160                         continue;
1161                 if (live_through_regs == NULL) {
1162                         live_through_regs = rbitset_alloca(n_regs);
1163                         determine_live_through_regs(live_through_regs, node);
1164                 }
1165                 rbitset_or(forbidden_regs, req->limited, n_regs);
1166                 if (rbitsets_have_common(req->limited, live_through_regs, n_regs))
1167                         good = false;
1168         );
1169
1170         if (good)
1171                 return;
1172
1173         /* create this array if we haven't yet */
1174         if (live_through_regs == NULL) {
1175                 live_through_regs = rbitset_alloca(n_regs);
1176         }
1177
1178         if (double_width) {
1179                 /* only the ILP variant can solve this yet */
1180                 solve_lpp(live_nodes, node, forbidden_regs, live_through_regs);
1181                 return;
1182         }
1183
1184         /* at this point we have to construct a bipartite matching problem to see
1185          * which values should go to which registers
1186          * Note: We're building the matrix in "reverse" - source registers are
1187          *       right, destinations left because this will produce the solution
1188          *       in the format required for permute_values.
1189          */
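        /* A small illustration (hypothetical 3-register class): if the value
         * currently in r0 is limited to {r2}, the edges towards destinations
         * r0 and r1 are removed below, leaving r2 as the only destination
         * the matching can pick for that value. */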
1190         hungarian_problem_t *bp
1191                 = hungarian_new(n_regs, n_regs, HUNGARIAN_MATCH_PERFECT);
1192
1193         /* add all combinations, then remove not allowed ones */
1194         for (unsigned l = 0; l < n_regs; ++l) {
1195                 if (!rbitset_is_set(normal_regs, l)) {
1196                         hungarian_add(bp, l, l, 1);
1197                         continue;
1198                 }
1199
1200                 for (unsigned r = 0; r < n_regs; ++r) {
1201                         if (!rbitset_is_set(normal_regs, r))
1202                                 continue;
1203                         /* livethrough values may not use constrained output registers */
1204                         if (rbitset_is_set(live_through_regs, l)
1205                                         && rbitset_is_set(forbidden_regs, r))
1206                                 continue;
1207
1208                         hungarian_add(bp, r, l, l == r ? 9 : 8);
1209                 }
1210         }
1211
1212         be_foreach_use(node, cls, req, op, op_req,
1213                 if (!arch_register_req_is(req, limited))
1214                         continue;
1215
1216                 const unsigned        *limited     = req->limited;
1217                 const arch_register_t *reg         = arch_get_irn_register(op);
1218                 unsigned               current_reg = reg->index;
1219                 for (unsigned r = 0; r < n_regs; ++r) {
1220                         if (rbitset_is_set(limited, r))
1221                                 continue;
1222                         hungarian_remove(bp, r, current_reg);
1223                 }
1224         );
1225
1226         //hungarian_print_cost_matrix(bp, 1);
1227         hungarian_prepare_cost_matrix(bp, HUNGARIAN_MODE_MAXIMIZE_UTIL);
1228
1229         unsigned *assignment = ALLOCAN(unsigned, n_regs);
1230         int res = hungarian_solve(bp, assignment, NULL, 0);
1231         assert(res == 0);
1232
1233         hungarian_free(bp);
1234
1235         permute_values(live_nodes, node, assignment);
1236 }
1237
1238 /** test whether @p test_value is a copy of the value @p value */
1239 static bool is_copy_of(ir_node *value, ir_node *test_value)
1240 {
1241         if (value == test_value)
1242                 return true;
1243
1244         allocation_info_t *info      = get_allocation_info(value);
1245         allocation_info_t *test_info = get_allocation_info(test_value);
1246         return test_info->original_value == info->original_value;
1247 }
1248
1249 /**
1250  * find a value in the end-assignment of a basic block
1251  * @returns the index into the assignment array if found
1252  *          -1 if not found
1253  */
1254 static int find_value_in_block_info(block_info_t *info, ir_node *value)
1255 {
1256         ir_node  **end_assignments = info->assignments;
1257         for (unsigned r = 0; r < n_regs; ++r) {
1258                 ir_node *a_value = end_assignments[r];
1259
1260                 if (a_value == NULL)
1261                         continue;
1262                 if (is_copy_of(a_value, value))
1263                         return (int) r;
1264         }
1265
1266         return -1;
1267 }
1268
1269 /**
1270  * Create the necessary permutations at the end of a basic block to fulfill
1271  * the register assignment for phi-nodes in the next block
1272  */
1273 static void add_phi_permutations(ir_node *block, int p)
1274 {
1275         ir_node      *pred      = get_Block_cfgpred_block(block, p);
1276         block_info_t *pred_info = get_block_info(pred);
1277
1278         /* predecessor not processed yet? nothing to do */
1279         if (!pred_info->processed)
1280                 return;
1281
1282         unsigned *permutation = ALLOCAN(unsigned, n_regs);
1283         for (unsigned r = 0; r < n_regs; ++r) {
1284                 permutation[r] = r;
1285         }
1286
1287         /* check phi nodes */
1288         bool     need_permutation = false;
1289         ir_node *phi              = sched_first(block);
1290         for ( ; is_Phi(phi); phi = sched_next(phi)) {
1291                 if (!arch_irn_consider_in_reg_alloc(cls, phi))
1292                         continue;
1293
1294                 ir_node *phi_pred = get_Phi_pred(phi, p);
1295                 int      a        = find_value_in_block_info(pred_info, phi_pred);
1296                 assert(a >= 0);
1297
1298                 const arch_register_t *reg  = arch_get_irn_register(phi);
1299                 int                    regn = reg->index;
1300                 /* same register? nothing to do */
1301                 if (regn == a)
1302                         continue;
1303
1304                 ir_node               *op     = pred_info->assignments[a];
1305                 const arch_register_t *op_reg = arch_get_irn_register(op);
1306                 /* Virtual registers are ok, too. */
1307                 if (op_reg->type & arch_register_type_virtual)
1308                         continue;
1309
1310                 permutation[regn] = a;
1311                 need_permutation  = true;
1312         }
1313
1314         if (need_permutation) {
1315                 /* permute values at end of predecessor */
1316                 ir_node **old_assignments = assignments;
1317                 assignments     = pred_info->assignments;
1318                 permute_values(NULL, be_get_end_of_block_insertion_point(pred),
1319                                permutation);
1320                 assignments = old_assignments;
1321         }
1322
1323         /* change phi nodes to use the copied values */
1324         phi = sched_first(block);
1325         for ( ; is_Phi(phi); phi = sched_next(phi)) {
1326                 if (!arch_irn_consider_in_reg_alloc(cls, phi))
1327                         continue;
1328
1329                 /* we have permuted all values into the correct registers so we can
1330                    simply query which value occupies the phi's register in the
1331                    predecessor */
1332                 int      a  = arch_get_irn_register(phi)->index;
1333                 ir_node *op = pred_info->assignments[a];
1334                 set_Phi_pred(phi, p, op);
1335         }
1336 }
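
/* Illustrative sketch (not part of the algorithm, values assumed): with
 * three registers r0-r2, suppose a phi was assigned r0 while the
 * predecessor block ends with the phi's input value sitting in r2.  The
 * loop above then records
 *
 *     permutation[0] = 2;   // r0 must receive the value currently in r2
 *     permutation[1] = 1;   // identity, left untouched
 *     permutation[2] = 2;   // identity, left untouched
 *
 * and permute_values() inserts the necessary copies (or swaps) at the end
 * of the predecessor so the value arrives in r0 as the phi expects. */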

/**
 * Set preferences for a phi's register based on the registers used on the
 * phi inputs.
 */
static void adapt_phi_prefs(ir_node *phi)
{
        ir_node           *block = get_nodes_block(phi);
        allocation_info_t *info  = get_allocation_info(phi);

        int arity = get_irn_arity(phi);
        for (int i = 0; i < arity; ++i) {
                ir_node               *op  = get_irn_n(phi, i);
                const arch_register_t *reg = arch_get_irn_register(op);

                if (reg == NULL)
                        continue;
                /* we only give the bonus if the predecessor already has registers
                 * assigned, otherwise we only see a dummy value
                 * and any conclusions about its register are useless */
                ir_node      *pred_block      = get_Block_cfgpred_block(block, i);
                block_info_t *pred_block_info = get_block_info(pred_block);
                if (!pred_block_info->processed)
                        continue;

                /* give bonus for already assigned register */
                float weight = (float)get_block_execfreq(pred_block);
                info->prefs[reg->index] += weight * AFF_PHI;
        }
}
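
/* Worked example (frequencies assumed for illustration): a phi with two
 * inputs whose predecessor blocks have execution frequencies 10.0 and 1.0,
 * where the first input already sits in r3 and the second in r5.  The loop
 * above yields
 *
 *     prefs[3] += 10.0 * AFF_PHI;   // == 10.0
 *     prefs[5] +=  1.0 * AFF_PHI;   // ==  1.0
 *
 * biasing the bipartite matching below towards the register used in the
 * more frequently executed predecessor. */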

/**
 * After a phi has been assigned a register, propagate that register as a
 * preference to the phi inputs.
 */
static void propagate_phi_register(ir_node *phi, unsigned assigned_r)
{
        ir_node *block = get_nodes_block(phi);

        int arity = get_irn_arity(phi);
        for (int i = 0; i < arity; ++i) {
                ir_node           *op         = get_Phi_pred(phi, i);
                allocation_info_t *info       = get_allocation_info(op);
                ir_node           *pred_block = get_Block_cfgpred_block(block, i);
                float              weight
                        = (float)get_block_execfreq(pred_block) * AFF_PHI;

                if (info->prefs[assigned_r] >= weight)
                        continue;

                /* promote the preferred register by demoting all others */
                for (unsigned r = 0; r < n_regs; ++r) {
                        if (info->prefs[r] > -weight) {
                                info->prefs[r] = -weight;
                        }
                }
                info->prefs[assigned_r] = weight;

                if (is_Phi(op))
                        propagate_phi_register(op, assigned_r);
        }
}
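
/* Sketch of the clamping above (numbers assumed): suppose the phi was
 * assigned r1 and the edge weight is 4.0.  Inputs with prefs[1] >= 4.0 are
 * left alone; for all others every preference above -4.0 is lowered to
 * -4.0 and prefs[1] is set to 4.0, so r1 now leads every other register by
 * at least 8.0.  The recursion repeats this for inputs that are phis
 * themselves. */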

static void assign_phi_registers(ir_node *block)
{
        /* count phi nodes */
        int n_phis = 0;
        sched_foreach(block, node) {
                if (!is_Phi(node))
                        break;
                if (!arch_irn_consider_in_reg_alloc(cls, node))
                        continue;
                ++n_phis;
        }

        if (n_phis == 0)
                return;

        /* build a bipartite matching problem for all phi nodes */
        hungarian_problem_t *bp
                = hungarian_new(n_phis, n_regs, HUNGARIAN_MATCH_PERFECT);
        int n = 0;
        sched_foreach(block, node) {
                if (!is_Phi(node))
                        break;
                if (!arch_irn_consider_in_reg_alloc(cls, node))
                        continue;

                /* give bonuses for predecessor colorings */
                adapt_phi_prefs(node);
                /* add the phi's preferences to the bipartite problem */
                allocation_info_t *info = get_allocation_info(node);
                DB((dbg, LEVEL_3, "Prefs for %+F: ", node));
                for (unsigned r = 0; r < n_regs; ++r) {
                        if (!rbitset_is_set(normal_regs, r))
                                continue;

                        float costs = info->prefs[r];
                        costs = costs < 0 ? -logf(-costs+1) : logf(costs+1);
                        costs *= 100;
                        costs += 10000;
                        hungarian_add(bp, n, r, (int)costs);
                        DB((dbg, LEVEL_3, " %s(%f)", arch_register_for_index(cls, r)->name,
                                                info->prefs[r]));
                }
                DB((dbg, LEVEL_3, "\n"));
                ++n;
        }

        //hungarian_print_cost_matrix(bp, 7);
        hungarian_prepare_cost_matrix(bp, HUNGARIAN_MODE_MAXIMIZE_UTIL);

        unsigned *assignment = ALLOCAN(unsigned, n_regs);
        int       res        = hungarian_solve(bp, assignment, NULL, 0);
        assert(res == 0);

        /* apply results */
        n = 0;
        sched_foreach(block, node) {
                if (!is_Phi(node))
                        break;
                if (!arch_irn_consider_in_reg_alloc(cls, node))
                        continue;
                const arch_register_req_t *req
                        = arch_get_irn_register_req(node);

                unsigned r = assignment[n++];
                assert(rbitset_is_set(normal_regs, r));
                const arch_register_t *reg = arch_register_for_index(cls, r);
                DB((dbg, LEVEL_2, "Assign %+F -> %s\n", node, reg->name));
                use_reg(node, reg, req->width);

                /* adapt preferences for phi inputs */
                propagate_phi_register(node, r);
        }
}
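
/* Example of the cost transformation above (illustrative values): a raw
 * preference p is compressed with a sign-preserving logarithm, then scaled
 * and shifted so the matrix entry stays positive:
 *
 *     p =  0.0  ->  logf( 1.0) * 100 + 10000 == 10000
 *     p =  9.0  ->  logf(10.0) * 100 + 10000 ~= 10230
 *     p = -9.0  -> -logf(10.0) * 100 + 10000 ~=  9770
 *
 * The compression keeps a single extreme preference from dominating the
 * matching; the +10000 offset keeps the integer costs positive for the
 * hungarian solver. */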

static arch_register_req_t *allocate_reg_req(ir_graph *irg)
{
        struct obstack *obst = be_get_be_obst(irg);
        arch_register_req_t *req = OALLOCZ(obst, arch_register_req_t);
        return req;
}

/**
 * Walker: assign registers to all nodes of a block that
 * need registers from the currently considered register class.
 */
static void allocate_coalesce_block(ir_node *block, void *data)
{
        (void) data;
        DB((dbg, LEVEL_2, "* Block %+F\n", block));

        /* clear assignments */
        block_info_t *block_info  = get_block_info(block);
        assignments = block_info->assignments;

        ir_nodeset_t live_nodes;
        ir_nodeset_init(&live_nodes);

        /* gather regalloc infos of predecessor blocks */
        int            n_preds          = get_Block_n_cfgpreds(block);
        block_info_t **pred_block_infos = ALLOCAN(block_info_t*, n_preds);
        for (int i = 0; i < n_preds; ++i) {
                ir_node      *pred      = get_Block_cfgpred_block(block, i);
                block_info_t *pred_info = get_block_info(pred);
                pred_block_infos[i]     = pred_info;
        }

        ir_node **phi_ins = ALLOCAN(ir_node*, n_preds);

        /* collect live-in nodes and preassigned values */
        be_lv_foreach(lv, block, be_lv_state_in, node) {
                const arch_register_req_t *req = arch_get_irn_register_req(node);
                if (req->cls != cls)
                        continue;

                if (arch_register_req_is(req, ignore)) {
                        allocation_info_t *info = get_allocation_info(node);
                        info->current_value = node;

                        const arch_register_t *reg = arch_get_irn_register(node);
                        assert(reg != NULL); /* ignore values must be preassigned */
                        use_reg(node, reg, req->width);
                        continue;
                }

                /* check all predecessors for this value; if it is not the same
                   everywhere (or not yet known) then we have to construct a phi
                   (we collect the potential phi inputs here) */
                bool need_phi = false;
                for (int p = 0; p < n_preds; ++p) {
                        block_info_t *pred_info = pred_block_infos[p];

                        if (!pred_info->processed) {
                                /* use node for now, it will get fixed later */
                                phi_ins[p] = node;
                                need_phi   = true;
                        } else {
                                int a = find_value_in_block_info(pred_info, node);

                                /* must live out of predecessor */
                                assert(a >= 0);
                                phi_ins[p] = pred_info->assignments[a];
                                /* different value from last time? then we need a phi */
                                if (p > 0 && phi_ins[p-1] != phi_ins[p]) {
                                        need_phi = true;
                                }
                        }
                }

                if (need_phi) {
                        ir_mode *mode = get_irn_mode(node);
                        const arch_register_req_t *phi_req = cls->class_req;
                        if (req->width > 1) {
                                arch_register_req_t *new_req = allocate_reg_req(irg);
                                new_req->cls   = cls;
                                new_req->type  = req->type & arch_register_req_type_aligned;
                                new_req->width = req->width;
                                phi_req = new_req;
                        }
                        ir_node *phi  = be_new_Phi(block, n_preds, phi_ins, mode,
                                                   phi_req);

                        DB((dbg, LEVEL_3, "Create Phi %+F (for %+F) -", phi, node));
#ifdef DEBUG_libfirm
                        for (int pi = 0; pi < n_preds; ++pi) {
                                DB((dbg, LEVEL_3, " %+F", phi_ins[pi]));
                        }
                        DB((dbg, LEVEL_3, "\n"));
#endif
                        mark_as_copy_of(phi, node);
                        sched_add_after(block, phi);

                        node = phi;
                } else {
                        allocation_info_t *info = get_allocation_info(node);
                        info->current_value = phi_ins[0];

                        /* Grab one of the inputs we constructed (might not be the same
                         * as "node" as we could see the same copy of the value in all
                         * predecessors) */
                        node = phi_ins[0];
                }

                /* if the node already has a register assigned use it */
                const arch_register_t *reg = arch_get_irn_register(node);
                if (reg != NULL) {
                        use_reg(node, reg, req->width);
                }

                /* remember that this node is live at the beginning of the block */
                ir_nodeset_insert(&live_nodes, node);
        }

        /** Collects registers which must not be used for optimistic splits. */
        unsigned *const forbidden_regs = rbitset_alloca(n_regs);

        /* handle phis... */
        assign_phi_registers(block);

        /* all live-ins must have a register */
#ifndef NDEBUG
        foreach_ir_nodeset(&live_nodes, node, iter) {
                const arch_register_t *reg = arch_get_irn_register(node);
                assert(reg != NULL);
        }
#endif

        /* assign instructions in the block */
        sched_foreach(block, node) {
                /* phis are already assigned */
                if (is_Phi(node))
                        continue;

                rewire_inputs(node);

                /* enforce use constraints */
                rbitset_clear_all(forbidden_regs, n_regs);
                enforce_constraints(&live_nodes, node, forbidden_regs);

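                /* rewire the inputs a second time: enforce_constraints() may
                 * have split live-ranges and inserted copies, changing the
                 * current value of some operands */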
                rewire_inputs(node);

                /* we may not use registers used for inputs for optimistic splits */
                be_foreach_use(node, cls, in_req, op, op_req,
                        const arch_register_t *reg = arch_get_irn_register(op);
                        rbitset_set(forbidden_regs, reg->index);
                );

                /* free registers of values last used at this instruction */
                free_last_uses(&live_nodes, node);

                /* assign output registers */
                be_foreach_definition_(node, cls, value, req,
                        assign_reg(block, value, req, forbidden_regs);
                );
        }

        ir_nodeset_destroy(&live_nodes);
        assignments = NULL;

        block_info->processed = true;

        /* permute values at end of predecessor blocks in case of phi-nodes */
        if (n_preds > 1) {
                for (int p = 0; p < n_preds; ++p) {
                        add_phi_permutations(block, p);
                }
        }

        /* if we have exactly 1 successor then we might be able to produce phi
           copies now */
        if (get_irn_n_edges_kind(block, EDGE_KIND_BLOCK) == 1) {
                const ir_edge_t *edge
                        = get_irn_out_edge_first_kind(block, EDGE_KIND_BLOCK);
                ir_node      *succ      = get_edge_src_irn(edge);
                int           p         = get_edge_src_pos(edge);
                block_info_t *succ_info = get_block_info(succ);

                if (succ_info->processed) {
                        add_phi_permutations(succ, p);
                }
        }
}
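
/* Illustrative example for the live-in handling above (CFG assumed): a
 * value v is live into a block with two processed predecessors.  If
 * predecessor 0 still provides v itself while predecessor 1 ends with a
 * copy v' of v, phi_ins[] becomes { v, v' }; the entries differ, so a phi
 * with these inputs is created and marked as a copy of v.  Had both
 * predecessors provided the same node, that node would simply be reused
 * without constructing a phi. */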

typedef struct block_costs_t block_costs_t;
struct block_costs_t {
        float costs;   /**< costs of the block */
        int   dfs_num; /**< depth first search number (to detect backedges) */
};

static int cmp_block_costs(const void *d1, const void *d2)
{
        const ir_node       * const *block1 = (const ir_node**)d1;
        const ir_node       * const *block2 = (const ir_node**)d2;
        const block_costs_t *info1  = (const block_costs_t*)get_irn_link(*block1);
        const block_costs_t *info2  = (const block_costs_t*)get_irn_link(*block2);
        return QSORT_CMP(info2->costs, info1->costs);
}

static void determine_block_order(void)
{
        ir_node **blocklist = be_get_cfgpostorder(irg);
        size_t    n_blocks  = ARR_LEN(blocklist);
        int       dfs_num   = 0;
        pdeq     *worklist  = new_pdeq();
        ir_node **order     = XMALLOCN(ir_node*, n_blocks);
        size_t    order_p   = 0;

        /* clear block links... */
        for (size_t p = 0; p < n_blocks; ++p) {
                ir_node *block = blocklist[p];
                set_irn_link(block, NULL);
        }

        /* walk blocks in reverse postorder, the costs for each block are the
         * sum of the costs of its predecessors (excluding the costs on backedges
         * which we can't determine) */
        for (size_t p = n_blocks; p > 0;) {
                block_costs_t *cost_info;
                ir_node *block = blocklist[--p];

                float execfreq   = (float)get_block_execfreq(block);
                float costs      = execfreq;
                int   n_cfgpreds = get_Block_n_cfgpreds(block);
                for (int p2 = 0; p2 < n_cfgpreds; ++p2) {
                        ir_node       *pred_block = get_Block_cfgpred_block(block, p2);
                        block_costs_t *pred_costs = (block_costs_t*)get_irn_link(pred_block);
                        /* we don't have any info for backedges */
                        if (pred_costs == NULL)
                                continue;
                        costs += pred_costs->costs;
                }

                cost_info          = OALLOCZ(&obst, block_costs_t);
                cost_info->costs   = costs;
                cost_info->dfs_num = dfs_num++;
                set_irn_link(block, cost_info);
        }

        /* sort array by block costs */
        qsort(blocklist, n_blocks, sizeof(blocklist[0]), cmp_block_costs);

        ir_reserve_resources(irg, IR_RESOURCE_BLOCK_VISITED);
        inc_irg_block_visited(irg);

        for (size_t p = 0; p < n_blocks; ++p) {
                ir_node *block = blocklist[p];
                if (Block_block_visited(block))
                        continue;

                /* continually add predecessors with highest costs to worklist
                 * (without using backedges) */
                do {
                        block_costs_t *info       = (block_costs_t*)get_irn_link(block);
                        ir_node       *best_pred  = NULL;
                        float          best_costs = -1;
                        int            n_cfgpred  = get_Block_n_cfgpreds(block);

                        pdeq_putr(worklist, block);
                        mark_Block_block_visited(block);
                        for (int i = 0; i < n_cfgpred; ++i) {
                                ir_node       *pred_block = get_Block_cfgpred_block(block, i);
                                block_costs_t *pred_info  = (block_costs_t*)get_irn_link(pred_block);

                                /* ignore backedges */
                                if (pred_info->dfs_num > info->dfs_num)
                                        continue;

                                if (pred_info->costs > best_costs) {
                                        best_costs = pred_info->costs;
                                        best_pred  = pred_block;
                                }
                        }
                        block = best_pred;
                } while (block != NULL && !Block_block_visited(block));

                /* now put all nodes in the worklist in our final order */
                while (!pdeq_empty(worklist)) {
                        ir_node *pblock = (ir_node*)pdeq_getr(worklist);
                        assert(order_p < n_blocks);
                        order[order_p++] = pblock;
                }
        }
        assert(order_p == n_blocks);
        del_pdeq(worklist);

        ir_free_resources(irg, IR_RESOURCE_BLOCK_VISITED);

        DEL_ARR_F(blocklist);

        obstack_free(&obst, NULL);
        obstack_init(&obst);

        block_order   = order;
        n_block_order = n_blocks;
}
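
/* Sketch of the cost accumulation above (frequencies assumed): for a
 * diamond CFG A -> {B, C} -> D with execution frequencies A=1.0, B=0.7,
 * C=0.3 and D=1.0, the accumulated costs are
 *
 *     costs(A) = 1.0
 *     costs(B) = 0.7 + 1.0 = 1.7
 *     costs(C) = 0.3 + 1.0 = 1.3
 *     costs(D) = 1.0 + 1.7 + 1.3 = 4.0
 *
 * Sorting picks D first; the worklist walk climbs its most expensive
 * predecessor chain D -> B -> A and emits it in control-flow order
 * A, B, D, with C following afterwards. */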

static void free_block_order(void)
{
        xfree(block_order);
}

/**
 * Run the register allocator for the current register class.
 */
static void be_pref_alloc_cls(void)
{
        be_assure_live_sets(irg);
        lv = be_get_irg_liveness(irg);

        ir_reserve_resources(irg, IR_RESOURCE_IRN_LINK);

        DB((dbg, LEVEL_2, "=== Allocating registers of %s ===\n", cls->name));

        be_clear_links(irg);

        irg_block_walk_graph(irg, NULL, analyze_block, NULL);
        combine_congruence_classes();

        for (size_t i = 0; i < n_block_order; ++i) {
                ir_node *block = block_order[i];
                allocate_coalesce_block(block, NULL);
        }

        ir_free_resources(irg, IR_RESOURCE_IRN_LINK);
}

static void dump(int mask, ir_graph *irg, const char *suffix)
{
        if (be_options.dump_flags & mask)
                dump_ir_graph(irg, suffix);
}

/**
 * Run the spiller on the current graph.
 */
static void spill(void)
{
        /* make sure all nodes show their real register pressure */
        be_timer_push(T_RA_CONSTR);
        be_pre_spill_prepare_constr(irg, cls);
        be_timer_pop(T_RA_CONSTR);

        dump(DUMP_RA, irg, "spillprepare");

        /* spill */
        be_timer_push(T_RA_SPILL);
        be_do_spill(irg, cls);
        be_timer_pop(T_RA_SPILL);

        be_timer_push(T_RA_SPILL_APPLY);
        check_for_memory_operands(irg);
        be_timer_pop(T_RA_SPILL_APPLY);

        dump(DUMP_RA, irg, "spill");
}

/**
 * The pref register allocator for a whole procedure.
 */
static void be_pref_alloc(ir_graph *new_irg)
{
        obstack_init(&obst);

        irg = new_irg;

        /* determine a good coloring order */
        determine_block_order();

        const arch_env_t *arch_env = be_get_irg_arch_env(new_irg);
        int               n_cls    = arch_env->n_register_classes;
        for (int c = 0; c < n_cls; ++c) {
                cls = &arch_env->register_classes[c];
                if (arch_register_class_flags(cls) & arch_register_class_flag_manual_ra)
                        continue;

                stat_ev_ctx_push_str("regcls", cls->name);

                n_regs      = arch_register_class_n_regs(cls);
                normal_regs = rbitset_malloc(n_regs);
                be_set_allocatable_regs(irg, cls, normal_regs);

                spill();

                /* verify schedule and register pressure */
                be_timer_push(T_VERIFY);
                if (be_options.verify_option == BE_VERIFY_WARN) {
                        be_verify_schedule(irg);
                        be_verify_register_pressure(irg, cls);
                } else if (be_options.verify_option == BE_VERIFY_ASSERT) {
                        assert(be_verify_schedule(irg) && "Schedule verification failed");
                        assert(be_verify_register_pressure(irg, cls)
                                && "Register pressure verification failed");
                }
                be_timer_pop(T_VERIFY);

                be_timer_push(T_RA_COLOR);
                be_pref_alloc_cls();
                be_timer_pop(T_RA_COLOR);

                /* we most probably constructed new Phis so liveness info is invalid
                 * now */
                be_invalidate_live_sets(irg);
                free(normal_regs);

                stat_ev_ctx_pop("regcls");
        }

        free_block_order();

        be_timer_push(T_RA_SPILL_APPLY);
        be_abi_fix_stack_nodes(irg);
        be_timer_pop(T_RA_SPILL_APPLY);

        be_timer_push(T_VERIFY);
        if (be_options.verify_option == BE_VERIFY_WARN) {
                be_verify_register_allocation(irg);
        } else if (be_options.verify_option == BE_VERIFY_ASSERT) {
                assert(be_verify_register_allocation(irg)
                       && "Register allocation invalid");
        }
        be_timer_pop(T_VERIFY);

        obstack_free(&obst, NULL);
}

BE_REGISTER_MODULE_CONSTRUCTOR(be_init_pref_alloc)
void be_init_pref_alloc(void)
{
        static be_ra_t be_ra_pref = { be_pref_alloc };
        be_register_allocator("pref", &be_ra_pref);
        FIRM_DBG_REGISTER(dbg, "firm.be.prefalloc");
}