irgraph: Use get_irg_obstack() instead of accessing irg->obst directly.
[libfirm] / ir / be / beprefalloc.c
/*
 * Copyright (C) 1995-2011 University of Karlsruhe.  All rights reserved.
 *
 * This file is part of libFirm.
 *
 * This file may be distributed and/or modified under the terms of the
 * GNU General Public License version 2 as published by the Free Software
 * Foundation and appearing in the file LICENSE.GPL included in the
 * packaging of this file.
 *
 * Licensees holding valid libFirm Professional Edition licenses may use
 * this file in accordance with the libFirm Commercial License
 * Agreement provided with the Software.
 *
 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
 * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE.
 */

/**
 * @file
 * @brief       Preference Guided Register Assignment
 * @author      Matthias Braun
 * @date        14.2.2009
 *
 * The idea is to allocate registers in 2 passes:
 * 1. A first pass to determine "preferred" registers for live-ranges. This
 *    calculates for each register and each live-range a value indicating
 *    its usefulness. (You can roughly think of the value as the negative
 *    costs needed for copies when the value is in the specific register...)
 *
 * 2. Walk blocks and assign registers in a greedy fashion, preferring
 *    registers with high preferences. When register constraints are not met,
 *    add copies and split live-ranges.
 *
 * TODO:
 *  - make use of free registers in the permute_values code
 */
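
/* A hypothetical illustration of the two passes (numbers invented for this
 * sketch): in a two-register class {r0, r1}, a live-range that must sit in
 * r0 at one constrained use might end pass 1 with prefs == { r0: +2.0,
 * r1: -2.0 }; pass 2 then greedily picks r0 for it, and no fix-up copy is
 * needed at the constrained use. */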
#include "config.h"

#include <float.h>
#include <stdbool.h>
#include <math.h>
#include "lpp.h"

#include "error.h"
#include "execfreq.h"
#include "ircons.h"
#include "irdom.h"
#include "iredges_t.h"
#include "irgraph_t.h"
#include "irgwalk.h"
#include "irnode_t.h"
#include "irprintf.h"
#include "irdump.h"
#include "irtools.h"
#include "util.h"
#include "obst.h"
#include "raw_bitset.h"
#include "unionfind.h"
#include "pdeq.h"
#include "hungarian.h"
#include "statev.h"
#include "beabi.h"
#include "bechordal_t.h"
#include "be.h"
#include "beirg.h"
#include "belive_t.h"
#include "bemodule.h"
#include "benode.h"
#include "bera.h"
#include "besched.h"
#include "bespill.h"
#include "bespillutil.h"
#include "beverify.h"
#include "beutil.h"
#include "bestack.h"

#define USE_FACTOR                     1.0f
#define DEF_FACTOR                     1.0f
#define NEIGHBOR_FACTOR                0.2f
#define AFF_SHOULD_BE_SAME             0.5f
#define AFF_PHI                        1.0f
#define SPLIT_DELTA                    1.0f
#define MAX_OPTIMISTIC_SPLIT_RECURSION 0

DEBUG_ONLY(static firm_dbg_module_t *dbg = NULL;)

static struct obstack               obst;
static ir_graph                    *irg;
static const arch_register_class_t *cls;
static be_lv_t                     *lv;
static unsigned                     n_regs;
static unsigned                    *normal_regs;
static int                         *congruence_classes;
static ir_node                    **block_order;
static size_t                       n_block_order;

/** currently active assignments (while processing a basic block)
 * maps registers to values (their current copies) */
static ir_node **assignments;

/**
 * allocation information: last_uses, register preferences
 * the information is per firm-node.
 */
struct allocation_info_t {
        unsigned  last_uses[2];   /**< bitset indicating last uses (input pos) */
        ir_node  *current_value;  /**< copy of the value that should be used */
        ir_node  *original_value; /**< for copies point to original value */
        float     prefs[];        /**< register preferences */
};
typedef struct allocation_info_t allocation_info_t;

/** helper data structure used when sorting register preferences */
struct reg_pref_t {
        unsigned num;
        float    pref;
};
typedef struct reg_pref_t reg_pref_t;

/** per basic-block information */
struct block_info_t {
        bool     processed;       /**< indicates whether the block is processed */
        ir_node *assignments[];   /**< register assignments at end of block */
};
typedef struct block_info_t block_info_t;

/**
 * Get the allocation info for a node.
 * The info is allocated on the first visit of a node.
 */
static allocation_info_t *get_allocation_info(ir_node *node)
{
        allocation_info_t *info = (allocation_info_t*)get_irn_link(node);
        if (info == NULL) {
                info = OALLOCFZ(&obst, allocation_info_t, prefs, n_regs);
                info->current_value  = node;
                info->original_value = node;
                set_irn_link(node, info);
        }

        return info;
}

static allocation_info_t *try_get_allocation_info(const ir_node *node)
{
        return (allocation_info_t*) get_irn_link(node);
}

/**
 * Get allocation information for a basic block
 */
static block_info_t *get_block_info(ir_node *block)
{
        block_info_t *info = (block_info_t*)get_irn_link(block);

        assert(is_Block(block));
        if (info == NULL) {
                info = OALLOCFZ(&obst, block_info_t, assignments, n_regs);
                set_irn_link(block, info);
        }

        return info;
}

/**
 * Link the allocation info of a node to a copy.
 * Afterwards, both nodes use the same allocation info.
 * Copy must not have an allocation info assigned yet.
 *
 * @param copy   the node that gets the allocation info assigned
 * @param value  the original node
 */
static void mark_as_copy_of(ir_node *copy, ir_node *value)
{
        allocation_info_t *info      = get_allocation_info(value);
        allocation_info_t *copy_info = get_allocation_info(copy);

        /* find original value */
        ir_node *original = info->original_value;
        if (original != value) {
                info = get_allocation_info(original);
        }

        assert(info->original_value == original);
        info->current_value = copy;

        /* the copy should not be linked to something else yet */
        assert(copy_info->original_value == copy);
        copy_info->original_value = original;

        /* copy over allocation preferences */
        memcpy(copy_info->prefs, info->prefs, n_regs * sizeof(copy_info->prefs[0]));
}
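
/* A small worked example of the invariant (hypothetical nodes, a sketch):
 * starting from value v, mark_as_copy_of(c1, v) records original_value == v
 * in c1's info and sets v's current_value to c1; a later
 * mark_as_copy_of(c2, c1) first resolves c1 back to the original v, so c2
 * also records original_value == v while v's info now has
 * current_value == c2. rewire_inputs() below relies on this to always
 * redirect uses to the newest copy. */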

/**
 * Calculate the penalties for every register on a node and its live neighbors.
 *
 * @param live_nodes  the set of live nodes at the current position, may be NULL
 * @param penalty     the penalty to subtract from
 * @param limited     a raw bitset containing the limited set for the node
 * @param node        the node
 */
static void give_penalties_for_limits(const ir_nodeset_t *live_nodes,
                                      float penalty, const unsigned* limited,
                                      ir_node *node)
{
        allocation_info_t *info = get_allocation_info(node);

        /* give penalty for all forbidden regs */
        for (unsigned r = 0; r < n_regs; ++r) {
                if (rbitset_is_set(limited, r))
                        continue;

                info->prefs[r] -= penalty;
        }

        /* all other live values should get a penalty for allowed regs */
        if (live_nodes == NULL)
                return;

        penalty   *= NEIGHBOR_FACTOR;
        size_t n_allowed = rbitset_popcount(limited, n_regs);
        if (n_allowed > 1) {
                /* only create a very weak penalty if multiple regs are allowed */
                penalty = (penalty * 0.8f) / n_allowed;
        }
        foreach_ir_nodeset(live_nodes, neighbor, iter) {
                allocation_info_t *neighbor_info;

                /* TODO: if op is used at multiple inputs we might not want to
                 * continue here */
                if (neighbor == node)
                        continue;

                neighbor_info = get_allocation_info(neighbor);
                for (unsigned r = 0; r < n_regs; ++r) {
                        if (!rbitset_is_set(limited, r))
                                continue;

                        neighbor_info->prefs[r] -= penalty;
                }
        }
}
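
/* For illustration (invented numbers, a rough sketch): with n_regs == 4,
 * limited == {r2} and penalty == 1.0, the node itself loses 1.0 on the
 * forbidden registers r0, r1 and r3, while every live neighbor loses
 * 1.0 * NEIGHBOR_FACTOR == 0.2 on r2, nudging neighbors away from the one
 * register the constrained node actually needs. */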

/**
 * Calculate the preferences of a definition for the current register class.
 * If the definition uses a limited set of registers, reduce the preferences
 * for the limited register on the node and its neighbors.
 *
 * @param live_nodes  the set of live nodes at the current node
 * @param weight      the weight
 * @param node        the current node
 */
static void check_defs(ir_nodeset_t const *const live_nodes, float const weight, ir_node *const node, arch_register_req_t const *const req)
{
        if (arch_register_req_is(req, limited)) {
                const unsigned *limited = req->limited;
                float           penalty = weight * DEF_FACTOR;
                give_penalties_for_limits(live_nodes, penalty, limited, node);
        }

        if (arch_register_req_is(req, should_be_same)) {
                ir_node           *insn  = skip_Proj(node);
                allocation_info_t *info  = get_allocation_info(node);
                int                arity = get_irn_arity(insn);

                float factor = 1.0f / rbitset_popcount(&req->other_same, arity);
                for (int i = 0; i < arity; ++i) {
                        if (!rbitset_is_set(&req->other_same, i))
                                continue;

                        ir_node *op = get_irn_n(insn, i);

                        /* if the value at the should_be_same input doesn't die at the
                         * node, then it is no use to propagate the constraints (since a
                         * copy will emerge anyway) */
                        if (ir_nodeset_contains(live_nodes, op))
                                continue;

                        allocation_info_t *op_info = get_allocation_info(op);
                        for (unsigned r = 0; r < n_regs; ++r) {
                                op_info->prefs[r] += info->prefs[r] * factor;
                        }
                }
        }
}

/**
 * Walker: runs on a block and calculates the preferences for every
 * node and every register from the considered register class.
 */
static void analyze_block(ir_node *block, void *data)
{
        float        weight = (float)get_block_execfreq(block);
        ir_nodeset_t live_nodes;
        (void) data;

        ir_nodeset_init(&live_nodes);
        be_liveness_end_of_block(lv, cls, block, &live_nodes);

        sched_foreach_reverse(block, node) {
                if (is_Phi(node))
                        break;

                be_foreach_definition(node, cls, value, req,
                        check_defs(&live_nodes, weight, value, req);
                );

                /* mark last uses */
                int arity = get_irn_arity(node);

                /* the allocation info currently uses a fixed-size bitset
                   (last_uses) to mark last used inputs. So we will fail for a
                   node with more inputs than bits in that bitset. */
                allocation_info_t *info = get_allocation_info(node);
                if (arity >= (int) sizeof(info->last_uses) * 8) {
                        panic("Node with more than %d inputs not supported yet",
                                        (int) sizeof(info->last_uses) * 8);
                }

                for (int i = 0; i < arity; ++i) {
                        ir_node                   *op  = get_irn_n(node, i);
                        const arch_register_req_t *req = arch_get_irn_register_req(op);
                        if (req->cls != cls)
                                continue;

                        /* last usage of a value? */
                        if (!ir_nodeset_contains(&live_nodes, op)) {
                                rbitset_set(info->last_uses, i);
                        }
                }

                be_liveness_transfer(cls, node, &live_nodes);

                /* update weights based on usage constraints */
                be_foreach_use(node, cls, req, op, op_req,
                        if (!arch_register_req_is(req, limited))
                                continue;

                        give_penalties_for_limits(&live_nodes, weight * USE_FACTOR, req->limited, op);
                );
        }

        ir_nodeset_destroy(&live_nodes);
}

static void congruence_def(ir_nodeset_t *const live_nodes, ir_node const *const node, arch_register_req_t const *const req)
{
        /* should be same constraint? */
        if (arch_register_req_is(req, should_be_same)) {
                const ir_node *insn     = skip_Proj_const(node);
                int            arity    = get_irn_arity(insn);
                unsigned       node_idx = get_irn_idx(node);
                node_idx = uf_find(congruence_classes, node_idx);

                for (int i = 0; i < arity; ++i) {
                        if (!rbitset_is_set(&req->other_same, i))
                                continue;

                        ir_node *op     = get_irn_n(insn, i);
                        int      op_idx = get_irn_idx(op);
                        op_idx = uf_find(congruence_classes, op_idx);

                        /* do we interfere with the value */
                        bool interferes = false;
                        foreach_ir_nodeset(live_nodes, live, iter) {
                                int lv_idx = get_irn_idx(live);
                                lv_idx     = uf_find(congruence_classes, lv_idx);
                                if (lv_idx == op_idx) {
                                        interferes = true;
                                        break;
                                }
                        }
                        /* don't put in same affinity class if we interfere */
                        if (interferes)
                                continue;

                        uf_union(congruence_classes, node_idx, op_idx);
                        DB((dbg, LEVEL_3, "Merge %+F and %+F congruence classes\n",
                            node, op));
                        /* one should_be_same is enough... */
                        break;
                }
        }
}

static void create_congruence_class(ir_node *block, void *data)
{
        ir_nodeset_t live_nodes;

        (void) data;
        ir_nodeset_init(&live_nodes);
        be_liveness_end_of_block(lv, cls, block, &live_nodes);

        /* check should be same constraints */
        ir_node *last_phi = NULL;
        sched_foreach_reverse(block, node) {
                if (is_Phi(node)) {
                        last_phi = node;
                        break;
                }

                be_foreach_definition(node, cls, value, req,
                        congruence_def(&live_nodes, value, req);
                );
                be_liveness_transfer(cls, node, &live_nodes);
        }
        if (!last_phi) {
                ir_nodeset_destroy(&live_nodes);
                return;
        }

        /* check phi congruence classes */
        sched_foreach_reverse_from(last_phi, phi) {
                assert(is_Phi(phi));

                if (!arch_irn_consider_in_reg_alloc(cls, phi))
                        continue;

                int node_idx = get_irn_idx(phi);
                node_idx = uf_find(congruence_classes, node_idx);

                int arity = get_irn_arity(phi);
                for (int i = 0; i < arity; ++i) {
                        ir_node *op     = get_Phi_pred(phi, i);
                        int      op_idx = get_irn_idx(op);
                        op_idx = uf_find(congruence_classes, op_idx);

                        /* do we interfere with the value */
                        bool interferes = false;
                        foreach_ir_nodeset(&live_nodes, live, iter) {
                                int lv_idx = get_irn_idx(live);
                                lv_idx     = uf_find(congruence_classes, lv_idx);
                                if (lv_idx == op_idx) {
                                        interferes = true;
                                        break;
                                }
                        }
                        /* don't put in same affinity class if we interfere */
                        if (interferes)
                                continue;
                        /* any other phi has the same input? */
                        sched_foreach(block, phi) {
                                ir_node *oop;
                                int      oop_idx;
                                if (!is_Phi(phi))
                                        break;
                                if (!arch_irn_consider_in_reg_alloc(cls, phi))
                                        continue;
                                oop = get_Phi_pred(phi, i);
                                if (oop == op)
                                        continue;
                                oop_idx = get_irn_idx(oop);
                                oop_idx = uf_find(congruence_classes, oop_idx);
                                if (oop_idx == op_idx) {
                                        interferes = true;
                                        break;
                                }
                        }
                        if (interferes)
                                continue;

                        /* merge the 2 congruence classes and sum up their preferences */
                        int old_node_idx = node_idx;
                        node_idx = uf_union(congruence_classes, node_idx, op_idx);
                        DB((dbg, LEVEL_3, "Merge %+F and %+F congruence classes\n",
                            phi, op));

                        old_node_idx = node_idx == old_node_idx ? op_idx : old_node_idx;
                        allocation_info_t *head_info
                                = get_allocation_info(get_idx_irn(irg, node_idx));
                        allocation_info_t *other_info
                                = get_allocation_info(get_idx_irn(irg, old_node_idx));
                        for (unsigned r = 0; r < n_regs; ++r) {
                                head_info->prefs[r] += other_info->prefs[r];
                        }
                }
        }
        ir_nodeset_destroy(&live_nodes);
}

static void set_congruence_prefs(ir_node *node, void *data)
{
        (void) data;
        unsigned node_idx = get_irn_idx(node);
        unsigned node_set = uf_find(congruence_classes, node_idx);

        /* head of congruence class or not in any class */
        if (node_set == node_idx)
                return;

        if (!arch_irn_consider_in_reg_alloc(cls, node))
                return;

        ir_node *head = get_idx_irn(irg, node_set);
        allocation_info_t *head_info = get_allocation_info(head);
        allocation_info_t *info      = get_allocation_info(node);

        memcpy(info->prefs, head_info->prefs, n_regs * sizeof(info->prefs[0]));
}

static void combine_congruence_classes(void)
{
        size_t n = get_irg_last_idx(irg);
        congruence_classes = XMALLOCN(int, n);
        uf_init(congruence_classes, n);

        /* create congruence classes */
        irg_block_walk_graph(irg, create_congruence_class, NULL, NULL);
        /* merge preferences */
        irg_walk_graph(irg, set_congruence_prefs, NULL, NULL);
        free(congruence_classes);
}
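
/* A sketch of the intended effect (hypothetical graph): for
 *   x = Phi(a, b)
 * where neither a nor b interferes with x, the union-find merges
 * {x, a, b} into one congruence class, their preference vectors are summed
 * at the class head, and set_congruence_prefs() copies the summed vector
 * back to every member, so all three are biased towards the same register
 * and the phi needs no copies on its incoming edges. */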



/**
 * Assign register reg to the given node.
 *
 * @param node  the node
 * @param reg   the register
 */
static void use_reg(ir_node *node, const arch_register_t *reg, unsigned width)
{
        unsigned r = reg->index;
        for (unsigned r0 = r; r0 < r + width; ++r0)
                assignments[r0] = node;
        arch_set_irn_register(node, reg);
}

static void free_reg_of_value(ir_node *node)
{
        if (!arch_irn_consider_in_reg_alloc(cls, node))
                return;

        const arch_register_t     *reg = arch_get_irn_register(node);
        const arch_register_req_t *req = arch_get_irn_register_req(node);
        unsigned                   r   = reg->index;
        /* assignments[r0] may be NULL if a value is used at 2 inputs,
         * so it gets freed twice. */
        for (unsigned r0 = r; r0 < r + req->width; ++r0) {
                assert(assignments[r0] == node || assignments[r0] == NULL);
                assignments[r0] = NULL;
        }
}

/**
 * Compare two register preferences in decreasing order.
 */
static int compare_reg_pref(const void *e1, const void *e2)
{
        const reg_pref_t *rp1 = (const reg_pref_t*) e1;
        const reg_pref_t *rp2 = (const reg_pref_t*) e2;
        if (rp1->pref < rp2->pref)
                return 1;
        if (rp1->pref > rp2->pref)
                return -1;
        return 0;
}

static void fill_sort_candidates(reg_pref_t *regprefs,
                                 const allocation_info_t *info)
{
        for (unsigned r = 0; r < n_regs; ++r) {
                float pref = info->prefs[r];
                regprefs[r].num  = r;
                regprefs[r].pref = pref;
        }
        /* TODO: use a stable sort here to avoid unnecessary register jumping */
        qsort(regprefs, n_regs, sizeof(regprefs[0]), compare_reg_pref);
}
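
/* Example (invented preference values): prefs == { r0: 1.5, r1: 3.0,
 * r2: -2.0 } sorts to the candidate order r1, r0, r2, so the assignment
 * loops below always probe the most preferred register first. */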

static bool try_optimistic_split(ir_node *to_split, ir_node *before,
                                 float pref, float pref_delta,
                                 unsigned *forbidden_regs, int recursion)
{
        (void) pref;
        unsigned           r = 0;
        allocation_info_t *info = get_allocation_info(to_split);
        float              delta = 0;

        /* stupid hack: don't optimistically split dont_spill nodes...
         * (so we don't split away the values produced because of
         *  must_be_different constraints) */
        ir_node *original_insn = skip_Proj(info->original_value);
        if (arch_get_irn_flags(original_insn) & arch_irn_flags_dont_spill)
                return false;

        const arch_register_t *from_reg        = arch_get_irn_register(to_split);
        unsigned               from_r          = from_reg->index;
        ir_node               *block           = get_nodes_block(before);
        float split_threshold = (float)get_block_execfreq(block) * SPLIT_DELTA;

        if (pref_delta < split_threshold*0.5)
                return false;

        /* find the best free position where we could move to */
        reg_pref_t *prefs = ALLOCAN(reg_pref_t, n_regs);
        fill_sort_candidates(prefs, info);
        unsigned i;
        for (i = 0; i < n_regs; ++i) {
                /* we need a normal register which is not an output register
                   and different from the current register of to_split */
                r = prefs[i].num;
                if (!rbitset_is_set(normal_regs, r))
                        continue;
                if (rbitset_is_set(forbidden_regs, r))
                        continue;
                if (r == from_r)
                        continue;

                /* is the split worth it? */
                delta = pref_delta + prefs[i].pref;
                if (delta < split_threshold) {
                        DB((dbg, LEVEL_3, "Not doing optimistic split of %+F (depth %d), win %f too low\n",
                                to_split, recursion, delta));
                        return false;
                }

                /* if the register is free then we can do the split */
                if (assignments[r] == NULL)
                        break;

                /* otherwise we might try recursively calling optimistic_split */
                if (recursion+1 > MAX_OPTIMISTIC_SPLIT_RECURSION)
                        continue;

                float apref        = prefs[i].pref;
                float apref_delta  = i+1 < n_regs ? apref - prefs[i+1].pref : 0;
                apref_delta += pref_delta - split_threshold;

                /* our source register isn't a useful destination for recursive
                   splits */
                bool old_source_state = rbitset_is_set(forbidden_regs, from_r);
                rbitset_set(forbidden_regs, from_r);
                /* try recursive split */
                bool res = try_optimistic_split(assignments[r], before, apref,
                                              apref_delta, forbidden_regs, recursion+1);
                /* restore our destination */
                if (old_source_state) {
                        rbitset_set(forbidden_regs, from_r);
                } else {
                        rbitset_clear(forbidden_regs, from_r);
                }

                if (res)
                        break;
        }
        if (i >= n_regs)
                return false;

        const arch_register_t *reg   = arch_register_for_index(cls, r);
        ir_node               *copy  = be_new_Copy(block, to_split);
        unsigned               width = 1;
        mark_as_copy_of(copy, to_split);
        /* hacky, but correct here */
        if (assignments[from_reg->index] == to_split)
                free_reg_of_value(to_split);
        use_reg(copy, reg, width);
        sched_add_before(before, copy);

        DB((dbg, LEVEL_3,
            "Optimistic live-range split %+F move %+F(%s) -> %s before %+F (win %f, depth %d)\n",
            copy, to_split, from_reg->name, reg->name, before, delta, recursion));
        return true;
}
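
/* A sketch of the scenario this handles (hypothetical registers): a new
 * value wants r0, but r0 is occupied by a long-lived value v whose
 * preference for r0 only barely beats its preference for the free register
 * r1. If the weighted win (pref_delta plus v's preference for r1) exceeds
 * the execution-frequency based split_threshold, a Copy of v into r1 is
 * scheduled before the constrained node and r0 becomes available. */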

/**
 * Determine and assign a register for node @p node
 */
static void assign_reg(ir_node const *const block, ir_node *const node, arch_register_req_t const *const req, unsigned *const forbidden_regs)
{
        assert(!is_Phi(node));
        /* preassigned register? */
        arch_register_t const *final_reg = arch_get_irn_register(node);
        unsigned         const width     = req->width;
        if (final_reg != NULL) {
                DB((dbg, LEVEL_2, "Preassignment %+F -> %s\n", node, final_reg->name));
                use_reg(node, final_reg, width);
                return;
        }

        /* ignore reqs must be preassigned */
        assert(!arch_register_req_is(req, ignore));

        /* give should_be_same bonus */
        allocation_info_t *info    = get_allocation_info(node);
        ir_node           *in_node = skip_Proj(node);
        if (arch_register_req_is(req, should_be_same)) {
                float weight = (float)get_block_execfreq(block);
                int   arity  = get_irn_arity(in_node);

                assert(arity <= (int) sizeof(req->other_same) * 8);
                for (int i = 0; i < arity; ++i) {
                        if (!rbitset_is_set(&req->other_same, i))
                                continue;

                        ir_node               *in        = get_irn_n(in_node, i);
                        const arch_register_t *reg       = arch_get_irn_register(in);
                        unsigned               reg_index = reg->index;

                        /* if the value didn't die here then we should not propagate the
                         * should_be_same info */
                        if (assignments[reg_index] == in)
                                continue;

                        info->prefs[reg_index] += weight * AFF_SHOULD_BE_SAME;
                }
        }

        /* create list of register candidates and sort by their preference */
        DB((dbg, LEVEL_2, "Candidates for %+F:", node));
        reg_pref_t *reg_prefs = ALLOCAN(reg_pref_t, n_regs);
        fill_sort_candidates(reg_prefs, info);
        for (unsigned r = 0; r < n_regs; ++r) {
                unsigned num = reg_prefs[r].num;
                if (!rbitset_is_set(normal_regs, num))
                        continue;
                const arch_register_t *reg = arch_register_for_index(cls, num);
                DB((dbg, LEVEL_2, " %s(%f)", reg->name, reg_prefs[r].pref));
        }
        DB((dbg, LEVEL_2, "\n"));

        const unsigned *allowed_regs = normal_regs;
        if (arch_register_req_is(req, limited)) {
                allowed_regs = req->limited;
        }

        unsigned final_reg_index = 0;
        unsigned r;
        for (r = 0; r < n_regs; ++r) {
                final_reg_index = reg_prefs[r].num;
                if (!rbitset_is_set(allowed_regs, final_reg_index))
                        continue;
                /* alignment constraint? */
                if (width > 1) {
                        if (arch_register_req_is(req, aligned) && (final_reg_index % width) != 0)
                                continue;
                        bool fine = true;
                        for (unsigned r0 = r+1; r0 < r+width; ++r0) {
                                if (assignments[r0] != NULL)
                                        fine = false;
                        }
                        /* TODO: attempt optimistic split here */
                        if (!fine)
                                continue;
                }

                if (assignments[final_reg_index] == NULL)
                        break;
                float    pref   = reg_prefs[r].pref;
                float    delta  = r+1 < n_regs ? pref - reg_prefs[r+1].pref : 0;
                ir_node *before = skip_Proj(node);
                bool     res
                        = try_optimistic_split(assignments[final_reg_index], before, pref,
                                               delta, forbidden_regs, 0);
                if (res)
                        break;
        }
        if (r >= n_regs) {
                /* the common reason to hit this panic is when one of your nodes is
                 * not register pressure faithful */
                panic("No register left for %+F\n", node);
        }

        final_reg = arch_register_for_index(cls, final_reg_index);
        DB((dbg, LEVEL_2, "Assign %+F -> %s\n", node, final_reg->name));
        use_reg(node, final_reg, width);
}

/**
 * Add a permutation in front of a node and change the assignments
 * due to this permutation.
 *
 * To understand this imagine a permutation like this:
 *
 * 1 -> 2
 * 2 -> 3
 * 3 -> 1, 5
 * 4 -> 6
 * 5
 * 6
 * 7 -> 7
 *
 * First we count how many destinations a single value has. At the same time
 * we can be sure that each destination register has at most 1 source register
 * (it can have 0 which means we don't care what value is in it).
 * We ignore all fulfilled permutations (like 7->7).
 * In a first pass we create as many copy instructions as possible, as they
 * are generally cheaper than exchanges. We do this by counting into how many
 * destinations a register has to be copied (in the example it's 2 for register
 * 3, or 1 for the registers 1, 2, 4 and 7).
 * We can then create a copy into every destination register whose use count
 * is 0 (= no one else needs the value currently in the register).
 *
 * After this step we should only have cycles left. We implement a cyclic
 * permutation of n registers with n-1 transpositions.
 *
 * @param live_nodes   the set of live nodes, updated due to live range splits
 * @param before       the node before which we add the permutation
 * @param permutation  the permutation array: indices are the destination
 *                     registers, the values in the array are the source
 *                     registers.
 */
static void permute_values(ir_nodeset_t *live_nodes, ir_node *before,
                           unsigned *permutation)
{
        unsigned *n_used = ALLOCANZ(unsigned, n_regs);

        /* determine how often each source register needs to be read */
        for (unsigned r = 0; r < n_regs; ++r) {
                unsigned  old_reg = permutation[r];
                ir_node  *value;

                value = assignments[old_reg];
                if (value == NULL) {
                        /* nothing to do here, reg is not live. Mark it as fixpoint
                         * so we ignore it in the next steps */
                        permutation[r] = r;
                        continue;
                }

                ++n_used[old_reg];
        }

        ir_node *block = get_nodes_block(before);

        /* step1: create copies where immediately possible */
        for (unsigned r = 0; r < n_regs; /* empty */) {
                unsigned old_r = permutation[r];

                /* - no need to do anything for fixed points.
                   - we can't copy if the value in the dest reg is still needed */
                if (old_r == r || n_used[r] > 0) {
                        ++r;
                        continue;
                }

                /* create a copy */
                ir_node *src  = assignments[old_r];
                ir_node *copy = be_new_Copy(block, src);
                sched_add_before(before, copy);
                const arch_register_t *reg = arch_register_for_index(cls, r);
                DB((dbg, LEVEL_2, "Copy %+F (from %+F, before %+F) -> %s\n",
                    copy, src, before, reg->name));
                mark_as_copy_of(copy, src);
                unsigned width = 1; /* TODO */
                use_reg(copy, reg, width);

                if (live_nodes != NULL) {
                        ir_nodeset_insert(live_nodes, copy);
                }

                /* old register has 1 user less, permutation is resolved */
                assert(arch_get_irn_register(src)->index == old_r);
                permutation[r] = r;

                assert(n_used[old_r] > 0);
                --n_used[old_r];
                if (n_used[old_r] == 0) {
                        if (live_nodes != NULL) {
                                ir_nodeset_remove(live_nodes, src);
                        }
                        free_reg_of_value(src);
                }

                /* advance or jump back (if this copy enabled another copy) */
                if (old_r < r && n_used[old_r] == 0) {
                        r = old_r;
                } else {
                        ++r;
                }
        }

        /* at this point we only have "cycles" left which we have to resolve with
         * perm instructions
         * TODO: if we have free registers left, then we should really use copy
         * instructions for any cycle longer than 2 registers...
         * (this is probably architecture dependent, there might be archs where
         *  copies are preferable even for 2-cycles) */

        /* create perms with the rest */
        for (unsigned r = 0; r < n_regs; /* empty */) {
                unsigned old_r = permutation[r];

                if (old_r == r) {
                        ++r;
                        continue;
                }

                /* we shouldn't have copies from 1 value to multiple destinations left */
                assert(n_used[old_r] == 1);

                /* exchange old_r and r2; after that old_r is a fixed point */
                unsigned r2 = permutation[old_r];

                ir_node *in[2] = { assignments[r2], assignments[old_r] };
                ir_node *perm = be_new_Perm(cls, block, 2, in);
                sched_add_before(before, perm);
                DB((dbg, LEVEL_2, "Perm %+F (perm %+F,%+F, before %+F)\n",
                    perm, in[0], in[1], before));

                unsigned width = 1; /* TODO */

                ir_node *proj0 = new_r_Proj(perm, get_irn_mode(in[0]), 0);
                mark_as_copy_of(proj0, in[0]);
                const arch_register_t *reg0 = arch_register_for_index(cls, old_r);
                use_reg(proj0, reg0, width);

                ir_node *proj1 = new_r_Proj(perm, get_irn_mode(in[1]), 1);
                mark_as_copy_of(proj1, in[1]);
                const arch_register_t *reg1 = arch_register_for_index(cls, r2);
                use_reg(proj1, reg1, width);

                /* 1 value is now in the correct register */
                permutation[old_r] = old_r;
                /* the source of r changed to r2 */
                permutation[r] = r2;

                /* if we have reached a fixpoint update data structures */
                if (live_nodes != NULL) {
                        ir_nodeset_remove(live_nodes, in[0]);
                        ir_nodeset_remove(live_nodes, in[1]);
                        ir_nodeset_remove(live_nodes, proj0);
                        ir_nodeset_insert(live_nodes, proj1);
                }
        }

#ifndef NDEBUG
        /* now we should only have fixpoints left */
        for (unsigned r = 0; r < n_regs; ++r) {
                assert(permutation[r] == r);
        }
#endif
}
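
/* Tracing the example from the comment above (one possible order; the
 * exact emission order depends on the scan): copies 3->5 and 4->6 are
 * created first, since their destination registers hold no still-needed
 * value. After that only the cycle 1->2->3->1 remains, which is resolved
 * with two Perm nodes, matching the rule that an n-cycle needs n-1
 * transpositions. */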

/**
 * Free regs for values last used.
 *
 * @param live_nodes   set of live nodes, will be updated
 * @param node         the node to consider
 */
static void free_last_uses(ir_nodeset_t *live_nodes, ir_node *node)
{
        allocation_info_t *info      = get_allocation_info(node);
        const unsigned    *last_uses = info->last_uses;
        int                arity     = get_irn_arity(node);

        for (int i = 0; i < arity; ++i) {
                /* check if one operand is the last use */
                if (!rbitset_is_set(last_uses, i))
                        continue;

                ir_node *op = get_irn_n(node, i);
                free_reg_of_value(op);
                ir_nodeset_remove(live_nodes, op);
        }
}

/**
 * change inputs of a node to the current value (copies/perms)
 */
static void rewire_inputs(ir_node *node)
{
        int arity = get_irn_arity(node);
        for (int i = 0; i < arity; ++i) {
                ir_node           *op = get_irn_n(node, i);
                allocation_info_t *info = try_get_allocation_info(op);

                if (info == NULL)
                        continue;

                info = get_allocation_info(info->original_value);
                if (info->current_value != op) {
                        set_irn_n(node, i, info->current_value);
                }
        }
}

/**
 * Create a bitset of registers occupied by values living through an
 * instruction
 */
static void determine_live_through_regs(unsigned *bitset, ir_node *node)
{
        const allocation_info_t *info = get_allocation_info(node);

        /* mark all used registers as potentially live-through */
        for (unsigned r = 0; r < n_regs; ++r) {
                if (assignments[r] == NULL)
                        continue;
                if (!rbitset_is_set(normal_regs, r))
                        continue;

                rbitset_set(bitset, r);
        }

        /* remove registers of values dying at the instruction */
        int arity = get_irn_arity(node);
        for (int i = 0; i < arity; ++i) {
                if (!rbitset_is_set(info->last_uses, i))
                        continue;

                ir_node               *op  = get_irn_n(node, i);
                const arch_register_t *reg = arch_get_irn_register(op);
                rbitset_clear(bitset, reg->index);
        }
}

static void solve_lpp(ir_nodeset_t *live_nodes, ir_node *node,
                      unsigned *forbidden_regs, unsigned *live_through_regs)
{
        unsigned *forbidden_edges = rbitset_malloc(n_regs * n_regs);
        int      *lpp_vars        = XMALLOCNZ(int, n_regs*n_regs);

        lpp_t *lpp = lpp_new("prefalloc", lpp_minimize);
        //lpp_set_time_limit(lpp, 20);
        lpp_set_log(lpp, stdout);

        /* mark some edges as forbidden */
        be_foreach_use(node, cls, req, op, op_req,
                if (!arch_register_req_is(req, limited))
                        continue;

                const unsigned        *limited     = req->limited;
                const arch_register_t *reg         = arch_get_irn_register(op);
                unsigned               current_reg = reg->index;
                for (unsigned r = 0; r < n_regs; ++r) {
                        if (rbitset_is_set(limited, r))
                                continue;

                        rbitset_set(forbidden_edges, current_reg*n_regs + r);
                }
        );

        /* add all combinations, except for not allowed ones */
        for (unsigned l = 0; l < n_regs; ++l) {
                if (!rbitset_is_set(normal_regs, l)) {
                        char name[15];
                        snprintf(name, sizeof(name), "%u_to_%u", l, l);
                        lpp_vars[l*n_regs+l] = lpp_add_var(lpp, name, lpp_binary, 1);
                        continue;
                }

                for (unsigned r = 0; r < n_regs; ++r) {
                        if (!rbitset_is_set(normal_regs, r))
                                continue;
                        if (rbitset_is_set(forbidden_edges, l*n_regs + r))
                                continue;
                        /* livethrough values may not use constrained output registers */
                        if (rbitset_is_set(live_through_regs, l)
                            && rbitset_is_set(forbidden_regs, r))
                                continue;

                        char name[15];
                        snprintf(name, sizeof(name), "%u_to_%u", l, r);

                        double costs = l==r ? 9 : 8;
                        lpp_vars[l*n_regs+r]
                                = lpp_add_var(lpp, name, lpp_binary, costs);
                        assert(lpp_vars[l*n_regs+r] > 0);
                }
        }
        /* add constraints */
        for (unsigned l = 0; l < n_regs; ++l) {
                /* only 1 destination per register */
                int constraint = -1;
                for (unsigned r = 0; r < n_regs; ++r) {
                        int var = lpp_vars[l*n_regs+r];
                        if (var == 0)
                                continue;
                        if (constraint < 0) {
                                char name[64];
                                snprintf(name, sizeof(name), "%u_to_dest", l);
                                constraint = lpp_add_cst(lpp, name, lpp_equal, 1);
                        }
                        lpp_set_factor_fast(lpp, constraint, var, 1);
                }
                /* each destination used by at most 1 value */
                constraint = -1;
                for (unsigned r = 0; r < n_regs; ++r) {
                        int var = lpp_vars[r*n_regs+l];
                        if (var == 0)
                                continue;
                        if (constraint < 0) {
                                char name[64];
                                snprintf(name, sizeof(name), "one_to_%u", l);
                                constraint = lpp_add_cst(lpp, name, lpp_less_equal, 1);
                        }
                        lpp_set_factor_fast(lpp, constraint, var, 1);
                }
        }

        lpp_dump_plain(lpp, fopen("lppdump.txt", "w"));

        /* solve lpp */
        lpp_solve(lpp, be_options.ilp_server, be_options.ilp_solver);
        if (!lpp_is_sol_valid(lpp))
                panic("ilp solution not valid!");

        unsigned *assignment = ALLOCAN(unsigned, n_regs);
        for (unsigned l = 0; l < n_regs; ++l) {
                unsigned dest_reg = (unsigned)-1;
                for (unsigned r = 0; r < n_regs; ++r) {
                        int var = lpp_vars[l*n_regs+r];
                        if (var == 0)
                                continue;
                        double val = lpp_get_var_sol(lpp, var);
                        if (val == 1) {
                                assert(dest_reg == (unsigned)-1);
                                dest_reg = r;
                        }
                }
                assert(dest_reg != (unsigned)-1);
                assignment[dest_reg] = l;
        }

        fprintf(stderr, "Assignment: ");
        for (unsigned l = 0; l < n_regs; ++l) {
                fprintf(stderr, "%u ", assignment[l]);
        }
        fprintf(stderr, "\n");
        fflush(stdout);
        permute_values(live_nodes, node, assignment);
        lpp_free(lpp);
}

static bool is_aligned(unsigned num, unsigned alignment)
{
        unsigned mask = alignment-1;
        assert(is_po2(alignment));
        return (num&mask) == 0;
}
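
/* e.g. is_aligned(4, 2) is true while is_aligned(5, 2) is false; the
 * alignment must be a power of two, as the assert checks. */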

/**
 * Enforce constraints at a node by live range splits.
 *
 * @param  live_nodes  the set of live nodes, might be changed
 * @param  node        the current node
 */
static void enforce_constraints(ir_nodeset_t *live_nodes, ir_node *node,
                                unsigned *forbidden_regs)
{
        /* see if any use constraints are not met and whether double-width
         * values are involved */
        bool double_width = false;
        bool good = true;
        be_foreach_use(node, cls, req, op, op_req,
                /* are there any limitations for the i'th operand? */
                if (req->width > 1)
                        double_width = true;
                const arch_register_t *reg       = arch_get_irn_register(op);
                unsigned               reg_index = reg->index;
                if (arch_register_req_is(req, aligned)) {
                        if (!is_aligned(reg_index, req->width)) {
                                good = false;
                                continue;
                        }
                }
                if (!arch_register_req_is(req, limited))
                        continue;

                const unsigned *limited = req->limited;
                if (!rbitset_is_set(limited, reg_index)) {
                        /* found an assignment outside the limited set */
                        good = false;
                        continue;
                }
        );

        /* is any of the live-throughs using a constrained output register? */
        unsigned *live_through_regs = NULL;
        be_foreach_definition(node, cls, value, req,
                (void)value;
                if (req->width > 1)
                        double_width = true;
                if (!arch_register_req_is(req, limited))
                        continue;
                if (live_through_regs == NULL) {
                        live_through_regs = rbitset_alloca(n_regs);
                        determine_live_through_regs(live_through_regs, node);
                }
                rbitset_or(forbidden_regs, req->limited, n_regs);
                if (rbitsets_have_common(req->limited, live_through_regs, n_regs))
                        good = false;
        );

        if (good)
                return;

        /* create these arrays if we haven't yet */
        if (live_through_regs == NULL) {
                live_through_regs = rbitset_alloca(n_regs);
        }

        if (double_width) {
                /* only the ILP variant can solve this yet */
                solve_lpp(live_nodes, node, forbidden_regs, live_through_regs);
                return;
        }

        /* at this point we have to construct a bipartite matching problem to see
         * which values should go to which registers
         * Note: We're building the matrix in "reverse" - source registers are
         *       right, destinations left because this will produce the solution
         *       in the format required for permute_values.
         */
        hungarian_problem_t *bp
                = hungarian_new(n_regs, n_regs, HUNGARIAN_MATCH_PERFECT);

        /* add all combinations, then remove not allowed ones */
        for (unsigned l = 0; l < n_regs; ++l) {
                if (!rbitset_is_set(normal_regs, l)) {
                        hungarian_add(bp, l, l, 1);
                        continue;
                }

                for (unsigned r = 0; r < n_regs; ++r) {
                        if (!rbitset_is_set(normal_regs, r))
                                continue;
                        /* livethrough values may not use constrained output registers */
                        if (rbitset_is_set(live_through_regs, l)
                                        && rbitset_is_set(forbidden_regs, r))
                                continue;

                        hungarian_add(bp, r, l, l == r ? 9 : 8);
                }
        }

        be_foreach_use(node, cls, req, op, op_req,
                if (!arch_register_req_is(req, limited))
                        continue;

                const unsigned        *limited     = req->limited;
                const arch_register_t *reg         = arch_get_irn_register(op);
                unsigned               current_reg = reg->index;
                for (unsigned r = 0; r < n_regs; ++r) {
                        if (rbitset_is_set(limited, r))
                                continue;
                        hungarian_remove(bp, r, current_reg);
                }
        );

        //hungarian_print_cost_matrix(bp, 1);
        hungarian_prepare_cost_matrix(bp, HUNGARIAN_MODE_MAXIMIZE_UTIL);

        unsigned *assignment = ALLOCAN(unsigned, n_regs);
        int res = hungarian_solve(bp, assignment, NULL, 0);
        assert(res == 0);

#if 0
        fprintf(stderr, "Swap result:");
        for (i = 0; i < (int) n_regs; ++i) {
                fprintf(stderr, " %d", assignment[i]);
        }
        fprintf(stderr, "\n");
#endif

        hungarian_free(bp);

        permute_values(live_nodes, node, assignment);
}
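
/* A sketch of the matching step (hypothetical two-register class): if a use
 * is constrained to {r0} but its value currently sits in r1, the
 * be_foreach_use loop above removes every edge from source r1 to a
 * destination outside {r0}, so the perfect matching must route r1's value
 * to r0 and the solver returns assignment[r0] == r1, i.e. indices are
 * destination registers and values are source registers, exactly the format
 * permute_values() expects. */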

/** test whether @p test_value is a copy of the value of node @p value */
static bool is_copy_of(ir_node *value, ir_node *test_value)
{
        if (value == test_value)
                return true;

        allocation_info_t *info      = get_allocation_info(value);
        allocation_info_t *test_info = get_allocation_info(test_value);
        return test_info->original_value == info->original_value;
}

/**
 * find a value in the end-assignment of a basic block
 * @returns the index into the assignment array if found
 *          -1 if not found
 */
static int find_value_in_block_info(block_info_t *info, ir_node *value)
{
        ir_node  **end_assignments = info->assignments;
        for (unsigned r = 0; r < n_regs; ++r) {
                ir_node *a_value = end_assignments[r];

                if (a_value == NULL)
                        continue;
                if (is_copy_of(a_value, value))
                        return (int) r;
        }

        return -1;
}
1296
1297 /**
1298  * Create the necessary permutations at the end of a basic block to fullfill
1299  * the register assignment for phi-nodes in the next block
1300  */
1301 static void add_phi_permutations(ir_node *block, int p)
1302 {
1303         ir_node      *pred      = get_Block_cfgpred_block(block, p);
1304         block_info_t *pred_info = get_block_info(pred);
1305
1306         /* predecessor not processed yet? nothing to do */
1307         if (!pred_info->processed)
1308                 return;
1309
1310         unsigned *permutation = ALLOCAN(unsigned, n_regs);
1311         for (unsigned r = 0; r < n_regs; ++r) {
1312                 permutation[r] = r;
1313         }
1314
1315         /* check phi nodes */
1316         bool     need_permutation = false;
1317         ir_node *phi              = sched_first(block);
1318         for ( ; is_Phi(phi); phi = sched_next(phi)) {
1319                 if (!arch_irn_consider_in_reg_alloc(cls, phi))
1320                         continue;
1321
1322                 ir_node *phi_pred = get_Phi_pred(phi, p);
1323                 int      a        = find_value_in_block_info(pred_info, phi_pred);
1324                 assert(a >= 0);
1325
1326                 const arch_register_t *reg  = arch_get_irn_register(phi);
1327                 int                    regn = reg->index;
1328                 /* same register? nothing to do */
1329                 if (regn == a)
1330                         continue;
1331
1332                 ir_node               *op     = pred_info->assignments[a];
1333                 const arch_register_t *op_reg = arch_get_irn_register(op);
1334                 /* Virtual registers are ok, too. */
1335                 if (op_reg->type & arch_register_type_virtual)
1336                         continue;
1337
1338                 permutation[regn] = a;
1339                 need_permutation  = true;
1340         }
1341
1342         if (need_permutation) {
1343                 /* permute values at end of predecessor */
1344                 ir_node **old_assignments = assignments;
1345                 assignments     = pred_info->assignments;
1346                 permute_values(NULL, be_get_end_of_block_insertion_point(pred),
1347                                permutation);
1348                 assignments = old_assignments;
1349         }
1350
1351         /* change phi nodes to use the copied values */
1352         phi = sched_first(block);
1353         for ( ; is_Phi(phi); phi = sched_next(phi)) {
1354                 if (!arch_irn_consider_in_reg_alloc(cls, phi))
1355                         continue;
1356
1357                 /* we have permuted all values into the correct registers so we can
1358                    simply query which value occupies the phi's register in the
1359                    predecessor */
1360                 int      a  = arch_get_irn_register(phi)->index;
1361                 ir_node *op = pred_info->assignments[a];
1362                 set_Phi_pred(phi, p, op);
1363         }
1364 }
1365
1366 /**
1367  * Set preferences for a phi's register based on the registers used on the
1368  * phi inputs.
1369  */
1370 static void adapt_phi_prefs(ir_node *phi)
1371 {
1372         ir_node           *block = get_nodes_block(phi);
1373         allocation_info_t *info  = get_allocation_info(phi);
1374
1375         int arity = get_irn_arity(phi);
1376         for (int i = 0; i < arity; ++i) {
1377                 ir_node               *op  = get_irn_n(phi, i);
1378                 const arch_register_t *reg = arch_get_irn_register(op);
1379
1380                 if (reg == NULL)
1381                         continue;
1382                 /* we only give the bonus if the predecessor already has its
1383                  * registers assigned; otherwise we would only see a dummy value,
1384                  * and any conclusion about its register would be useless */
1385                 ir_node      *pred_block      = get_Block_cfgpred_block(block, i);
1386                 block_info_t *pred_block_info = get_block_info(pred_block);
1387                 if (!pred_block_info->processed)
1388                         continue;
1389
1390                 /* give bonus for already assigned register */
1391                 float weight = (float)get_block_execfreq(pred_block);
1392                 info->prefs[reg->index] += weight * AFF_PHI;
1393         }
1394 }
1395
1396 /**
1397  * After a phi has been assigned a register, propagate that register
1398  * preference to the phi inputs.
1399  */
1400 static void propagate_phi_register(ir_node *phi, unsigned assigned_r)
1401 {
1402         ir_node *block = get_nodes_block(phi);
1403
1404         int arity = get_irn_arity(phi);
1405         for (int i = 0; i < arity; ++i) {
1406                 ir_node           *op         = get_Phi_pred(phi, i);
1407                 allocation_info_t *info       = get_allocation_info(op);
1408                 ir_node           *pred_block = get_Block_cfgpred_block(block, i);
1409                 float              weight
1410                         = (float)get_block_execfreq(pred_block) * AFF_PHI;
1411
1412                 if (info->prefs[assigned_r] >= weight)
1413                         continue;
1414
1415                 /* promote the preferred register, demoting all others below it */
1416                 for (unsigned r = 0; r < n_regs; ++r) {
1417                         if (info->prefs[r] > -weight) {
1418                                 info->prefs[r] = -weight;
1419                         }
1420                 }
1421                 info->prefs[assigned_r] = weight;
1422
1423                 if (is_Phi(op))
1424                         propagate_phi_register(op, assigned_r);
1425         }
1426 }
1427
1428 static void assign_phi_registers(ir_node *block)
1429 {
1430         /* count phi nodes */
1431         int n_phis = 0;
1432         sched_foreach(block, node) {
1433                 if (!is_Phi(node))
1434                         break;
1435                 if (!arch_irn_consider_in_reg_alloc(cls, node))
1436                         continue;
1437                 ++n_phis;
1438         }
1439
1440         if (n_phis == 0)
1441                 return;
1442
1443         /* build a bipartite matching problem for all phi nodes */
1444         hungarian_problem_t *bp
1445                 = hungarian_new(n_phis, n_regs, HUNGARIAN_MATCH_PERFECT);
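        /* a perfect matching ensures that every phi ends up with a register */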
1446         int n = 0;
1447         sched_foreach(block, node) {
1448                 if (!is_Phi(node))
1449                         break;
1450                 if (!arch_irn_consider_in_reg_alloc(cls, node))
1451                         continue;
1452
1453                 /* give bonuses for predecessor colorings */
1454                 adapt_phi_prefs(node);
1455                 /* add stuff to bipartite problem */
1456                 allocation_info_t *info = get_allocation_info(node);
1457                 DB((dbg, LEVEL_3, "Prefs for %+F: ", node));
1458                 for (unsigned r = 0; r < n_regs; ++r) {
1459                         if (!rbitset_is_set(normal_regs, r))
1460                                 continue;
1461
1462                         float costs = info->prefs[r];
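                        /* Map the preference to a cost for the matching problem:
                         * compress it logarithmically so that extreme preferences
                         * do not dominate, then scale and shift it so all entries
                         * are positive. */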
1463                         costs = costs < 0 ? -logf(-costs+1) : logf(costs+1);
1464                         costs *= 100;
1465                         costs += 10000;
1466                         hungarian_add(bp, n, r, (int)costs);
1467                         DB((dbg, LEVEL_3, " %s(%f)", arch_register_for_index(cls, r)->name,
1468                                                 info->prefs[r]));
1469                 }
1470                 DB((dbg, LEVEL_3, "\n"));
1471                 ++n;
1472         }
1473
1474         //hungarian_print_cost_matrix(bp, 7);
1475         hungarian_prepare_cost_matrix(bp, HUNGARIAN_MODE_MAXIMIZE_UTIL);
1476
1477         unsigned *assignment = ALLOCAN(unsigned, n_regs);
1478         int       res        = hungarian_solve(bp, assignment, NULL, 0);
1479         assert(res == 0);
1480
1481         /* apply results */
1482         n = 0;
1483         sched_foreach(block, node) {
1484                 if (!is_Phi(node))
1485                         break;
1486                 if (!arch_irn_consider_in_reg_alloc(cls, node))
1487                         continue;
1488                 const arch_register_req_t *req
1489                         = arch_get_irn_register_req(node);
1490
1491                 unsigned r = assignment[n++];
1492                 assert(rbitset_is_set(normal_regs, r));
1493                 const arch_register_t *reg = arch_register_for_index(cls, r);
1494                 DB((dbg, LEVEL_2, "Assign %+F -> %s\n", node, reg->name));
1495                 use_reg(node, reg, req->width);
1496
1497                 /* adapt preferences for phi inputs */
1498                 propagate_phi_register(node, r);
1499         }
1500 }
1501
1502 static arch_register_req_t *allocate_reg_req(ir_graph *irg)
1503 {
1504         struct obstack *obst = be_get_be_obst(irg);
1505         arch_register_req_t *req = OALLOCZ(obst, arch_register_req_t);
1506         return req;
1507 }
1508
1509 /**
1510  * Walker: assign registers to all nodes of a block that
1511  * need registers from the currently considered register class.
1512  */
1513 static void allocate_coalesce_block(ir_node *block, void *data)
1514 {
1515         (void) data;
1516         DB((dbg, LEVEL_2, "* Block %+F\n", block));
1517
1518         /* use this block's (initially empty) assignment array */
1519         block_info_t *block_info  = get_block_info(block);
1520         assignments = block_info->assignments;
1521
1522         ir_nodeset_t live_nodes;
1523         ir_nodeset_init(&live_nodes);
1524
1525         /* gather regalloc infos of predecessor blocks */
1526         int            n_preds          = get_Block_n_cfgpreds(block);
1527         block_info_t **pred_block_infos = ALLOCAN(block_info_t*, n_preds);
1528         for (int i = 0; i < n_preds; ++i) {
1529                 ir_node      *pred      = get_Block_cfgpred_block(block, i);
1530                 block_info_t *pred_info = get_block_info(pred);
1531                 pred_block_infos[i]     = pred_info;
1532         }
1533
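        /* scratch array for potential phi inputs, one slot per predecessor */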
1534         ir_node **phi_ins = ALLOCAN(ir_node*, n_preds);
1535
1536         /* collect live-in nodes and preassigned values */
1537         be_lv_foreach(lv, block, be_lv_state_in, node) {
1538                 const arch_register_req_t *req = arch_get_irn_register_req(node);
1539                 if (req->cls != cls)
1540                         continue;
1541
1542                 if (arch_register_req_is(req, ignore)) {
1543                         allocation_info_t *info = get_allocation_info(node);
1544                         info->current_value = node;
1545
1546                         const arch_register_t *reg = arch_get_irn_register(node);
1547                         assert(reg != NULL); /* ignore values must be preassigned */
1548                         use_reg(node, reg, req->width);
1549                         continue;
1550                 }
1551
1552                 /* check all predecessors for this value; if it is not the same
1553                    everywhere (or still unknown), we have to construct a phi
1554                    (we collect the potential phi inputs here) */
1555                 bool need_phi = false;
1556                 for (int p = 0; p < n_preds; ++p) {
1557                         block_info_t *pred_info = pred_block_infos[p];
1558
1559                         if (!pred_info->processed) {
1560                                 /* use node for now, it will get fixed later */
1561                                 phi_ins[p] = node;
1562                                 need_phi   = true;
1563                         } else {
1564                                 int a = find_value_in_block_info(pred_info, node);
1565
1566                                 /* must live out of predecessor */
1567                                 assert(a >= 0);
1568                                 phi_ins[p] = pred_info->assignments[a];
1569                                 /* different value from last time? then we need a phi */
1570                                 if (p > 0 && phi_ins[p-1] != phi_ins[p]) {
1571                                         need_phi = true;
1572                                 }
1573                         }
1574                 }
1575
1576                 if (need_phi) {
1577                         ir_mode *mode = get_irn_mode(node);
1578                         const arch_register_req_t *phi_req = cls->class_req;
1579                         if (req->width > 1) {
1580                                 arch_register_req_t *new_req = allocate_reg_req(irg);
1581                                 new_req->cls   = cls;
1582                                 new_req->type  = req->type & arch_register_req_type_aligned;
1583                                 new_req->width = req->width;
1584                                 phi_req = new_req;
1585                         }
1586                         ir_node *phi  = be_new_Phi(block, n_preds, phi_ins, mode,
1587                                                    phi_req);
1588
1589                         DB((dbg, LEVEL_3, "Create Phi %+F (for %+F) -", phi, node));
1590 #ifdef DEBUG_libfirm
1591                         for (int pi = 0; pi < n_preds; ++pi) {
1592                                 DB((dbg, LEVEL_3, " %+F", phi_ins[pi]));
1593                         }
1594                         DB((dbg, LEVEL_3, "\n"));
1595 #endif
1596                         mark_as_copy_of(phi, node);
1597                         sched_add_after(block, phi);
1598
1599                         node = phi;
1600                 } else {
1601                         allocation_info_t *info = get_allocation_info(node);
1602                         info->current_value = phi_ins[0];
1603
1604                         /* Grab one of the inputs we constructed (it might not be
1605                          * the same as "node", as we could see the same copy of the
1606                          * value in all predecessors) */
1607                         node = phi_ins[0];
1608                 }
1609
1610                 /* if the node already has a register assigned use it */
1611                 const arch_register_t *reg = arch_get_irn_register(node);
1612                 if (reg != NULL) {
1613                         use_reg(node, reg, req->width);
1614                 }
1615
1616                 /* remember that this node is live at the beginning of the block */
1617                 ir_nodeset_insert(&live_nodes, node);
1618         }
1619
1620         /** Collects registers which must not be used for optimistic splits. */
1621         unsigned *const forbidden_regs = rbitset_alloca(n_regs);
1622
1623         /* handle phis... */
1624         assign_phi_registers(block);
1625
1626         /* all live-ins must have a register */
1627 #ifndef NDEBUG
1628         foreach_ir_nodeset(&live_nodes, node, iter) {
1629                 const arch_register_t *reg = arch_get_irn_register(node);
1630                 assert(reg != NULL);
1631         }
1632 #endif
1633
1634         /* assign instructions in the block */
1635         sched_foreach(block, node) {
1636                 /* phis are already assigned */
1637                 if (is_Phi(node))
1638                         continue;
1639
1640                 rewire_inputs(node);
1641
1642                 /* enforce use constraints */
1643                 rbitset_clear_all(forbidden_regs, n_regs);
1644                 enforce_constraints(&live_nodes, node, forbidden_regs);
1645
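                /* enforce_constraints may have inserted copies of operands, so
                 * rewire the inputs a second time to pick them up */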
1646                 rewire_inputs(node);
1647
1648                 /* we may not use registers used for inputs for optimistic splits */
1649                 be_foreach_use(node, cls, in_req, op, op_req,
1650                         const arch_register_t *reg = arch_get_irn_register(op);
1651                         rbitset_set(forbidden_regs, reg->index);
1652                 );
1653
1654                 /* free registers of values last used at this instruction */
1655                 free_last_uses(&live_nodes, node);
1656
1657                 /* assign output registers */
1658                 be_foreach_definition_(node, cls, value, req,
1659                         assign_reg(block, value, req, forbidden_regs);
1660                 );
1661         }
1662
1663         ir_nodeset_destroy(&live_nodes);
1664         assignments = NULL;
1665
1666         block_info->processed = true;
1667
1668         /* permute values at end of predecessor blocks in case of phi-nodes */
1669         if (n_preds > 1) {
1670                 for (int p = 0; p < n_preds; ++p) {
1671                         add_phi_permutations(block, p);
1672                 }
1673         }
1674
1675         /* if we have exactly 1 successor then we might be able to produce phi
1676            copies now */
1677         if (get_irn_n_edges_kind(block, EDGE_KIND_BLOCK) == 1) {
1678                 const ir_edge_t *edge
1679                         = get_irn_out_edge_first_kind(block, EDGE_KIND_BLOCK);
1680                 ir_node      *succ      = get_edge_src_irn(edge);
1681                 int           p         = get_edge_src_pos(edge);
1682                 block_info_t *succ_info = get_block_info(succ);
1683
1684                 if (succ_info->processed) {
1685                         add_phi_permutations(succ, p);
1686                 }
1687         }
1688 }
1689
1690 typedef struct block_costs_t block_costs_t;
1691 struct block_costs_t {
1692         float costs;   /**< costs of the block */
1693         int   dfs_num; /**< depth first search number (to detect backedges) */
1694 };
1695
1696 static int cmp_block_costs(const void *d1, const void *d2)
1697 {
1698         const ir_node       * const *block1 = (const ir_node**)d1;
1699         const ir_node       * const *block2 = (const ir_node**)d2;
1700         const block_costs_t *info1  = (const block_costs_t*)get_irn_link(*block1);
1701         const block_costs_t *info2  = (const block_costs_t*)get_irn_link(*block2);
1702         return QSORT_CMP(info2->costs, info1->costs);
1703 }
1704
1705 static void determine_block_order(void)
1706 {
1707         ir_node **blocklist = be_get_cfgpostorder(irg);
1708         size_t    n_blocks  = ARR_LEN(blocklist);
1709         int       dfs_num   = 0;
1710         pdeq     *worklist  = new_pdeq();
1711         ir_node **order     = XMALLOCN(ir_node*, n_blocks);
1712         size_t    order_p   = 0;
1713
1714         /* clear block links... */
1715         for (size_t p = 0; p < n_blocks; ++p) {
1716                 ir_node *block = blocklist[p];
1717                 set_irn_link(block, NULL);
1718         }
1719
1720         /* walk blocks in reverse postorder; the cost of each block is its
1721          * execution frequency plus the sum of its predecessors' costs
1722          * (excluding costs on backedges, which we cannot determine) */
1723         for (size_t p = n_blocks; p > 0;) {
1724                 block_costs_t *cost_info;
1725                 ir_node *block = blocklist[--p];
1726
1727                 float execfreq   = (float)get_block_execfreq(block);
1728                 float costs      = execfreq;
1729                 int   n_cfgpreds = get_Block_n_cfgpreds(block);
1730                 for (int p2 = 0; p2 < n_cfgpreds; ++p2) {
1731                         ir_node       *pred_block = get_Block_cfgpred_block(block, p2);
1732                         block_costs_t *pred_costs = (block_costs_t*)get_irn_link(pred_block);
1733                         /* we don't have any info for backedges */
1734                         if (pred_costs == NULL)
1735                                 continue;
1736                         costs += pred_costs->costs;
1737                 }
1738
1739                 cost_info          = OALLOCZ(&obst, block_costs_t);
1740                 cost_info->costs   = costs;
1741                 cost_info->dfs_num = dfs_num++;
1742                 set_irn_link(block, cost_info);
1743         }
1744
1745         /* sort array by block costs */
1746         qsort(blocklist, n_blocks, sizeof(blocklist[0]), cmp_block_costs);
1747
1748         ir_reserve_resources(irg, IR_RESOURCE_BLOCK_VISITED);
1749         inc_irg_block_visited(irg);
1750
1751         for (size_t p = 0; p < n_blocks; ++p) {
1752                 ir_node *block = blocklist[p];
1753                 if (Block_block_visited(block))
1754                         continue;
1755
1756                 /* continually add predecessors with highest costs to worklist
1757                  * (without using backedges) */
1758                 do {
1759                         block_costs_t *info       = (block_costs_t*)get_irn_link(block);
1760                         ir_node       *best_pred  = NULL;
1761                         float          best_costs = -1;
1762                         int            n_cfgpred  = get_Block_n_cfgpreds(block);
1763
1764                         pdeq_putr(worklist, block);
1765                         mark_Block_block_visited(block);
1766                         for (int i = 0; i < n_cfgpred; ++i) {
1767                                 ir_node       *pred_block = get_Block_cfgpred_block(block, i);
1768                                 block_costs_t *pred_info  = (block_costs_t*)get_irn_link(pred_block);
1769
1770                                 /* ignore backedges */
1771                                 if (pred_info->dfs_num > info->dfs_num)
1772                                         continue;
1773
1774                                 if (pred_info->costs > best_costs) {
1775                                         best_costs = pred_info->costs;
1776                                         best_pred  = pred_block;
1777                                 }
1778                         }
1779                         block = best_pred;
1780                 } while (block != NULL && !Block_block_visited(block));
1781
1782                 /* now append the blocks collected in the worklist to the final order */
1783                 while (!pdeq_empty(worklist)) {
1784                         ir_node *pblock = (ir_node*)pdeq_getr(worklist);
1785                         assert(order_p < n_blocks);
1786                         order[order_p++] = pblock;
1787                 }
1788         }
1789         assert(order_p == n_blocks);
1790         del_pdeq(worklist);
1791
1792         ir_free_resources(irg, IR_RESOURCE_BLOCK_VISITED);
1793
1794         DEL_ARR_F(blocklist);
1795
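        /* the block cost infos were allocated on our obstack; release them and
         * reset the obstack for the following allocation phase */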
1796         obstack_free(&obst, NULL);
1797         obstack_init(&obst);
1798
1799         block_order   = order;
1800         n_block_order = n_blocks;
1801 }
1802
1803 static void free_block_order(void)
1804 {
1805         xfree(block_order);
1806 }
1807
1808 /**
1809  * Run the register allocator for the current register class.
1810  */
1811 static void be_pref_alloc_cls(void)
1812 {
1813         be_assure_live_sets(irg);
1814         lv = be_get_irg_liveness(irg);
1815
1816         ir_reserve_resources(irg, IR_RESOURCE_IRN_LINK);
1817
1818         DB((dbg, LEVEL_2, "=== Allocating registers of %s ===\n", cls->name));
1819
1820         be_clear_links(irg);
1821
1822         irg_block_walk_graph(irg, NULL, analyze_block, NULL);
1823         combine_congruence_classes();
1824
1825         for (size_t i = 0; i < n_block_order; ++i) {
1826                 ir_node *block = block_order[i];
1827                 allocate_coalesce_block(block, NULL);
1828         }
1829
1830         ir_free_resources(irg, IR_RESOURCE_IRN_LINK);
1831 }
1832
1833 static void dump(int mask, ir_graph *irg, const char *suffix)
1834 {
1835         if (be_options.dump_flags & mask)
1836                 dump_ir_graph(irg, suffix);
1837 }
1838
1839 /**
1840  * Run the spiller on the current graph.
1841  */
1842 static void spill(void)
1843 {
1844         /* make sure all nodes show their real register pressure */
1845         be_timer_push(T_RA_CONSTR);
1846         be_pre_spill_prepare_constr(irg, cls);
1847         be_timer_pop(T_RA_CONSTR);
1848
1849         dump(DUMP_RA, irg, "spillprepare");
1850
1851         /* spill */
1852         be_timer_push(T_RA_SPILL);
1853         be_do_spill(irg, cls);
1854         be_timer_pop(T_RA_SPILL);
1855
1856         be_timer_push(T_RA_SPILL_APPLY);
1857         check_for_memory_operands(irg);
1858         be_timer_pop(T_RA_SPILL_APPLY);
1859
1860         dump(DUMP_RA, irg, "spill");
1861 }
1862
1863 /**
1864  * The pref register allocator for a whole procedure.
1865  */
1866 static void be_pref_alloc(ir_graph *new_irg)
1867 {
1868         obstack_init(&obst);
1869
1870         irg = new_irg;
1871
1872         /* determine a good coloring order */
1873         determine_block_order();
1874
1875         const arch_env_t *arch_env = be_get_irg_arch_env(new_irg);
1876         int               n_cls    = arch_env->n_register_classes;
1877         for (int c = 0; c < n_cls; ++c) {
1878                 cls = &arch_env->register_classes[c];
1879                 if (arch_register_class_flags(cls) & arch_register_class_flag_manual_ra)
1880                         continue;
1881
1882                 stat_ev_ctx_push_str("regcls", cls->name);
1883
1884                 n_regs      = arch_register_class_n_regs(cls);
1885                 normal_regs = rbitset_malloc(n_regs);
1886                 be_set_allocatable_regs(irg, cls, normal_regs);
1887
1888                 spill();
1889
1890                 /* verify schedule and register pressure */
1891                 be_timer_push(T_VERIFY);
1892                 if (be_options.verify_option == BE_VERIFY_WARN) {
1893                         be_verify_schedule(irg);
1894                         be_verify_register_pressure(irg, cls);
1895                 } else if (be_options.verify_option == BE_VERIFY_ASSERT) {
1896                         assert(be_verify_schedule(irg) && "Schedule verification failed");
1897                         assert(be_verify_register_pressure(irg, cls)
1898                                 && "Register pressure verification failed");
1899                 }
1900                 be_timer_pop(T_VERIFY);
1901
1902                 be_timer_push(T_RA_COLOR);
1903                 be_pref_alloc_cls();
1904                 be_timer_pop(T_RA_COLOR);
1905
1906                 /* we probably constructed new Phis, so the liveness info is
1907                  * invalid now */
1908                 be_invalidate_live_sets(irg);
1909                 free(normal_regs);
1910
1911                 stat_ev_ctx_pop("regcls");
1912         }
1913
1914         free_block_order();
1915
1916         be_timer_push(T_RA_SPILL_APPLY);
1917         be_abi_fix_stack_nodes(irg);
1918         be_timer_pop(T_RA_SPILL_APPLY);
1919
1920         be_timer_push(T_VERIFY);
1921         if (be_options.verify_option == BE_VERIFY_WARN) {
1922                 be_verify_register_allocation(irg);
1923         } else if (be_options.verify_option == BE_VERIFY_ASSERT) {
1924                 assert(be_verify_register_allocation(irg)
1925                        && "Register allocation invalid");
1926         }
1927         be_timer_pop(T_VERIFY);
1928
1929         obstack_free(&obst, NULL);
1930 }
1931
1932 BE_REGISTER_MODULE_CONSTRUCTOR(be_init_pref_alloc)
1933 void be_init_pref_alloc(void)
1934 {
1935         static be_ra_t be_ra_pref = { be_pref_alloc };
1936         be_register_allocator("pref", &be_ra_pref);
1937         FIRM_DBG_REGISTER(dbg, "firm.be.prefalloc");
1938 }