handle ia32_l_vfist node
[libfirm] / ir / be / bespillbelady2.c
index ab93c12..0f5ac51 100644 (file)
  * @file
  * @brief       Beladys spillalgorithm version 2.
  * @author      Daniel Grund, Matthias Braun, Sebastian Hack
- * @date        20.09.2005
+ * @date        01.08.2007
  * @version     $Id$
+ *
+ * The main differences to the original Belady are:
+ * - The workset is empty at the start of a block
+ *   There is no attempt to fill it with variables which
+ *   are not used in the block.
+ * - There is a global pass which tries to use the remaining
+ *   capacity of the blocks to let global variables live through
+ *   them.
  */
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 
 #include <math.h>
+#include <limits.h>
 
 #include "obst.h"
 #include "irnodeset.h"
+#include "irbitset.h"
 #include "irprintf_t.h"
 #include "irgraph.h"
 #include "irnode.h"
@@ -39,6 +49,7 @@
 #include "irgwalk.h"
 #include "irloop.h"
 #include "iredges_t.h"
+#include "irphase_t.h"
 #include "ircons_t.h"
 #include "irprintf.h"
 #include "execfreq.h"
@@ -47,7 +58,6 @@
 #include "beutil.h"
 #include "bearch_t.h"
 #include "bespillbelady.h"
-#include "beuses.h"
 #include "besched_t.h"
 #include "beirgmod.h"
 #include "belive_t.h"
 #define DBG_TRACE    64
 #define DBG_WORKSET 128
 #define DBG_GLOBAL  256
-DEBUG_ONLY(static firm_dbg_module_t *dbg = NULL;)
 
+#define DEAD     UINT_MAX
+#define LIVE_END (DEAD-1)
+
+DEBUG_ONLY(static firm_dbg_module_t *dbg = NULL;)
 
 /**
  * An association between a node and a point in time.
@@ -78,8 +91,7 @@ typedef struct _loc_t {
   unsigned time;       /**< A use time (see beuses.h). */
   unsigned version;    /**< That is used in the global pass below.
                                                 For usage see the comments below.
-                                                In the local belady pass, this is not
-                                                important. */
+                                                In the local belady pass, this is not important. */
 } loc_t;
 
 typedef struct _workset_t {
@@ -89,6 +101,7 @@ typedef struct _workset_t {
 
 typedef struct _belady_env_t {
        struct obstack ob;
+       ir_graph *irg;
        const arch_env_t *arch;
        const arch_register_class_t *cls;
        be_lv_t *lv;
@@ -98,7 +111,6 @@ typedef struct _belady_env_t {
        int n_blocks;       /**< Number of blocks in the graph. */
        int n_regs;                     /**< number of regs in this reg-class */
        workset_t *ws;          /**< the main workset used while processing a block. ob-allocated */
-       be_uses_t *uses;        /**< env for the next-use magic */
        ir_node *instr;         /**< current instruction */
        unsigned instr_nr;      /**< current instruction number (relative to block start) */
 
@@ -106,18 +118,11 @@ typedef struct _belady_env_t {
 } belady_env_t;
 
 
-static int loc_compare_idx(const void *a, const void *b)
-{
-       const loc_t *p = a;
-       const loc_t *q = b;
-       return (int) get_irn_idx(p->irn) - (int) get_irn_idx(q->irn);
-}
 static int loc_compare(const void *a, const void *b)
 {
        const loc_t *p = a;
        const loc_t *q = b;
-       int diff  = (int) p->time - (int) q->time;
-       return diff != 0 ? diff : loc_compare_idx(a, b);
+       return (p->time > q->time) - (p->time < q->time);
 }
 
 static INLINE void workset_print(const workset_t *w)
@@ -241,17 +246,15 @@ static INLINE int workset_get_index(const workset_t *ws, const ir_node *val) {
 #define workset_sort(ws) qsort((ws)->vals, (ws)->len, sizeof((ws)->vals[0]), loc_compare);
 #define workset_contains(ws, n) (workset_get_index(ws, n) >= 0)
 
-typedef struct _block_use_t {
-       struct _block_use_t *next;
-       ir_node *insn;
-       int pos, tick;
-} block_use_t;
-
 typedef struct _block_info_t {
        belady_env_t *bel;
        const ir_node *bl;
-       ir_node *first_non_in;   /**< First node in block which is not a phi.  */
        workset_t *ws_start, *ws_end;
+       ir_phase next_uses;
+
+       ir_node *first_non_in;   /**< First node in block which is not a phi.  */
+       ir_node *last_ins;       /**< The instruction before which end of
+                                                          block reloads will be inserted. */
 
        workset_t *entrance_reg; /**< That set will contain all values
                                                                  transported into the block which
@@ -260,12 +263,6 @@ typedef struct _block_info_t {
                                                                  bring them into the block in a register
                                                                  or reload them at the entry of the block. */
 
-       workset_t * entrance_mem; /**< That set will contain all variables
-                                                                 (transported into the block)
-                                                                 which are in the memory upon entering
-                                                                 the block:
-                                                                 entrance_reg U entrance_mem = live-in + local phis. */
-
        int pressure; /**< The amount of registers which remain free
                                        in this block. This capacity can be used to let
                                        global variables, transported into other blocks,
@@ -274,10 +271,11 @@ typedef struct _block_info_t {
        double exec_freq; /**< The execution frequency of this block. */
 } block_info_t;
 
-
 static INLINE void *new_block_info(belady_env_t *bel, ir_node *bl) {
        block_info_t *res = obstack_alloc(&bel->ob, sizeof(*res));
        memset(res, 0, sizeof(res[0]));
+       res->first_non_in = NULL;
+       res->last_ins = NULL;
        res->bel = bel;
        res->bl  = bl;
        res->entrance_reg = new_workset(bel, &bel->ob);
@@ -289,38 +287,114 @@ static INLINE void *new_block_info(belady_env_t *bel, ir_node *bl) {
 #define get_block_info(block)        ((block_info_t *)get_irn_link(block))
 #define set_block_info(block, info)  set_irn_link(block, info)
 
-/**
- * @return The distance to the next use or 0 if irn has dont_spill flag set
- */
-static INLINE unsigned get_distance(belady_env_t *env, ir_node *from, unsigned from_step, const ir_node *def, int skip_from_uses)
+static INLINE ir_node *block_info_get_last_ins(block_info_t *bi)
+{
+       if (!bi->last_ins)
+               bi->last_ins = be_get_end_of_block_insertion_point(bi->bl);
+
+       return bi->last_ins;
+}
+
+typedef struct _next_use_t {
+       unsigned is_first_use : 1; /**< Indicate that this use is the first
+                                                                in the block. Needed to identify
+                                                                transport in values for the global
+                                                                pass. */
+       int step;                  /**< The time step of the use. */
+       struct _next_use_t *next;  /**< The next use int this block
+                                                                or NULL. */
+} next_use_t;
+
+static void *next_use_init(ir_phase *phase, ir_node *irn, void *old)
+{
+       (void) phase;
+       (void) irn;
+       (void) old;
+       return NULL;
+}
+
+static void build_next_uses(block_info_t *bi)
+{
+       ir_node *irn;
+
+       phase_init(&bi->next_uses, "next uses", bi->bel->irg, PHASE_DEFAULT_GROWTH, next_use_init, NULL);
+       sched_foreach_reverse(bi->bl, irn) {
+               int i, step = sched_get_time_step(irn);
+
+               if (is_Phi(irn))
+                       break;
+
+               for (i = get_irn_arity(irn) - 1; i >= 0; --i) {
+                       ir_node *op = get_irn_n(irn, i);
+                       next_use_t *curr = phase_get_irn_data(&bi->next_uses, op);
+                       next_use_t *use  = phase_alloc(&bi->next_uses, sizeof(use[0]));
+
+                       assert(step >= 0);
+                       use->is_first_use = 1;
+                       use->step         = step;
+                       use->next         = curr;
+
+                       if (curr)
+                               curr->is_first_use = 0;
+
+                       phase_set_irn_data(&bi->next_uses, op, use);
+               }
+       }
+}
+
+#define get_current_use(bi, irn)        phase_get_irn_data(&(bi)->next_uses, (irn))
+
+static INLINE void advance_current_use(block_info_t *bi, const ir_node *irn)
 {
-       be_next_use_t use;
-       int flags = arch_irn_get_flags(env->arch, def);
+       next_use_t *use = get_current_use(bi, irn);
+
+       assert(use);
+       phase_set_irn_data(&bi->next_uses, irn, use->next);
+}
 
-       assert(! (flags & arch_irn_flags_ignore));
+static INLINE unsigned get_curr_distance(block_info_t *bi, const ir_node *irn, int is_usage)
+{
+       belady_env_t *env = bi->bel;
+       next_use_t *use   = get_current_use(bi, irn);
+       int curr_step     = sched_get_time_step(env->instr);
+       int flags         = arch_irn_get_flags(env->arch, irn);
 
-       use = be_get_next_use(env->uses, from, from_step, def, skip_from_uses);
-       if(USES_IS_INFINITE(use.time))
-               return USES_INFINITY;
+       assert(!(flags & arch_irn_flags_ignore));
 
        /* We have to keep nonspillable nodes in the workingset */
        if(flags & arch_irn_flags_dont_spill)
                return 0;
 
-       return use.time;
+       if (!is_usage && use && use->step == curr_step)
+               use = use->next;
+
+       if (use) {
+               assert(use->step >= curr_step);
+               return use->step - curr_step;
+       }
+
+       return be_is_live_end(env->lv, bi->bl, irn) ? LIVE_END : DEAD;
+}
+
+static INLINE int is_local_phi(const ir_node *bl, const ir_node *irn)
+{
+       return is_Phi(irn) && get_nodes_block(irn) == bl;
 }
 
 /**
  * Check, if the value is something that is transported into a block.
- * That is, the value is live in or defined by a Phi in the block.
+ * That is, the value is defined elsewhere or defined by a Phi in the block.
  * @param env  The belady environment.
  * @param bl   The block in question.
  * @param irn  The node in question.
  * @return     1, if node is something transported into @p bl, 0 if not.
+ * @note       The function will only give correct answers in the case
+ *             where @p irn is unsed in the block @p bl which is always
+ *             the case in our usage scenario.
  */
-static INLINE int is_transport_in(belady_env_t *env, const ir_node *bl, const ir_node *irn)
+static INLINE int is_transport_in(const ir_node *bl, const ir_node *irn)
 {
-       return (is_Phi(irn) && get_nodes_block(irn) == bl) || be_is_live_in(env->lv, bl, irn);
+       return is_local_phi(bl, irn) || get_nodes_block(irn) != bl;
 }
 
 /**
@@ -329,21 +403,16 @@ static INLINE int is_transport_in(belady_env_t *env, const ir_node *bl, const ir
  * - as few as possible other values are disposed
  * - the worst values get disposed
  *
- * TODO Sebastian:
- * Actually, we should displace a value immediately after it was used.
- * If we don't, the cardinality of the workset does not reflect the register pressure.
- * That might be necessary to determine the capacity left in the block.
- *
  * @p is_usage indicates that the values in new_vals are used (not defined)
  * In this case reloads must be performed
  */
 static void displace(block_info_t *bi, workset_t *new_vals, int is_usage) {
-       belady_env_t *env = bi->bel;
-       ir_node *val;
-       int     i, len, max_allowed, demand, iter;
+       belady_env_t *env       = bi->bel;
+       workset_t    *ws        = env->ws;
+       ir_node     **to_insert = alloca(env->n_regs * sizeof(to_insert[0]));
 
-       workset_t *ws         = env->ws;
-       ir_node   **to_insert = alloca(env->n_regs * sizeof(*to_insert));
+       int i, len, max_allowed, demand, iter;
+       ir_node *val;
 
        /*
                1. Identify the number of needed slots and the values to reload
@@ -357,20 +426,23 @@ static void displace(block_info_t *bi, workset_t *new_vals, int is_usage) {
                        to_insert[demand++] = val;
                        if (is_usage) {
                                int insert_reload = 1;
+                               next_use_t *use = get_current_use(bi, val);
 
                                /*
-                                * if we use a value which is transported in this block,
-                                * i.e. a phi defined here or a live in, we check if there
-                                * is room for that guy to survive from the block's entrance
-                                * to here or not.
+                                * if we use a value which is transported in this block, i.e. a
+                                * phi defined here or a live in, for the first time, we check
+                                * if there is room for that guy to survive from the block's
+                                * entrance to here or not.
                                 */
-                               if (is_transport_in(env, bi->bl, val)) {
-                                       DBG((dbg, DBG_SPILL, "entrance node %+F, capacity %d:\n", val, bi->pressure));
+                               assert(use);
+                               assert(sched_get_time_step(env->instr) == use->step);
+                               if (is_transport_in(bi->bl, val) && use->is_first_use) {
+                                       DBG((dbg, DBG_DECIDE, "entrance node %+F, pressure %d:\n", val, bi->pressure));
                                        if (bi->pressure < env->n_regs) {
-                                               ++bi->pressure;
                                                workset_insert(env, bi->entrance_reg, val);
                                                insert_reload = 0;
-                                               DBG((dbg, DBG_SPILL, "... no reload. must be considered at block start\n"));
+                                               ++bi->pressure;
+                                               DBG((dbg, DBG_DECIDE, "... no reload. must be considered at block start\n"));
                                        }
                                }
 
@@ -379,8 +451,7 @@ static void displace(block_info_t *bi, workset_t *new_vals, int is_usage) {
                                        be_add_reload(env->senv, val, env->instr, env->cls, 1);
                                }
                        }
-               }
-               else {
+               } else {
                        assert(is_usage || "Defined value already in workset?!?");
                        DBG((dbg, DBG_DECIDE, "    skip %+F\n", val));
                }
@@ -393,13 +464,16 @@ static void displace(block_info_t *bi, workset_t *new_vals, int is_usage) {
        len         = workset_get_length(ws);
        max_allowed = env->n_regs - demand;
 
-       DBG((dbg, DBG_DECIDE, "    disposing %d values\n", ws->len - max_allowed));
-
        /* Only make more free room if we do not have enough */
        if (len > max_allowed) {
+               // int curr_step = sched_get_time_step(env->instr);
+
+               DBG((dbg, DBG_DECIDE, "    disposing %d values\n", len - max_allowed));
+
                /* get current next-use distance */
                for (i = 0; i < ws->len; ++i) {
-                       unsigned dist = get_distance(env, env->instr, env->instr_nr, workset_get_val(ws, i), !is_usage);
+                       ir_node *val  = workset_get_val(ws, i);
+                       unsigned dist = get_curr_distance(bi, val, is_usage);
                        workset_set_time(ws, i, dist);
                }
 
@@ -412,9 +486,15 @@ static void displace(block_info_t *bi, workset_t *new_vals, int is_usage) {
 
        /*
                3. Insert the new values into the workset
+                  Also, we update the pressure in the block info.
+                  That is important for the global pass to decide
+                  how many values can live through the block.
        */
        for (i = 0; i < demand; ++i)
                workset_insert(env, env->ws, to_insert[i]);
+
+       bi->pressure = MAX(bi->pressure, workset_get_length(env->ws));
+
 }
 
 /**
@@ -425,16 +505,23 @@ static void displace(block_info_t *bi, workset_t *new_vals, int is_usage) {
 static void belady(ir_node *block, void *data) {
        belady_env_t *env        = data;
        block_info_t *block_info = new_block_info(env, block);
+       void *obst_state         = obstack_base(&env->ob);
+
        workset_t *new_vals;
        ir_node *irn;
        int iter;
 
-       /* process the block from start to end */
-       DBG((dbg, DBG_WSETS, "Processing...\n"));
-       env->instr_nr = 0;
+       DBG((dbg, DBG_WSETS, "Processing %+F...\n", block_info->bl));
        new_vals = new_workset(env, &env->ob);
+       workset_clear(env->ws);
+
+       /* build the next use information for this block. */
+       build_next_uses(block_info);
+
+       env->instr_nr = 0;
        block_info->first_non_in = NULL;
 
+       /* process the block from start to end */
        sched_foreach(block, irn) {
                int i, arity;
                assert(workset_get_length(env->ws) <= env->n_regs && "Too much values in workset!");
@@ -461,6 +548,23 @@ static void belady(ir_node *block, void *data) {
                }
                displace(block_info, new_vals, 1);
 
+               /*
+                * set all used variables to the next use in their next_use_t list
+                * Also, kill all dead variables from the workset. They are only
+                * augmenting the pressure. Note, that a variable is dead
+                * if it has no further use in this block and is *not* live end
+                */
+               for(i = 0, arity = get_irn_arity(irn); i < arity; ++i) {
+                       ir_node *op = get_irn_n(irn, i);
+                       next_use_t *use = get_current_use(block_info, op);
+
+                       assert(use);
+                       if (!use->next && !be_is_live_end(env->lv, block, op))
+                               workset_remove(env->ws, op);
+
+                       advance_current_use(block_info, op);
+               }
+
                /* allocate all values _defined_ by this instruction */
                workset_clear(new_vals);
                if (get_irn_mode(irn) == mode_T) { /* special handling for tuples and projs */
@@ -475,15 +579,18 @@ static void belady(ir_node *block, void *data) {
                }
                displace(block_info, new_vals, 0);
 
-               block_info->pressure = MAX(block_info->pressure, workset_get_length(env->ws));
                env->instr_nr++;
        }
 
+       phase_free(&block_info->next_uses);
+       obstack_free(&env->ob, obst_state);
+
        /* Remember end-workset for this block */
        block_info->ws_end = workset_clone(env, &env->ob, env->ws);
        DBG((dbg, DBG_WSETS, "End workset for %+F:\n", block));
        workset_foreach(block_info->ws_end, irn, iter)
                DBG((dbg, DBG_WSETS, "  %+F (%u)\n", irn, workset_get_time(block_info->ws_end, iter)));
+       DBG((dbg, DBG_WSETS, "Max pressure in block: %d\n", block_info->pressure));
 }
 
 /*
@@ -502,7 +609,8 @@ static int block_freq_gt(const void *a, const void *b)
        const ir_node * const *q = b;
        block_info_t *pi = get_block_info(*p);
        block_info_t *qi = get_block_info(*q);
-       return qi->exec_freq - pi->exec_freq;
+       double diff = qi->exec_freq - pi->exec_freq;
+       return (diff > 0) - (diff < 0);
 }
 
 typedef struct _block_end_state_t {
@@ -516,26 +624,48 @@ typedef struct _block_end_state_t {
 
 typedef struct _global_end_state_t {
        belady_env_t *env;
+       bitset_t *succ_phis;
        struct obstack obst;
        block_end_state_t *end_info;
        unsigned gauge;
        unsigned version;
 } global_end_state_t;
 
-static block_end_state_t *get_block_end_state(global_end_state_t *state, ir_node *bl, ir_node *irn)
+typedef struct {
+       void *obst_level;
+       unsigned gauge;
+} rollback_info_t;
+
+static INLINE rollback_info_t trans_begin(global_end_state_t *ges)
+{
+       rollback_info_t rb;
+       rb.obst_level = obstack_base(&ges->obst);
+       rb.gauge      = ges->gauge;
+       return rb;
+}
+
+static INLINE void trans_rollback(global_end_state_t *ges, rollback_info_t *rb)
+{
+       ges->gauge = rb->gauge;
+       obstack_free(&ges->obst, rb->obst_level);
+}
+
+static unsigned get_block_end_state(global_end_state_t *state, ir_node *bl, ir_node *irn)
 {
        unsigned i;
 
        for (i = 0; i < state->gauge; ++i) {
                block_end_state_t *bei = &state->end_info[i];
                if (bei->bl == bl && bei->irn == irn)
-                       return bei;
+                       return i;
        }
 
        {
                block_info_t *bi = get_block_info(bl);
                block_end_state_t *curr;
 
+               i = state->gauge;
+
                /* make sure we have room in the array */
                ARR_EXTO(block_end_state_t, state->end_info, (int) state->gauge);
 
@@ -547,21 +677,22 @@ static block_end_state_t *get_block_end_state(global_end_state_t *state, ir_node
                curr->end_state = workset_clone(state->env, &state->obst, bi->ws_end);
                curr->costs = -1.0;
                ++state->gauge;
-               return curr;
+               return i;
        }
 }
 
-static double can_bring_in(global_end_state_t *ges, ir_node *bl, ir_node *irn, int level);
+static double can_bring_in(global_end_state_t *ges, ir_node *bl, ir_node *irn, double limit, int level);
 
-static double can_make_available_at_end(global_end_state_t *ges, ir_node *bl, ir_node *irn, int level)
+static double can_make_available_at_end(global_end_state_t *ges, ir_node *bl, ir_node *irn, double limit, int level)
 {
-       block_end_state_t *bes = get_block_end_state(ges, bl, irn);
+       unsigned bes_index     = get_block_end_state(ges, bl, irn);
+       block_end_state_t *bes = &ges->end_info[bes_index];
        workset_t *end         = bes->end_state;
        block_info_t *bi       = get_block_info(bl);
        int n_regs             = bi->bel->n_regs;
        int index;
 
-       DBG((dbg, DBG_GLOBAL, "\t%{firm:indent}can make avail %+F at end of %+F (pressure %d)\n",
+       DBG((dbg, DBG_GLOBAL, "\t%2Dcan make avail %+F at end of %+F (pressure %d)\n",
                                level, irn, bl, bi->pressure));
 
        /*
@@ -589,7 +720,7 @@ static double can_make_available_at_end(global_end_state_t *ges, ir_node *bl, ir
         * so we can exit safely.
         */
        if (bes->costs >= 0.0) {
-               DBG((dbg, DBG_GLOBAL, "\t%{firm:indent}we\'ve been here before\n", level));
+               DBG((dbg, DBG_GLOBAL, "\t%2Dwe\'ve been here before\n", level));
                goto end;
        }
 
@@ -598,8 +729,8 @@ static double can_make_available_at_end(global_end_state_t *ges, ir_node *bl, ir
        index = workset_get_index(end, irn);
        if (index >= 0) {
                unsigned ver = end->vals[index].version;
-               DBG((dbg, DBG_GLOBAL, "\t%{firm:indent}node is in the end set and is %s fixed\n",
-                                       level, ver > ges->version ? "" : "not"));
+               DBG((dbg, DBG_GLOBAL, "\t%2Dnode is in the end set and is %s fixed\n",
+                                       level, ver > ges->version ? "already" : "not yet"));
 
                /*
                 * if the version is older, the value is already fixed
@@ -643,28 +774,45 @@ static double can_make_available_at_end(global_end_state_t *ges, ir_node *bl, ir
                 * There is just room at the *end*
                 */
                if (len < n_regs) {
-                       DBG((dbg, DBG_GLOBAL, "\t%{firm:indent}the end set has %d free slots\n",
+                       DBG((dbg, DBG_GLOBAL, "\t%2Dthe end set has %d free slots\n",
                                                level, n_regs - len));
                        slot = len;
-               }
-               else {
+               } else {
                        for (i = 0; i < len; ++i)
                                if (end->vals[i].version < ges->version)
                                        break;
 
                        if (i < len) {
-                               DBG((dbg, DBG_GLOBAL, "\t%{firm:indent}%+F (slot %d) can be erased from the end set\n",
+                               DBG((dbg, DBG_GLOBAL, "\t%2D%+F (slot %d) can be erased from the end set\n",
                                                        level, end->vals[i].irn, i));
                                slot = i;
                        }
                }
 
-               if (slot > 0) {
-                       int gauge          = ges->gauge;
-                       double reload_here = get_block_execfreq(bi->bel->ef, bl);
-                       double bring_in    = bi->pressure < n_regs ? can_bring_in(ges, bl, irn, level + 1) : HUGE_VAL;
+               if (slot >= 0) {
+                       rollback_info_t rb  = trans_begin(ges);
+                       ir_node *ins_before = block_info_get_last_ins(bi);
+                       double reload_here  = be_get_reload_costs(bi->bel->senv, irn, ins_before);
+                       double bring_in     = HUGE_VAL;
+
+                       /* allocate a slot before recursively descending. */
+                       end->vals[slot].irn     = irn;
+                       end->vals[slot].version = ges->version;
+                       end->len = MAX(end->len, slot + 1);
+
+                       /* look if we can bring the value in. */
+                       if (bi->pressure < n_regs) {
+                               double new_limit = MIN(reload_here, limit);
+                               bring_in = can_bring_in(ges, bl, irn, new_limit, level + 1);
+                       }
 
-                       DBG((dbg, DBG_GLOBAL, "\t%{firm:indent}there is a free slot. capacity=%d, reload here=%f, bring in=%f\n",
+                       /*
+                        * re-read the bes descriptor since meanwhile, the
+                        * array could have been displaced by recursive calls
+                        */
+                       assert(bes_index < ges->gauge);
+                       bes = &ges->end_info[bes_index];
+                       DBG((dbg, DBG_GLOBAL, "\t%2Dthere is a free slot. capacity=%d, reload here=%f, bring in=%f\n",
                                                level, n_regs - bi->pressure, reload_here, bring_in));
 
                        /*
@@ -673,55 +821,63 @@ static double can_make_available_at_end(global_end_state_t *ges, ir_node *bl, ir
                         * the gauge.
                         */
                        if (reload_here <= bring_in) {
-                               ges->gauge = gauge;
+                               trans_rollback(ges, &rb);
                                bes->costs = reload_here;
                                bes->reload_at_end = 1;
-                       }
-
-                       else {
+                               DBG((dbg, DBG_GLOBAL, "\t%2Ddoing a reload %p\n", level, bes));
+                       } else {
                                bes->live_through = 1;
                                bes->costs = bring_in;
                        }
+
                }
        }
 
 end:
-       DBG((dbg, DBG_GLOBAL, "\t%{firm:indent}-> %f\n", level, bes->costs));
+       DBG((dbg, DBG_GLOBAL, "\t%2D-> %f\n", level, bes->costs));
        return bes->costs;
 }
 
-static double can_bring_in(global_end_state_t *ges, ir_node *bl, ir_node *irn, int level)
+static double can_bring_in(global_end_state_t *ges, ir_node *bl, ir_node *irn, double limit, int level)
 {
+       belady_env_t *env = ges->env;
        double glob_costs = HUGE_VAL;
-       int def_block     = bl == get_nodes_block(irn);
-       int phi           = is_Phi(irn);
 
-       DBG((dbg, DBG_GLOBAL, "\t%{firm:indent}can bring in for %+F at block %+F\n", level, irn, bl));
+       DBG((dbg, DBG_GLOBAL, "\t%2Dcan bring in (max %f) for %+F at block %+F\n", level, limit, irn, bl));
 
-       if (phi || !def_block) {
+       if (is_transport_in(bl, irn)) {
                int i, n           = get_irn_arity(bl);
                ir_node **nodes    = alloca(get_irn_arity(bl) * sizeof(nodes[0]));
+               rollback_info_t rb = trans_begin(ges);
 
-               int gauge_begin    = ges->gauge;
 
                glob_costs = 0.0;
                for (i = 0; i < n; ++i) {
                        ir_node *pr = get_Block_cfgpred_block(bl, i);
-                       ir_node *op = phi && def_block ? get_irn_n(irn, i) : irn;
-                       double c    = can_make_available_at_end(ges, pr, op, level + 1);
+                       ir_node *op = is_local_phi(bl, irn) ? get_irn_n(irn, i) : irn;
+                       double c;
+
+                       /*
+                        * there might by unknwons as operands of phis in that case
+                        * we set the costs to zero, since they won't get spilled.
+                        */
+                       if (arch_irn_consider_in_reg_alloc(env->arch, env->cls, op))
+                               c = can_make_available_at_end(ges, pr, op, limit - glob_costs, level + 1);
+                       else
+                               c = 0.0;
 
-                       if (c >= HUGE_VAL) {
-                               ges->gauge = gauge_begin;
+                       glob_costs += c;
+
+                       if (glob_costs >= limit) {
                                glob_costs = HUGE_VAL;
+                               trans_rollback(ges, &rb);
                                goto end;
                        }
-
-                       glob_costs += c;
                }
        }
 
 end:
-       DBG((dbg, DBG_GLOBAL, "\t%{firm:indent}-> %f\n", level, glob_costs));
+       DBG((dbg, DBG_GLOBAL, "\t%2D-> %f\n", level, glob_costs));
        return glob_costs;
 }
 
@@ -734,22 +890,22 @@ static void materialize_and_commit_end_state(global_end_state_t *ges)
        for (i = 0; i < ges->gauge; ++i) {
                block_end_state_t *bes = &ges->end_info[i];
                block_info_t *bi       = get_block_info(bes->bl);
-               int idx;
+               int idx, end_pressure;
+
+               DBG((dbg, DBG_GLOBAL, "\t\t%+F in %+F, cost: %f through: %d, rel: %d %p\n",
+                               bes->irn, bes->bl, bes->costs, bes->live_through, bes->reload_at_end, bes));
 
                /* insert the reload if the val was reloaded at the block's end */
                if (bes->reload_at_end) {
-                       be_add_reload(env->senv, bes->irn, bes->bl, env->cls, 1);
+                       be_add_reload_at_end(env->senv, bes->irn, bes->bl, env->cls, 1);
                        DBG((dbg, DBG_GLOBAL, "\t\tadding reload of %+F at end of %+F\n", bes->irn, bes->bl));
                }
 
-               /*
-                * if the variable is live through the block,
-                * update the pressure indicator.
-                */
-               bi->pressure += bes->live_through;
-
                idx = workset_get_index(bes->end_state, bes->irn);
 
+               if (is_local_phi(bes->bl, bes->irn) && bes->live_through)
+                       bitset_add_irn(ges->succ_phis, bes->irn);
+
                /*
                 * set the version number in the workset.
                 * That will mark this value as fixed in the end set
@@ -763,6 +919,23 @@ static void materialize_and_commit_end_state(global_end_state_t *ges)
                        bes->end_state->vals[idx].version = ges->version;
                        workset_copy(env, bi->ws_end, bes->end_state);
                }
+
+               end_pressure = 0;
+               for (idx = workset_get_length(bes->end_state) - 1; idx >= 0; --idx)
+                       if (bes->end_state->vals[idx].version >= ges->version)
+                               end_pressure += 1;
+
+               /*
+                * if the variable is live through the block,
+                * update the pressure indicator.
+                */
+               DBG((dbg, DBG_GLOBAL, "\t\told pressure %d, ", bi->pressure));
+
+               bi->pressure = MAX(bi->pressure + bes->live_through, end_pressure);
+
+               DBG((dbg, DBG_GLOBAL, "new pressure: %d, end pressure: %d, end length: %d\n",
+                                       bi->pressure, end_pressure, workset_get_length(bes->end_state)));
+
        }
 }
 
@@ -777,33 +950,45 @@ static void fix_block_borders(global_end_state_t *ges, ir_node *block) {
        ir_node *irn;
        int i;
 
-       DBG((dbg, DBG_GLOBAL, "fixing block borders at %+F (%fHz)\n", block, bi->exec_freq));
+       DBG((dbg, DBG_GLOBAL, "fixing block borders at %+F (%f)\n", block, bi->exec_freq));
 
        /* process all variables which shall be in a reg at
         * the beginning of the block in the order of the next use. */
        workset_foreach(bi->entrance_reg, irn, i) {
+               double local_costs = be_get_reload_costs(env->senv, irn, bi->first_non_in);
                double bring_in_costs;
 
-               /* reset the gauge and begin the search */
-               ges->gauge = 0;
-               --ges->version;
+               /* reset the gauge and create a new version. */
+               ges->gauge    = 0;
+               ges->version -= 1;
 
                DBG((dbg, DBG_GLOBAL, "\ttrans in var %+F, version %x\n", irn, ges->version));
 
-               bring_in_costs = can_bring_in(ges, block, irn, 0);
+               bring_in_costs = can_bring_in(ges, block, irn, local_costs, 1);
 
-               /* we were not able to let the value arrive
+               DBG((dbg, DBG_GLOBAL, "\tbring in: %f, local: %f", bring_in_costs, local_costs));
+
+               /*
+                * we were not able to let the value arrive
                 * in a register at the entrance of the block
-                * so we have to do the reload locally */
-               if (bring_in_costs >= HUGE_VAL) {
-                       if (is_Phi(irn) && get_nodes_block(irn) == block)
-                               be_spill_phi(env->senv, irn);
-                       else
-                               be_add_reload(env->senv, irn, bi->first_non_in, env->cls, 1);
-               }
+                * or it is too costly, so we have to do the reload locally
+                */
+               if (bring_in_costs >= local_costs) {
+                       DBG((dbg, DBG_GLOBAL, " -> do local reload\n"));
+                       be_add_reload(env->senv, irn, bi->first_non_in, env->cls, 1);
+               } else {
+                       /*
+                        * if the transport-in was a phi (that is actually used in block)
+                        * it will no longer remain and we have to spill it completely.
+                        */
+                       if (is_local_phi(block, irn))
+                               bitset_add_irn(ges->succ_phis, irn);
 
-               else
+                       DBG((dbg, DBG_GLOBAL, " -> do remote reload\n"));
                        materialize_and_commit_end_state(ges);
+               }
+
+               DBG((dbg, DBG_GLOBAL, "\n"));
        }
 }
 
@@ -813,10 +998,11 @@ static void global_assign(belady_env_t *env)
        int i;
 
        obstack_init(&ges.obst);
-       ges.gauge    = 0;
-       ges.env      = env;
-       ges.version  = -1;
-       ges.end_info = NEW_ARR_F(block_end_state_t, env->n_blocks);
+       ges.gauge     = 0;
+       ges.env       = env;
+       ges.version   = -1;
+       ges.end_info  = NEW_ARR_F(block_end_state_t, env->n_blocks);
+       ges.succ_phis = bitset_irg_obstack_alloc(&env->ob, env->irg);
 
        /*
         * sort the blocks according to execution frequency.
@@ -826,6 +1012,26 @@ static void global_assign(belady_env_t *env)
 
        for (i = 0; i < env->n_blocks; ++i)
                fix_block_borders(&ges, env->blocks[i]);
+
+       /*
+        * Now we spill phis which cannot be kept since they were replaced
+        * by reloads at the block entrances.
+        */
+       for (i = 0; i < env->n_blocks; ++i) {
+               ir_node *bl = env->blocks[i];
+               ir_node *irn;
+
+               sched_foreach(bl, irn) {
+                       if (!is_Phi(irn))
+                               break;
+
+                       if (arch_irn_consider_in_reg_alloc(env->arch, env->cls, irn)
+                                       && !bitset_contains_irn(ges.succ_phis, irn))
+                               be_spill_phi(env->senv, irn);
+               }
+       }
+
+       DEL_ARR_F(ges.end_info);
 }
 
 static void collect_blocks(ir_node *bl, void *data)
@@ -836,8 +1042,8 @@ static void collect_blocks(ir_node *bl, void *data)
 }
 
 void be_spill_belady_spill_env2(be_irg_t *birg, const arch_register_class_t *cls, spill_env_t *spill_env) {
-       belady_env_t env;
        ir_graph *irg = be_get_birg_irg(birg);
+       belady_env_t env;
        int i, n_regs;
 
        /* some special classes contain only ignore regs, nothing to do then */
@@ -849,12 +1055,12 @@ void be_spill_belady_spill_env2(be_irg_t *birg, const arch_register_class_t *cls
 
        /* init belady env */
        obstack_init(&env.ob);
+       env.irg       = irg;
        env.arch      = birg->main_env->arch_env;
        env.cls       = cls;
        env.lv        = be_get_birg_liveness(birg);
        env.n_regs    = n_regs;
        env.ws        = new_workset(&env, &env.ob);
-       env.uses      = be_begin_uses(irg, env.lv);
        env.senv      = spill_env ? spill_env : be_new_spill_env(birg);
        env.ef        = be_get_birg_exec_freq(birg);
        env.n_blocks  = 0;
@@ -863,7 +1069,7 @@ void be_spill_belady_spill_env2(be_irg_t *birg, const arch_register_class_t *cls
        obstack_ptr_grow(&env.ob, NULL);
        env.blocks = obstack_finish(&env.ob);
 
-       /* Fix high register pressure with belady algorithm */
+       /* Fix high register pressure in blocks with belady algorithm */
        for (i = 0; i < env.n_blocks; ++i)
                belady(env.blocks[i], &env);
 
@@ -875,7 +1081,7 @@ void be_spill_belady_spill_env2(be_irg_t *birg, const arch_register_class_t *cls
        /* clean up */
        if(spill_env == NULL)
                be_delete_spill_env(env.senv);
-       be_end_uses(env.uses);
+
        obstack_free(&env.ob, NULL);
 }