init_sp Unknown node constructed with CSE disabled
[libfirm] / ir / be / bespillremat.c
index 6f9e90a..3efbf99 100644 (file)
@@ -29,8 +29,9 @@
 #include "irnode_t.h"
 #include "ircons_t.h"
 #include "irloop_t.h"
-#include "execfreq.h"
 #include "phiclass.h"
+#include "iredges.h"
+#include "execfreq.h"
 
 #include <lpp/lpp.h>
 #include <lpp/lpp_net.h>
 
 #define DUMP_SOLUTION
 #define DUMP_ILP
-#define KEEPALIVE
-#define COLLECT_REMATS
-#define REMAT_WHILE_LIVE
-#define NO_ENLARGE_L1V3N355
-#define EXECFREQ_LOOPDEPH
-#define MAY_DIE_AT_PRE_REMAT
+//#define KEEPALIVE /* keep alive all inserted remats and dump graph with remats */
+#define COLLECT_REMATS /* enable rematerialization */
+#define COLLECT_INVERSE_REMATS /* enable placement of inverse remats */
+#define REMAT_WHILE_LIVE /* only remat values that are live */
+//#define NO_ENLARGE_L1V3N355 /* do not remat after the death of some operand */
+//#define EXECFREQ_LOOPDEPH /* compute execution frequency from loop depth only */
+//#define MAY_DIE_AT_PRE_REMAT /* allow values to die after a pre remat */
+#define CHECK_POST_REMAT /* check pressure after post remats (conservative but otherwise we can temporarily exceed the register pressure) */
+#define NO_SINGLE_USE_REMATS /* do not repair schedule */
+//#define KEEPALIVE_SPILLS
+//#define KEEPALIVE_RELOADS
+#define GOODWIN_REDUCTION
 
 #define  SOLVE
-#undef  SOLVE_LOCAL
+//#define  SOLVE_LOCAL
 #define LPP_SERVER "i44pc52"
 #define LPP_SOLVER "cplex"
 
@@ -70,7 +77,7 @@
 #define COST_STORE     50
 #define COST_REMAT     1
 
-#define ILP_TIMEOUT    20
+#define ILP_TIMEOUT    120
 
 #define ILP_UNDEF              -1
 
@@ -83,13 +90,14 @@ typedef struct _spill_ilp_t {
        struct obstack               *obst;
        set                          *remat_info;
        pset                         *all_possible_remats;
+       pset                         *inverse_ops;
 #ifdef KEEPALIVE
        ir_node                      *keep;
 #endif
        set                          *values; /**< for collecting all definitions of values before running ssa-construction */
-#ifndef EXECFREQ_LOOPDEPH
        set                          *execfreqs;
-#endif
+       pset                         *spills;
+       ir_node                      *m_unknown;
        DEBUG_ONLY(firm_dbg_module_t * dbg);
 } spill_ilp_t;
 
@@ -107,6 +115,7 @@ typedef struct _remat_t {
        const ir_node        *proj; /**< not NULL if the above op produces a tuple */
        const ir_node        *value; /**< the value which is being recomputed by this remat */
        int                   cost; /**< cost of this remat */
+       int                   inverse; /**< nonzero if this is an inverse remat */
 } remat_t;
 
 /**
@@ -276,21 +285,35 @@ cmp_keyval(const void *a, const void *b, size_t size)
        return !(p->key == q->key);
 }
 
-static float
+static double
 execution_frequency(const spill_ilp_t * si, const ir_node * irn)
 {
-#ifdef EXECFREQ_LOOPDEPH
-       if(is_Block(irn))
-               return expf((float)get_loop_depth(get_irn_loop(irn)) * logf(10));
-       else
-               return expf((float)get_loop_depth(get_irn_loop(get_nodes_block(irn))) * logf(10));
-#else
-       if(is_Block(irn)) {
-               return get_block_execfreq(si->execfreqs, irn);
+#define FUDGE 0.001
+       if(si->execfreqs) {
+               if(is_Block(irn)) {
+                       return get_block_execfreq(si->execfreqs, irn) + FUDGE;
+               } else {
+                       return get_block_execfreq(si->execfreqs, get_nodes_block(irn)) + FUDGE;
+               }
        } else {
-               return get_block_execfreq(si->execfreqs, get_nodes_block(irn));
+               if(is_Block(irn))
+                       return exp(get_loop_depth(get_irn_loop(irn)) * log(10)) + FUDGE;
+               else
+                       return exp(get_loop_depth(get_irn_loop(get_nodes_block(irn))) * log(10)) + FUDGE;
        }
-#endif
+}
+
+static double
+get_cost(const spill_ilp_t * si, const ir_node * irn)
+{
+       if(be_is_Spill(irn)) {
+               return COST_STORE;
+       } else if(be_is_Reload(irn)){
+               return COST_LOAD;
+       } else {
+               return arch_get_op_estimated_cost(si->chordal_env->birg->main_env->arch_env, irn);
+       }
+
 }
 
 /**
@@ -344,9 +367,10 @@ get_remat_from_op(spill_ilp_t * si, const ir_node * dest_value, const ir_node *
 
                remat = obstack_alloc(si->obst, sizeof(*remat));
                remat->op = op;
-               remat->cost = COST_REMAT; /* TODO ask backend for real cost */
+               remat->cost = get_cost(si, op);
                remat->value = dest_value;
                remat->proj = proj;
+               remat->inverse = 0;
        } else {
                arch_inverse_t     inverse;
                int                i,
@@ -364,20 +388,27 @@ get_remat_from_op(spill_ilp_t * si, const ir_node * dest_value, const ir_node *
 
                /* else ask the backend to give an inverse op */
                if(arch_get_inverse(si->chordal_env->birg->main_env->arch_env, op, i, &inverse, si->obst)) {
+                       int   i;
+
                        DBG((si->dbg, LEVEL_4, "\t  backend gave us an inverse op with %d nodes and cost %d\n", inverse.n, inverse.costs));
 
                        assert(inverse.n > 0 && "inverse op should have at least one node");
 
+                       for(i=0; i<inverse.n; ++i) {
+                               pset_insert_ptr(si->inverse_ops, inverse.nodes[i]);
+                       }
+
                        if(inverse.n <= 2) {
                                remat = obstack_alloc(si->obst, sizeof(*remat));
                                remat->op = inverse.nodes[0];
                                remat->cost = inverse.costs;
                                remat->value = dest_value;
                                remat->proj = (inverse.n==2)?inverse.nodes[1]:NULL;
+                               remat->inverse = 1;
 
                                assert(is_Proj(remat->proj));
                        } else {
-                               DBG((si->dbg, LEVEL_1, "\t  I can not handle remats with %d nodes\n", inverse.n));
+                               assert(0 && "I can not handle remats with more than 2 nodes");
                        }
                }
        }
@@ -430,6 +461,22 @@ add_remat(const spill_ilp_t * si, const remat_t * remat)
        }
 }
 
+static int
+get_irn_n_nonremat_edges(const spill_ilp_t * si, const ir_node * irn)
+{
+       const ir_edge_t   *edge = get_irn_out_edge_first(irn);
+       int                i = 0;
+
+       while(edge) {
+               if(!pset_find_ptr(si->inverse_ops, edge->src)) {
+                       ++i;
+               }
+               edge = get_irn_out_edge_next(irn, edge);
+       }
+
+       return i;
+}
+
 static INLINE void
 get_remats_from_op(spill_ilp_t * si, const ir_node * op)
 {
@@ -437,13 +484,18 @@ get_remats_from_op(spill_ilp_t * si, const ir_node * op)
                      n;
        remat_t *remat;
 
+#ifdef NO_SINGLE_USE_REMATS
+       if(has_reg_class(si, op) && (get_irn_n_nonremat_edges(si, op) > 1)) {
+#else
        if(has_reg_class(si, op)) {
+#endif
                remat = get_remat_from_op(si, op, op);
                if(remat) {
                        add_remat(si, remat);
                }
        }
 
+#ifdef COLLECT_INVERSE_REMATS
        /* repeat the whole stuff for each remat retrieved by get_remat_from_op(op, arg)
           for each arg */
        for (i = 0, n = get_irn_arity(op); i < n; ++i) {
@@ -457,6 +509,7 @@ get_remats_from_op(spill_ilp_t * si, const ir_node * op)
                        }
                }
        }
+#endif
 
 }
 
@@ -480,6 +533,8 @@ value_is_defined_before(const spill_ilp_t * si, const ir_node * pos, const ir_no
        /* else if this is a normal operation */
        block = get_nodes_block(pos);
        if(block == def_block) {
+               if(!sched_is_scheduled(val)) return 1;
+
                ret = sched_comes_after(val, pos);
 //             ir_fprintf(stderr, "(def(same block)=%d) ",ret);
                return ret;
@@ -758,6 +813,39 @@ insert_remat_before(spill_ilp_t * si, const remat_t * remat, const ir_node * pos
        }
 }
 
+static int
+get_block_n_succs(const ir_node *block) {
+       const ir_edge_t *edge;
+
+       assert(edges_activated(current_ir_graph));
+
+       edge = get_block_succ_first(block);
+       if (! edge)
+               return 0;
+
+       edge = get_block_succ_next(block, edge);
+       return edge ? 2 : 1;
+}
+
+static int
+is_merge_edge(const ir_node * bb)
+{
+#ifdef GOODWIN_REDUCTION
+       return get_block_n_succs(bb) == 1;
+#else
+       return 1;
+#endif
+}
+
+static int
+is_diverge_edge(const ir_node * bb)
+{
+#ifdef GOODWIN_REDUCTION
+       return get_Block_n_cfgpreds(bb) == 1;
+#else
+       return 1;
+#endif
+}
 
 /**
  * Insert (so far unused) remats into the irg to
@@ -925,53 +1013,48 @@ walker_remat_insertor(ir_node * bb, void * data)
        live_foreach(bb, li) {
                ir_node        *value = (ir_node *) li->irn;
 
-               /* add remats at end of block */
-               if (live_is_end(li) && has_reg_class(si, value)) {
-                       remat_info_t   *remat_info,
-                                                   query;
-                       remat_t        *remat;
+               /* add remats at end if successor has multiple predecessors */
+               if(is_merge_edge(bb)) {
+                       /* add remats at end of block */
+                       if (live_is_end(li) && has_reg_class(si, value)) {
+                               remat_info_t   *remat_info,
+                                                          query;
+                               remat_t        *remat;
 
-                       query.irn = value;
-                       query.remats = NULL;
-                       query.remats_by_operand = NULL;
-                       remat_info = set_find(si->remat_info, &query, sizeof(query), HASH_PTR(value));
+                               query.irn = value;
+                               query.remats = NULL;
+                               query.remats_by_operand = NULL;
+                               remat_info = set_find(si->remat_info, &query, sizeof(query), HASH_PTR(value));
 
-                       if(remat_info && remat_info->remats) {
-                               pset_foreach(remat_info->remats, remat) {
-                                       DBG((si->dbg, LEVEL_4, "\t  considering remat %+F at end of block %+F\n", remat->op, bb));
-#ifdef REMAT_WHILE_LIVE
-                                       if(is_live_end(bb, remat->value)) {
-                                               insert_remat_before(si, remat, bb, NULL); //TODO
+                               if(remat_info && remat_info->remats) {
+                                       pset_foreach(remat_info->remats, remat) {
+                                               DBG((si->dbg, LEVEL_4, "\t  considering remat %+F at end of block %+F\n", remat->op, bb));
+
+                                               insert_remat_before(si, remat, bb, NULL);
                                        }
-#else
-                                       insert_remat_before(si, remat, bb, NULL);
-#endif
                                }
                        }
                }
+               if(is_diverge_edge(bb)) {
+                       /* add remat2s at beginning of block */
+                       if ((live_is_in(li) || (is_Phi(value) && get_nodes_block(value)==bb)) && has_reg_class(si, value)) {
+                               remat_info_t   *remat_info,
+                                                          query;
+                               remat_t        *remat;
 
+                               query.irn = value;
+                               query.remats = NULL;
+                               query.remats_by_operand = NULL;
+                               remat_info = set_find(si->remat_info, &query, sizeof(query), HASH_PTR(value));
 
-               /* add remat2s at beginning of block */
-               if ((live_is_in(li) || is_Phi(value)) && has_reg_class(si, value)) {
-                       remat_info_t   *remat_info,
-                                                   query;
-                       remat_t        *remat;
+                               if(remat_info && remat_info->remats) {
+                                       pset_foreach(remat_info->remats, remat) {
+                                               DBG((si->dbg, LEVEL_4, "\t  considering remat %+F at beginning of block %+F\n", remat->op, bb));
 
-                       query.irn = value;
-                       query.remats = NULL;
-                       query.remats_by_operand = NULL;
-                       remat_info = set_find(si->remat_info, &query, sizeof(query), HASH_PTR(value));
-
-                       if(remat_info && remat_info->remats_by_operand) {
-                               pset_foreach(remat_info->remats_by_operand, remat) {
-                                       DBG((si->dbg, LEVEL_4, "\t  considering remat %+F at beginning of block %+F\n", remat->op, bb));
-#ifdef REMAT_WHILE_LIVE
-                                       if(is_live_in(bb, remat->value)) {
+                                               /* put the remat here if all its args are available */
                                                insert_remat_after(si, remat, bb, NULL);
+
                                        }
-#else
-                                       insert_remat_after(si, remat, bb, NULL);
-#endif
                                }
                        }
                }
@@ -1038,17 +1121,17 @@ luke_endwalker(ir_node * bb, void * data)
                        query.irn = irn;
                        spill = set_insert(spill_bb->ilp, &query, sizeof(query), HASH_PTR(irn));
 
-                       ir_snprintf(buf, sizeof(buf), "reg_out_%N_%N", bb, irn);
+                       ir_snprintf(buf, sizeof(buf), "reg_out_%N_%N", irn, bb);
                        spill->reg_out = lpp_add_var(si->lpp, buf, lpp_binary, 0.0);
                        /* if irn is used at the end of the block, then it is live anyway */
                        if(!pset_find_ptr(use_end, irn))
                                lpp_set_factor_fast(si->lpp, cst, spill->reg_out, 1.0);
 
-                       ir_snprintf(buf, sizeof(buf), "mem_out_%N_%N", bb, irn);
+                       ir_snprintf(buf, sizeof(buf), "mem_out_%N_%N", irn, bb);
                        spill->mem_out = lpp_add_var(si->lpp, buf, lpp_binary, 0.0);
 
-                       ir_snprintf(buf, sizeof(buf), "spill_%N_%N", bb, irn);
-                       spill->spill = lpp_add_var(si->lpp, buf, lpp_binary, COST_STORE*execution_frequency(si, irn));
+                       ir_snprintf(buf, sizeof(buf), "spill_%N_%N", irn, bb);
+                       spill->spill = lpp_add_var(si->lpp, buf, lpp_binary, COST_STORE*execution_frequency(si, bb));
 
                        spill->reg_in = ILP_UNDEF;
                        spill->mem_in = ILP_UNDEF;
@@ -1212,8 +1295,12 @@ luke_blockwalker(ir_node * bb, void * data)
                }
        }
 
-       spill_bb->reloads = obstack_alloc(si->obst, pset_count(live) * sizeof(*spill_bb->reloads));
-       memset(spill_bb->reloads, 0xFF, pset_count(live) * sizeof(*spill_bb->reloads));
+       if(is_merge_edge(bb)) {
+               spill_bb->reloads = obstack_alloc(si->obst, pset_count(live) * sizeof(*spill_bb->reloads));
+               memset(spill_bb->reloads, 0xFF, pset_count(live) * sizeof(*spill_bb->reloads));
+       } else {
+               spill_bb->reloads = NULL;
+       }
 
        i=0;
        live_foreach(bb, li) {
@@ -1224,13 +1311,15 @@ luke_blockwalker(ir_node * bb, void * data)
                        spill = set_find_spill(spill_bb->ilp, irn);
                        assert(spill);
 
-                       ir_snprintf(buf, sizeof(buf), "reload_%N_%N", bb, irn);
-                       spill_bb->reloads[i] = lpp_add_var(si->lpp, buf, lpp_binary, COST_LOAD*execution_frequency(si, bb));
+                       if(spill_bb->reloads) {
+                               ir_snprintf(buf, sizeof(buf), "reload_%N_%N", bb, irn);
+                               spill_bb->reloads[i] = lpp_add_var(si->lpp, buf, lpp_binary, COST_LOAD*execution_frequency(si, bb));
 
-                       /* reload <= mem_out */
-                       cst = lpp_add_cst(si->lpp, buf, lpp_less, 0.0);
-                       lpp_set_factor_fast(si->lpp, cst, spill_bb->reloads[i], 1.0);
-                       lpp_set_factor_fast(si->lpp, cst, spill->mem_out, -1.0);
+                               /* reload <= mem_out */
+                               cst = lpp_add_cst(si->lpp, buf, lpp_less, 0.0);
+                               lpp_set_factor_fast(si->lpp, cst, spill_bb->reloads[i], 1.0);
+                               lpp_set_factor_fast(si->lpp, cst, spill->mem_out, -1.0);
+                       }
 
                        op = get_irn_link(irn);
                        assert(!op->is_remat);
@@ -1244,7 +1333,7 @@ luke_blockwalker(ir_node * bb, void * data)
 
                        /* reg_out - reload - remat - live_range <= 0 */
                        lpp_set_factor_fast(si->lpp, cst, spill->reg_out, 1.0);
-                       lpp_set_factor_fast(si->lpp, cst, spill_bb->reloads[i], -1.0);
+                       if(spill_bb->reloads) lpp_set_factor_fast(si->lpp, cst, spill_bb->reloads[i], -1.0);
                        lpp_set_factor_fast(si->lpp, cst, op->attr.live_range.ilp, -1.0);
                        foreach_pre_remat(si, bb, tmp) {
                                op_t     *remat_op = get_irn_link(tmp);
@@ -1265,8 +1354,10 @@ luke_blockwalker(ir_node * bb, void * data)
                                        k,
                                        d = 0;
                ilp_cst_t       check_pre,
-                                       check_post,
-                                       check_post_remat;
+                                       check_post;
+#ifdef CHECK_POST_REMAT
+               ilp_cst_t       check_post_remat;
+#endif
                set        *args = new_set(cmp_keyval, get_irn_arity(irn));
                keyval_t   *keyval;
 
@@ -1291,6 +1382,7 @@ luke_blockwalker(ir_node * bb, void * data)
                        }
                }
 
+#ifdef CHECK_POST_REMAT
                /* check the register pressure after the epilog */
                ir_snprintf(buf, sizeof(buf), "check_post_remat_%N", irn);
                check_post_remat = lpp_add_cst(si->lpp, buf, lpp_less, si->n_regs);
@@ -1312,12 +1404,17 @@ luke_blockwalker(ir_node * bb, void * data)
 
                        assert(remat_op->is_remat && !remat_op->attr.remat.pre);
 
-                       /* values that are defined by remats are not counted */
-                       /* TODO assert(val_op->attr.live_range.ilp)) */
+                       /* values that are defined by remat2s are not counted */
+#ifdef REMAT_WHILE_LIVE
+                       assert(val_op->attr.live_range.ilp);
+                       lpp_set_factor_fast(si->lpp, check_post_remat, val_op->attr.live_range.ilp, 0.0);
+#else
                        if(val_op->attr.live_range.ilp != ILP_UNDEF) {
                                lpp_set_factor_fast(si->lpp, check_post_remat, val_op->attr.live_range.ilp, 0.0);
                        }
+#endif /* REMAT_WHILE_LIVE */
                }
+#endif /* CHECK_POST_REMAT */
 
 
                /* new live ranges for values from L\U defined by remat2s or used by remats */
@@ -1374,9 +1471,9 @@ luke_blockwalker(ir_node * bb, void * data)
                                                }
                                        }
                                }
+fertig:
 #endif
 
-fertig:
                                if(prev_lr != ILP_UNDEF) {
                                        value_op->attr.live_range.ilp = prev_lr;
                                        value_op->attr.live_range.op = irn;
@@ -1393,7 +1490,9 @@ fertig:
                        while(is_Proj(proj)) {
                                if(has_reg_class(si, proj)) {
                                        ++d;
+#ifdef CHECK_POST_REMAT
                                        lpp_set_factor_fast(si->lpp, check_post_remat, proj_op->attr.live_range.ilp, 1.0);
+#endif
                                }
                                proj = sched_next(proj);
                                proj_op = get_irn_link(proj);
@@ -1401,7 +1500,9 @@ fertig:
                } else {
                        if(has_reg_class(si, irn)) {
                                 d = 1;
+#ifdef CHECK_POST_REMAT
                                 lpp_set_factor_fast(si->lpp, check_post_remat, op->attr.live_range.ilp, 1.0);
+#endif
                        }
                }
                DBG((si->dbg, LEVEL_4, "\t   %+F produces %d values in my register class\n", irn, d));
@@ -1435,7 +1536,7 @@ fertig:
                        assert(i<n);
 
                        ir_snprintf(buf, sizeof(buf), "reload_%N_%N", arg, irn);
-                       op->attr.live_range.reloads[i] = lpp_add_var(si->lpp, buf, lpp_binary, COST_LOAD*execution_frequency(si, irn));
+                       op->attr.live_range.reloads[i] = lpp_add_var(si->lpp, buf, lpp_binary, COST_LOAD*execution_frequency(si, bb));
 
                        /* reload <= mem_out */
                        cst = lpp_add_cst(si->lpp, buf, lpp_less, 0.0);
@@ -1473,7 +1574,9 @@ fertig:
                                lpp_set_factor_fast(si->lpp, cst, post_use, -1.0);
                                lpp_set_factor_fast(si->lpp, cst, arg_op->attr.live_range.ilp, 1.0);
 
+#ifdef CHECK_POST_REMAT
                                lpp_set_factor_fast(si->lpp, check_post_remat, arg_op->attr.live_range.ilp, 1.0);
+#endif
                        }
 
                        /*forall remat2 which use arg add a similar cst*/
@@ -1617,6 +1720,7 @@ fertig:
                        }
                }
 
+#ifdef CHECK_POST_REMAT
                /* iterate over following remats and add them to check_post_remat */
                foreach_post_remat(irn, tmp) {
                        op_t           *remat_op = get_irn_link(tmp);
@@ -1625,6 +1729,8 @@ fertig:
 
                        lpp_set_factor_fast(si->lpp, check_post_remat, remat_op->attr.remat.ilp, 1.0);
                }
+#endif
+
 
 
                DBG((si->dbg, LEVEL_4, "\t   %d values live at %+F\n", pset_count(live), irn));
@@ -1723,54 +1829,28 @@ fertig:
        }
 
        /* mem_in/reg_in for live_in values, especially phis and their arguments */
-//     if(get_Block_n_cfgpreds(bb) > 1) {
-               pset_foreach(live, irn) {
-                       int          p = 0,
-                                                i,
-                                                n;
-
-                       spill = set_find_spill(spill_bb->ilp, irn);
-                       assert(spill && spill->irn == irn);
-
-                       if(is_Phi(irn) && get_nodes_block(irn) == bb) {
-                               for (i = 0, n = get_Phi_n_preds(irn); i < n; ++i) {
-                                       ilp_cst_t       mem_in,
-                                                                       reg_in;
-                                       ir_node        *phi_arg = get_Phi_pred(irn, i);
-                                       ir_node        *bb_p = get_Block_cfgpred_block(bb, i);
-                                       spill_bb_t     *spill_bb_p = get_irn_link(bb_p);
-                                       spill_t        *spill_p;
-
-                                       /* although the phi is in the right regclass one or more of
-                                        * its arguments can be in a different one or at least to
-                                        * ignore
-                                        */
-                                       if(has_reg_class(si, phi_arg)) {
-                                               ir_snprintf(buf, sizeof(buf), "mem_in_%N_%N-%d", irn, bb, p);
-                                               mem_in = lpp_add_cst(si->lpp, buf, lpp_less, 0.0);
-                                               ir_snprintf(buf, sizeof(buf), "reg_in_%N_%N-%d", irn, bb, p++);
-                                               reg_in = lpp_add_cst(si->lpp, buf, lpp_less, 0.0);
-
-                                               lpp_set_factor_fast(si->lpp, mem_in, spill->mem_in, 1.0);
-                                               lpp_set_factor_fast(si->lpp, reg_in, spill->reg_in, 1.0);
-
-                                               spill_p = set_find_spill(spill_bb_p->ilp, phi_arg);
-                                               assert(spill_p);
-
-                                               lpp_set_factor_fast(si->lpp, mem_in, spill_p->mem_out, -1.0);
-                                               lpp_set_factor_fast(si->lpp, reg_in, spill_p->reg_out, -1.0);
-                                       }
-                               }
-                       } else {
-                               /* else assure the value arrives on all paths in the same resource */
+       pset_foreach(live, irn) {
+               int          p = 0,
+                                        i,
+                                        n;
 
-                               for (i = 0, n = get_Block_n_cfgpreds(bb); i < n; ++i) {
-                                       ilp_cst_t       mem_in,
-                                                                       reg_in;
-                                       ir_node        *bb_p = get_Block_cfgpred_block(bb, i);
-                                       spill_bb_t     *spill_bb_p = get_irn_link(bb_p);
-                                       spill_t        *spill_p;
+               spill = set_find_spill(spill_bb->ilp, irn);
+               assert(spill && spill->irn == irn);
 
+               if(is_Phi(irn) && get_nodes_block(irn) == bb) {
+                       for (i = 0, n = get_Phi_n_preds(irn); i < n; ++i) {
+                               ilp_cst_t       mem_in,
+                                                               reg_in;
+                               ir_node        *phi_arg = get_Phi_pred(irn, i);
+                               ir_node        *bb_p = get_Block_cfgpred_block(bb, i);
+                               spill_bb_t     *spill_bb_p = get_irn_link(bb_p);
+                               spill_t        *spill_p;
+
+                               /* although the phi is in the right regclass one or more of
+                                * its arguments can be in a different one or at least to
+                                * ignore
+                                */
+                               if(has_reg_class(si, phi_arg)) {
                                        ir_snprintf(buf, sizeof(buf), "mem_in_%N_%N-%d", irn, bb, p);
                                        mem_in = lpp_add_cst(si->lpp, buf, lpp_less, 0.0);
                                        ir_snprintf(buf, sizeof(buf), "reg_in_%N_%N-%d", irn, bb, p++);
@@ -1779,15 +1859,39 @@ fertig:
                                        lpp_set_factor_fast(si->lpp, mem_in, spill->mem_in, 1.0);
                                        lpp_set_factor_fast(si->lpp, reg_in, spill->reg_in, 1.0);
 
-                                       spill_p = set_find_spill(spill_bb_p->ilp, irn);
+                                       spill_p = set_find_spill(spill_bb_p->ilp, phi_arg);
                                        assert(spill_p);
 
                                        lpp_set_factor_fast(si->lpp, mem_in, spill_p->mem_out, -1.0);
                                        lpp_set_factor_fast(si->lpp, reg_in, spill_p->reg_out, -1.0);
                                }
                        }
+               } else {
+                       /* else assure the value arrives on all paths in the same resource */
+
+                       for (i = 0, n = get_Block_n_cfgpreds(bb); i < n; ++i) {
+                               ilp_cst_t       mem_in,
+                                                               reg_in;
+                               ir_node        *bb_p = get_Block_cfgpred_block(bb, i);
+                               spill_bb_t     *spill_bb_p = get_irn_link(bb_p);
+                               spill_t        *spill_p;
+
+                               ir_snprintf(buf, sizeof(buf), "mem_in_%N_%N-%d", irn, bb, p);
+                               mem_in = lpp_add_cst(si->lpp, buf, lpp_less, 0.0);
+                               ir_snprintf(buf, sizeof(buf), "reg_in_%N_%N-%d", irn, bb, p++);
+                               reg_in = lpp_add_cst(si->lpp, buf, lpp_less, 0.0);
+
+                               lpp_set_factor_fast(si->lpp, mem_in, spill->mem_in, 1.0);
+                               lpp_set_factor_fast(si->lpp, reg_in, spill->reg_in, 1.0);
+
+                               spill_p = set_find_spill(spill_bb_p->ilp, irn);
+                               assert(spill_p);
+
+                               lpp_set_factor_fast(si->lpp, mem_in, spill_p->mem_out, -1.0);
+                               lpp_set_factor_fast(si->lpp, reg_in, spill_p->reg_out, -1.0);
+                       }
                }
-//     }
+       }
 
        /* first live ranges from reg_ins */
        pset_foreach(live, irn) {
@@ -1812,15 +1916,16 @@ fertig:
 
        /* walk forward now and compute constraints for placing spills */
        /* this must only be done for values that are not defined in this block */
+       /* TODO are these values at start of block? if yes, just check whether this is a diverge edge and skip the loop */
        pset_foreach(live, irn) {
-               ir_snprintf(buf, sizeof(buf), "req_spill_%N_%N", spill->irn, bb);
-               cst = lpp_add_cst(si->lpp, buf, lpp_less, 0.0);
-
                spill = set_find_spill(spill_bb->ilp, irn);
                assert(spill);
 
+               ir_snprintf(buf, sizeof(buf), "req_spill_%N_%N", irn, bb);
+               cst = lpp_add_cst(si->lpp, buf, lpp_less, 0.0);
+
                lpp_set_factor_fast(si->lpp, cst, spill->spill, 1.0);
-               lpp_set_factor_fast(si->lpp, cst, spill->reg_in, -1.0);
+               if(is_diverge_edge(bb)) lpp_set_factor_fast(si->lpp, cst, spill->reg_in, -1.0);
 
                sched_foreach_op(bb, tmp) {
                        op_t   *op = get_irn_link(tmp);
@@ -1865,89 +1970,15 @@ fertig:
         */
 
 
-/*     ir_snprintf(buf, sizeof(buf), "next_lr_%N_%N", value, irn);
-       cst = lpp_add_cst(si->lpp, buf, lpp_less, 0.0);
-       lpp_set_factor_fast(si->lpp, cst, value_op->attr.live_range.ilp, 1.0);
-*/
-
-#if 0
-       ir_snprintf(buf, sizeof(buf), "ce2_%N_%N_%d", op, irn, i);
-       cst = lpp_add_cst(si->lpp, buf, lpp_less, n_operands);
-       lpp_set_factor_fast(si->lpp, cst, op_lr->in_mem_var, -1.0);
-       op_lr->is_remat_var = lpp_add_var(si->lpp, buf, lpp_binary, COST_REMAT);
-#endif
-
        del_pset(live);
 }
 
 
 #if 0
-static void
-process_block(ir_node * block, void *data)
-{
-       spill_ilp_t    *si = data;
-       int             step = 0;
-       int             n_regs = arch_register_class_n_regs(si->cls);
-       int             n_preds = get_irn_arity(block);
-       irn_live_t     *li;
-       ir_node        *irn;
-
-       DBG((si->dbg, LEVEL_3, "\nProcessing %+F\n", block));
-
-       /*
-        * compute all possible remats, sorted by operands *and* resulting
-        * values
-        */
-       ir_node        *new_r_Tuple(ir_graph * irg, ir_node * block, int arity, ir_node * in[]);
-       ir_node        *new_r_Proj(ir_graph * irg, ir_node * block, ir_node * arg, ir_mode * mode, long proj);
-       void           *get_irn_link(const ir_node * node);
-       void            set_irn_link(ir_node * node, void *link);
-
-       int             block_dominates(const ir_node * a, const ir_node * b);
-
-
-       void            compute_doms(ir_graph * irg);
-       void            free_dom(ir_graph * irg);
-
-
-       /*
-        * nochmal: - laufe in dominatorreihenfolge die bloecke ab -
-        * sammle dabei moeglichkeiten zur rematerialisierung von werten
-        * (auch inverse etc.) - mehrere listen: fuer zielwert und fuer
-        * jeden operanden (get_remats_with_operand) - ignoriere projs
-        * (und phis?) und andere registerklassen - wenn ich nun auf eine
-        * op treffe füge ich für alle operanden mögliche remats ein
-        * (keine reloads und spills) in den schedule direkt vor der op
-        * ein (in der Hoffnung, dass diese nicht wegoptimiert werden...).
-        * - dann Ã¼berspringe ich die folgenden projs - nach den projs
-        * wird der epilog eingefügt (nur remats, keine spills) -
-        * lebendigkeit neu berechnen
-        *
-        * -durchlaufe den ganzen mist nochmal und emittiere dabei das ILP:
-        *
-        *
-        * datenhaltung: - remat merkt sich die zugehörige ilp_var und op
-        * - op merkt sich (zugehörige remats) - block merkt sich ilp_vars
-        * für spills
-        *
-        * nachbearbeitung: - durchlaufe alle instruktionen, dabei entferne
-        * nicht benötigte remats und füge benötigte reloads ein - wenn
-        * spill in Grundblock eingesetzt werden soll durchlaufen wir den
-        * Block von oben runter und setzen das spill an der letzten
-        * möglichen Stelle vor der ersten Benutzung/Reload ein. -
         * Speicherkopienminimierung: teste Speicherwerte auf Interferenz
         * und weise Spillkontexte zu. Sorge bei Phis dafuer, dass gleiche
         * Kontexte zusammenfliessen (Operanden und Ergebnis hat gleichen
         * Kontext)
-        *
-        * nachoptimierung: - laufe wieder von oben nach unten durch jeden
-        * block - wenn ich ein reload finde, dann schiebe es hoch solange
-        * es der Registerdruck erlaubt (wenn ich es Ã¼ber sein
-        * zugehöriges spill schiebe ist das ein bug, oder?)
-        *
-        */
-
-}
 #endif
 
 static INLINE int
@@ -1972,6 +2003,7 @@ is_mem_phi(const ir_node * phi, void *data)
        return 0;
 }
 
+#ifdef KEEPALIVE
 static int mark_remat_nodes_hook(FILE *F, ir_node *n, ir_node *l)
 {
        spill_ilp_t *si = get_irg_link(current_ir_graph);
@@ -1980,13 +2012,26 @@ static int mark_remat_nodes_hook(FILE *F, ir_node *n, ir_node *l)
                op_t   *op = (op_t*)get_irn_link(n);
                assert(op && op->is_remat);
 
-               if(op->attr.remat.pre) {
-                       ir_fprintf(F, "color:red info3:\"remat value: %+F\"", op->attr.remat.remat->value);
+               if(!op->attr.remat.remat->inverse) {
+                       if(op->attr.remat.pre) {
+                               ir_fprintf(F, "color:red info3:\"remat value: %+F\"", op->attr.remat.remat->value);
+                       } else {
+                               ir_fprintf(F, "color:orange info3:\"remat2 value: %+F\"", op->attr.remat.remat->value);
+                       }
+
+                       return 1;
                } else {
-                       ir_fprintf(F, "color:orange info3:\"remat2 value: %+F\"", op->attr.remat.remat->value);
-               }
+                       op_t   *op = (op_t*)get_irn_link(n);
+                       assert(op && op->is_remat);
 
-               return 1;
+                       if(op->attr.remat.pre) {
+                               ir_fprintf(F, "color:cyan info3:\"remat inverse value: %+F\"", op->attr.remat.remat->value);
+                       } else {
+                               ir_fprintf(F, "color:lightcyan info3:\"remat2 inverse value: %+F\"", op->attr.remat.remat->value);
+                       }
+
+                       return 1;
+               }
        }
 
        return 0;
@@ -1999,6 +2044,7 @@ dump_graph_with_remats(ir_graph * irg, const char * suffix)
        be_dump(irg, suffix, dump_ir_block_graph_sched);
        set_dump_node_vcgattr_hook(NULL);
 }
+#endif
 
 /**
  * Edge hook to dump the schedule edges with annotated register pressure.
@@ -2048,11 +2094,11 @@ walker_pressure_annotator(ir_node * bb, void * data)
                }
        }
 
-       set_irn_link(bb, (void*)pset_count(live));
+       set_irn_link(bb, INT_TO_PTR(pset_count(live)));
 
        sched_foreach_reverse(bb, irn) {
                if(is_Phi(irn)) {
-                       set_irn_link(irn, (void*)pset_count(live));
+                       set_irn_link(irn, INT_TO_PTR(pset_count(live)));
                        continue;
                }
 
@@ -2068,7 +2114,7 @@ walker_pressure_annotator(ir_node * bb, void * data)
 
                        if(has_reg_class(si, arg)) pset_insert_ptr(live, arg);
                }
-               set_irn_link(irn, (void*)pset_count(live)+projs);
+               set_irn_link(irn, INT_TO_PTR(pset_count(live)+projs));
        }
 
        del_pset(live);
@@ -2107,6 +2153,32 @@ connect_all_remats_with_keep(spill_ilp_t * si)
 }
 #endif
 
+static void
+connect_all_spills_with_keep(spill_ilp_t * si)
+{
+       ir_node   *irn;
+       ir_node  **ins,
+                        **pos;
+       int        n_spills;
+       ir_node   *keep;
+
+
+       n_spills = pset_count(si->spills);
+       if(n_spills) {
+               ins = obstack_alloc(si->obst, n_spills * sizeof(*ins));
+
+               pos = ins;
+               pset_foreach(si->spills, irn) {
+                       *pos = irn;
+                       ++pos;
+               }
+
+               keep = be_new_Keep(si->chordal_env->cls, si->chordal_env->irg, get_irg_end_block(si->chordal_env->irg), n_spills, ins);
+
+               obstack_free(si->obst, ins);
+       }
+}
+
 /** insert a spill at an arbitrary position */
 ir_node *be_spill2(const arch_env_t *arch_env, ir_node *irn, ir_node *insert, ir_node *ctx)
 {
@@ -2191,11 +2263,11 @@ clean_remat_info(spill_ilp_t * si)
 static void
 delete_unnecessary_remats(spill_ilp_t * si)
 {
+#ifdef KEEPALIVE
        int       i,
                      n;
        ir_node  *bad = get_irg_bad(si->chordal_env->irg);
 
-#ifdef KEEPALIVE
        if(si->keep) {
                ir_node   *end = get_irg_end(si->chordal_env->irg);
                ir_node  **keeps;
@@ -2214,10 +2286,18 @@ delete_unnecessary_remats(spill_ilp_t * si)
                                /* TODO check whether reload is preferred over remat (could be bug) */
                                delete_remat(si, keep_arg);
                        } else {
-                               if(arg_op->attr.remat.pre) {
-                                       DBG((si->dbg, LEVEL_2, "\t**remat kept: %+F\n", keep_arg));
+                               if(!arg_op->attr.remat.remat->inverse) {
+                                       if(arg_op->attr.remat.pre) {
+                                               DBG((si->dbg, LEVEL_2, "\t**remat kept: %+F\n", keep_arg));
+                                       } else {
+                                               DBG((si->dbg, LEVEL_2, "\t%%%%remat2 kept: %+F\n", keep_arg));
+                                       }
                                } else {
-                                       DBG((si->dbg, LEVEL_2, "\t%%%%remat2 kept: %+F\n", keep_arg));
+                                       if(arg_op->attr.remat.pre) {
+                                               DBG((si->dbg, LEVEL_2, "\t**INVERSE remat kept: %+F\n", keep_arg));
+                                       } else {
+                                               DBG((si->dbg, LEVEL_2, "\t%%%%INVERSE remat2 kept: %+F\n", keep_arg));
+                                       }
                                }
                        }
 
@@ -2250,10 +2330,18 @@ delete_unnecessary_remats(spill_ilp_t * si)
                        /* TODO check whether reload is preferred over remat (could be bug) */
                        delete_remat(si, remat);
                } else {
-                       if(remat_op->attr.remat.pre) {
-                               DBG((si->dbg, LEVEL_2, "\t**remat kept: %+F\n", remat));
+                       if(!remat_op->attr.remat.remat->inverse) {
+                               if(remat_op->attr.remat.pre) {
+                                       DBG((si->dbg, LEVEL_2, "\t**remat kept: %+F\n", remat));
+                               } else {
+                                       DBG((si->dbg, LEVEL_2, "\t%%%%remat2 kept: %+F\n", remat));
+                               }
                        } else {
-                               DBG((si->dbg, LEVEL_2, "\t%%%%remat2 kept: %+F\n", remat));
+                               if(remat_op->attr.remat.pre) {
+                                       DBG((si->dbg, LEVEL_2, "\t**INVERSE remat kept: %+F\n", remat));
+                               } else {
+                                       DBG((si->dbg, LEVEL_2, "\t%%%%INVERSE remat2 kept: %+F\n", remat));
+                               }
                        }
                }
        }
@@ -2283,12 +2371,47 @@ insert_spill(spill_ilp_t * si, const ir_node * irn, const ir_node * value, const
        defs->spills = spill;
 
 #ifdef KEEPALIVE_SPILLS
-       keep_alive(spill);
+       pset_insert_ptr(si->spills, spill);
 #endif
 
        return spill;
 }
 
+/**
+ * @param before   The Phi node which has to be spilled
+ */
+static ir_node *
+insert_mem_phi(spill_ilp_t * si, const ir_node * phi)
+{
+       ir_node   *mem_phi;
+       ir_node  **ins;
+       defs_t    *defs;
+       int        i,
+                          n;
+
+       NEW_ARR_A(ir_node*, ins, get_irn_arity(phi));
+
+       for(i=0,n=get_irn_arity(phi); i<n; ++i) {
+               ins[i] = si->m_unknown;
+       }
+
+       mem_phi =  new_r_Phi(si->chordal_env->irg, get_nodes_block(phi), get_irn_arity(phi), ins, mode_M);
+
+       defs = set_insert_def(si->values, phi);
+       assert(defs);
+
+       /* enter into the linked list */
+       set_irn_link(mem_phi, defs->spills);
+       defs->spills = mem_phi;
+
+       sched_add_after(phi, mem_phi);
+
+#ifdef KEEPALIVE_SPILLS
+       pset_insert_ptr(si->spills, mem_phi);
+#endif
+
+       return mem_phi;
+}
 
 /**
  * Add remat to list of defs, destroys link field!
@@ -2309,6 +2432,7 @@ insert_remat(spill_ilp_t * si, ir_node * remat)
        defs->remats = remat;
 }
 
+#if 0
 static void
 collect_spills(spill_ilp_t * si, ir_node * value, pset * spills, pset * visited)
 {
@@ -2338,15 +2462,26 @@ collect_spills(spill_ilp_t * si, ir_node * value, pset * spills, pset * visited)
 //             assert(0 && "Phi operand not spilled");
        }
 }
+#endif
 
 static pset *
 get_spills_for_value(spill_ilp_t * si, ir_node * value)
 {
        pset     *spills = pset_new_ptr_default();
-       pset     *visited = pset_new_ptr_default();
+//     pset     *visited = pset_new_ptr_default();
+
+//     collect_spills(si, value, spills, visited);
+//     del_pset(visited);
+       ir_node  *next;
+       defs_t   *defs;
 
-       collect_spills(si, value, spills, visited);
-       del_pset(visited);
+       defs = set_find_def(si->values, value);
+
+       if(defs && defs->spills) {
+               for(next = defs->spills; next; next = get_irn_link(next)) {
+                       pset_insert_ptr(spills, next);
+               }
+       }
 
        return spills;
 }
@@ -2366,6 +2501,7 @@ insert_reload(spill_ilp_t * si, const ir_node * value, const ir_node * after)
 
        defs = set_find_def(si->values, value);
        /* get a spill of this value */
+#if 0
        if((!defs || !defs->spills) && is_Phi(value)) {
                pset  *spills;
 
@@ -2377,9 +2513,13 @@ insert_reload(spill_ilp_t * si, const ir_node * value, const ir_node * after)
                if(!defs) {
                        defs = set_insert_def(si->values, value);
                }
+               defs->spills = spill;
+               set_irn_link(spill, NULL);
        } else {
                spill = defs->spills;
        }
+#endif
+       spill = defs->spills;
        assert(spill && "no spill placed before reload");
 
        reload = be_reload(arch_env, si->cls, after, get_irn_mode(value), spill);
@@ -2389,7 +2529,6 @@ insert_reload(spill_ilp_t * si, const ir_node * value, const ir_node * after)
        defs->remats = reload;
 
        return reload;
-
 }
 
 static void
@@ -2403,12 +2542,14 @@ walker_spill_placer(ir_node * bb, void * data) {
        set_foreach(spill_bb->ilp, spill) {
                lpp_name_t    *name;
 
-               assert(spill->spill > 0);
-
                if(is_Phi(spill->irn) && get_nodes_block(spill->irn) == bb) {
                        name = si->lpp->vars[spill->mem_in];
                        if(!is_zero(name->value)) {
-                               DBG((si->dbg, LEVEL_2, "\t >>spilled Phi %+F\n", spill->irn));
+                               ir_node   *mem_phi;
+
+                               mem_phi = insert_mem_phi(si, spill->irn);
+
+                               DBG((si->dbg, LEVEL_2, "\t >>spilled Phi %+F -> %+F\n", spill->irn, mem_phi));
                        }
                }
 
@@ -2455,6 +2596,43 @@ walker_spill_placer(ir_node * bb, void * data) {
        del_pset(spills_to_do);
 }
 
+static void
+phim_fixer(spill_ilp_t *si) {
+       defs_t  *defs;
+
+       set_foreach(si->values, defs) {
+               const ir_node  *phi = defs->value;
+               ir_node  *phi_m = NULL;
+               ir_node  *next = defs->spills;
+               int       i,
+                                 n;
+
+               if(!is_Phi(phi)) continue;
+
+               while(next) {
+                       if(is_Phi(next) && get_irn_mode(next) == mode_M) {
+                               phi_m = next;
+                               break;
+                       } else {
+                               next = get_irn_link(next);
+                       }
+               }
+               if(!phi_m) continue;
+
+               for(i=0,n=get_irn_arity(phi); i<n; ++i) {
+                       const ir_node  *value = get_irn_n(phi, i);
+                       defs_t         *val_defs = set_find_def(si->values, value);
+
+                       /* get a spill of this value */
+                       ir_node      *spill = val_defs->spills;
+
+                       assert(spill && "no spill placed before PhiM");
+
+                       set_irn_n(phi_m, i, spill);
+               }
+       }
+}
+
 static void
 walker_reload_placer(ir_node * bb, void * data) {
        spill_ilp_t   *si = (spill_ilp_t*)data;
@@ -2486,25 +2664,34 @@ walker_reload_placer(ir_node * bb, void * data) {
                                        if(!is_zero(name->value)) {
                                                ir_node    *reload;
                                                ir_node    *insert_pos = irn;
-                                               ir_node    *prev = sched_prev(insert_pos);
-                                               op_t       *prev_op = get_irn_link(prev);
+                                               ir_node    *prev = insert_pos;
+                                               op_t       *prev_op;
+
+                                               do {
+                                                       prev = sched_prev(prev);
+                                               } while(be_is_Spill(prev));
+
+                                               prev_op = get_irn_link(prev);
 
                                                /* insert reload before pre-remats */
-                                               while(!sched_is_end(prev) && !be_is_Reload(prev) && !be_is_Spill(prev)
+                                               while(!sched_is_end(prev) && !be_is_Reload(prev) && !is_Phi(prev)
                                                                && prev_op->is_remat && prev_op->attr.remat.pre) {
                                                        insert_pos = prev;
 
-                                                       prev = sched_prev(insert_pos);
+                                                       do {
+                                                               prev = sched_prev(prev);
+                                                       } while(be_is_Spill(prev));
+
                                                        prev_op = get_irn_link(prev);
+
                                                }
 
                                                reload = insert_reload(si, arg, insert_pos);
 
                                                set_irn_n(irn, i, reload);
 
-
-#ifdef KEEPALIVE_SPILLS
-                                               keep_alive(reload);
+#ifdef KEEPALIVE_RELOADS
+                                               pset_insert_ptr(si->spills, reload);
 #endif
                                        }
                                }
@@ -2512,36 +2699,39 @@ walker_reload_placer(ir_node * bb, void * data) {
                }
        }
 
-       i=0;
-       live_foreach(bb, li) {
-               ir_node        *irn = (ir_node *) li->irn;
+       /* reloads at end of block */
+       if(spill_bb->reloads) {
+               i=0;
+               live_foreach(bb, li) {
+                       ir_node        *irn = (ir_node *) li->irn;
 
-               if (live_is_end(li) && has_reg_class(si, irn) && !pset_find_ptr(si->all_possible_remats, irn)) {
-                       lpp_name_t    *name;
+                       if (live_is_end(li) && has_reg_class(si, irn) && !pset_find_ptr(si->all_possible_remats, irn)) {
+                               lpp_name_t    *name;
 
-                       name = si->lpp->vars[spill_bb->reloads[i]];
-                       if(!is_zero(name->value)) {
-                               ir_node    *reload;
-                               ir_node    *insert_pos = bb;
-                               ir_node    *prev = sched_prev(insert_pos);
-                               op_t       *prev_op = get_irn_link(prev);
-
-                               /* insert reload before pre-remats */
-                               while(!sched_is_end(prev) && !be_is_Reload(prev) && !be_is_Spill(prev)
-                                               && prev_op->is_remat && prev_op->attr.remat.pre) {
-                                       insert_pos = prev;
-
-                                       prev = sched_prev(insert_pos);
-                                       prev_op = get_irn_link(prev);
-                               }
+                               name = si->lpp->vars[spill_bb->reloads[i]];
+                               if(!is_zero(name->value)) {
+                                       ir_node    *reload;
+                                       ir_node    *insert_pos = bb;
+                                       ir_node    *prev = sched_prev(insert_pos);
+                                       op_t       *prev_op = get_irn_link(prev);
+
+                                       /* insert reload before pre-remats */
+                                       while(!sched_is_end(prev) && !be_is_Reload(prev) && !be_is_Spill(prev)
+                                                       && prev_op->is_remat && prev_op->attr.remat.pre) {
+                                               insert_pos = prev;
+
+                                               prev = sched_prev(insert_pos);
+                                               prev_op = get_irn_link(prev);
+                                       }
 
-                               reload = insert_reload(si, irn, insert_pos);
+                                       reload = insert_reload(si, irn, insert_pos);
 
-#ifdef KEEPALIVE_SPILLS
-                               keep_alive(reload);
+#ifdef KEEPALIVE_RELOADS
+                                       pset_insert_ptr(si->spills, reload);
 #endif
+                               }
+                               ++i;
                        }
-                       ++i;
                }
        }
 
@@ -2556,12 +2746,17 @@ walker_collect_used(ir_node * irn, void * data)
        lc_bitset_set(used, get_irn_idx(irn));
 }
 
+struct kill_helper {
+       lc_bitset_t  *used;
+       spill_ilp_t  *si;
+};
+
 static void
 walker_kill_unused(ir_node * bb, void * data)
 {
-       lc_bitset_t     *used = data;
-       const ir_node   *bad = get_irg_bad(get_irn_irg(bb));
-       ir_node         *irn;
+       struct kill_helper *kh = data;
+       const ir_node      *bad = get_irg_bad(get_irn_irg(bb));
+       ir_node            *irn;
 
 
        for(irn=sched_first(bb); !sched_is_end(irn);) {
@@ -2569,15 +2764,16 @@ walker_kill_unused(ir_node * bb, void * data)
                int          i,
                                         n;
 
-               if(!lc_bitset_is_set(used, get_irn_idx(irn))) {
-                       assert(!be_is_Spill(irn) && !be_is_Reload(irn) && "something is fishy, spill or remat is unused");
+               if(!lc_bitset_is_set(kh->used, get_irn_idx(irn))) {
+                       if(be_is_Spill(irn) || be_is_Reload(irn)) {
+                               DBG((kh->si->dbg, LEVEL_1, "\t SUBOPTIMAL! %+F IS UNUSED (cost: %g)\n", irn, get_cost(kh->si, irn)*execution_frequency(kh->si, bb)));
+                               assert(lpp_get_sol_state(kh->si->lpp) != lpp_optimal && "optimal solution is suboptimal?");
+                       }
 
                        sched_remove(irn);
 
                        set_nodes_block(irn, bad);
                        for (i = 0, n = get_irn_arity(irn); i < n; ++i) {
-                               ir_node             *arg = get_irn_n(irn, i);
-
                                set_irn_n(irn, i, bad);
                        }
                }
@@ -2588,12 +2784,15 @@ walker_kill_unused(ir_node * bb, void * data)
 static void
 kill_all_unused_values_in_schedule(spill_ilp_t * si)
 {
-       lc_bitset_t   *used = lc_bitset_malloc(get_irg_last_idx(si->chordal_env->irg));
+       struct kill_helper kh;
 
-       irg_walk_graph(si->chordal_env->irg, walker_collect_used, NULL, used);
-       irg_block_walk_graph(si->chordal_env->irg, walker_kill_unused, NULL, used);
+       kh.used = lc_bitset_malloc(get_irg_last_idx(si->chordal_env->irg));
+       kh.si = si;
 
-       lc_bitset_free(used);
+       irg_walk_graph(si->chordal_env->irg, walker_collect_used, NULL, kh.used);
+       irg_block_walk_graph(si->chordal_env->irg, walker_kill_unused, NULL, &kh);
+
+       lc_bitset_free(kh.used);
 }
 
 static void
@@ -2611,6 +2810,9 @@ rewire_uses(spill_ilp_t * si)
 {
        dom_front_info_t     *dfi = be_compute_dominance_frontiers(si->chordal_env->irg);
        defs_t               *defs;
+       pset                 *ignore = pset_new_ptr(1);
+
+       pset_insert_ptr(ignore, get_irg_end(si->chordal_env->irg));
 
        /* then fix uses of spills */
        set_foreach(si->values, defs) {
@@ -2619,30 +2821,29 @@ rewire_uses(spill_ilp_t * si)
                ir_node  *next = defs->remats;
                int remats = 0;
 
-               if(next) {
-                       reloads = pset_new_ptr_default();
+               reloads = pset_new_ptr_default();
 
-                       while(next) {
-                               if(be_is_Reload(next)) {
-                                       pset_insert_ptr(reloads, next);
-                               } else {
-                                       ++remats;
-                               }
-                               next = get_irn_link(next);
+               while(next) {
+                       if(be_is_Reload(next)) {
+                               pset_insert_ptr(reloads, next);
+                       } else {
+                               ++remats;
                        }
+                       next = get_irn_link(next);
+               }
 
-                       spills = get_spills_for_value(si, defs->value);
-                       DBG((si->dbg, LEVEL_2, "\t  %d remats, %d reloads, and %d spills for value %+F\n", remats, pset_count(reloads), pset_count(spills), defs->value));
-                       if(pset_count(spills) > 1) {
-                               assert(pset_count(reloads) > 0);
-//                             print_irn_pset(spills);
-//                             print_irn_pset(reloads);
-                               be_ssa_constr_set_uses(dfi, spills, reloads);
-                       }
+               spills = get_spills_for_value(si, defs->value);
+               DBG((si->dbg, LEVEL_2, "\t  %d remats, %d reloads, and %d spills for value %+F\n", remats, pset_count(reloads), pset_count(spills), defs->value));
+               if(pset_count(spills) > 1) {
+                       //assert(pset_count(reloads) > 0);
+                       //                              print_irn_pset(spills);
+                       //                              print_irn_pset(reloads);
 
-                       del_pset(reloads);
-                       del_pset(spills);
+                       be_ssa_constr_set_ignore(dfi, spills, ignore);
                }
+
+               del_pset(reloads);
+               del_pset(spills);
        }
 
        /* first fix uses of remats and reloads */
@@ -2682,7 +2883,9 @@ writeback_results(spill_ilp_t * si)
 
        DBG((si->dbg, LEVEL_1, "Applying results\n"));
        delete_unnecessary_remats(si);
+       si->m_unknown = new_r_Unknown(si->chordal_env->irg, mode_M);
        irg_block_walk_graph(si->chordal_env->irg, walker_spill_placer, NULL, si);
+       phim_fixer(si);
        irg_block_walk_graph(si->chordal_env->irg, walker_reload_placer, NULL, si);
 
        /* clean the remat info! there are still back-edges leading there! */
@@ -2690,6 +2893,8 @@ writeback_results(spill_ilp_t * si)
 
        rewire_uses(si);
 
+       connect_all_spills_with_keep(si);
+
        del_set(si->values);
 }
 
@@ -2710,27 +2915,16 @@ get_n_regs(spill_ilp_t * si)
        return free;
 }
 
-#if 0
-#define  enqueue(r) if(reloads[ins_pos]) put(); \
-                       assert(reloads[ins_pos]==NULL); \
-                    reload[ins_pos] = (r); \
-                    ins_pos = (ins_pos+1)%si->n_regs; \
-                    ++count
-#define  put() if(reloads[del_pos]) sched_put_before(irn, reloads[del_pos]); \
-               reloads[del_pos] = NULL; \
-               del_pos = (del_pos+1)%si->n_regs; \
-               --count;
-#endif
 static void
 walker_reload_mover(ir_node * bb, void * data)
 {
        spill_ilp_t   *si = data;
-       ir_node           *irn;
+       ir_node           *tmp;
 
-       sched_foreach(bb, irn) {
-               if(be_is_Reload(irn) && has_reg_class(si, irn)) {
-                       ir_node       *reload = irn;
-                       ir_node       *irn = irn;
+       sched_foreach(bb, tmp) {
+               if(be_is_Reload(tmp) && has_reg_class(si, tmp)) {
+                       ir_node       *reload = tmp;
+                       ir_node       *irn = tmp;
 
                        /* move reload upwards */
 
@@ -2744,7 +2938,7 @@ walker_reload_mover(ir_node * bb, void * data)
                                while(pressure < si->n_regs) {
                                        if(sched_is_end(irn) || (be_is_Reload(irn) && has_reg_class(si, irn))) break;
 
-                                       set_irn_link(irn, (void*)(pressure+1));
+                                       set_irn_link(irn, INT_TO_PTR(pressure+1));
                                        DBG((si->dbg, LEVEL_5, "new regpressure before %+F: %d\n", irn, pressure+1));
                                        irn = sched_prev(irn);
 
@@ -2753,9 +2947,6 @@ walker_reload_mover(ir_node * bb, void * data)
 
                                DBG((si->dbg, LEVEL_3, "putting reload %+F after %+F\n", reload, irn));
                                sched_put_after(irn, reload);
-//                             sched_add_after(irn, reload);
-//                             pressure = (int)get_irn_link(sched_next(reload));
-//                             set_irn_link(reload, (void*)(pressure-1));
                        }
                }
        }
@@ -2780,7 +2971,7 @@ be_spill_remat(const be_chordal_env_t * chordal_env)
        ir_snprintf(problem_name, sizeof(problem_name), "%F_%s", chordal_env->irg, chordal_env->cls->name);
        ir_snprintf(dump_suffix, sizeof(dump_suffix), "-%s-remats", chordal_env->cls->name);
        ir_snprintf(dump_suffix2, sizeof(dump_suffix2), "-%s-pressure", chordal_env->cls->name);
-       ir_snprintf(dump_suffix2, sizeof(dump_suffix3), "-%s-reloads_moved", chordal_env->cls->name);
+       ir_snprintf(dump_suffix3, sizeof(dump_suffix3), "-%s-reloads_moved", chordal_env->cls->name);
 
        FIRM_DBG_REGISTER(si.dbg, "firm.be.ra.spillremat");
        DBG((si.dbg, LEVEL_1, "\n\n\t\t===== Processing %s =====\n\n", problem_name));
@@ -2793,8 +2984,12 @@ be_spill_remat(const be_chordal_env_t * chordal_env)
        si.lpp = new_lpp(problem_name, lpp_minimize);
        si.remat_info = new_set(cmp_remat_info, 4096);
        si.all_possible_remats = pset_new_ptr_default();
+       si.spills = pset_new_ptr_default();
+       si.inverse_ops = pset_new_ptr_default();
 #ifndef EXECFREQ_LOOPDEPH
        si.execfreqs = compute_execfreq(chordal_env->irg);
+#else
+       si.execfreqs = NULL;
 #endif
 #ifdef KEEPALIVE
        si.keep = NULL;
@@ -2805,7 +3000,6 @@ be_spill_remat(const be_chordal_env_t * chordal_env)
        compute_doms(chordal_env->irg);
 
 #ifdef COLLECT_REMATS
-//     irg_block_walk_graph(chordal_env->irg, process_block, NULL, &si);
        /* collect remats */
        DBG((si.dbg, LEVEL_1, "Collecting remats\n"));
        irg_walk_graph(chordal_env->irg, walker_remat_collector, NULL, &si);
@@ -2852,7 +3046,9 @@ be_spill_remat(const be_chordal_env_t * chordal_env)
 
 #ifdef SOLVE
        DBG((si.dbg, LEVEL_1, "\tSolving %F\n", chordal_env->irg));
+#ifdef ILP_TIMEOUT
        lpp_set_time_limit(si.lpp, ILP_TIMEOUT);
+#endif
 
 #ifdef SOLVE_LOCAL
        lpp_solve_cplex(si.lpp);
@@ -2862,7 +3058,7 @@ be_spill_remat(const be_chordal_env_t * chordal_env)
        assert(lpp_is_sol_valid(si.lpp)
               && "solution of ILP must be valid");
 
-       DBG((si.dbg, LEVEL_1, "\t%s: iterations: %d, solution time: %g, objective function: %g\n", problem_name, si.lpp->iterations, si.lpp->sol_time, si.lpp->objval));
+       DBG((si.dbg, LEVEL_1, "\t%s: iterations: %d, solution time: %g, objective function: %g\n", problem_name, si.lpp->iterations, si.lpp->sol_time, is_zero(si.lpp->objval)?0.0:si.lpp->objval));
 
 #ifdef DUMP_SOLUTION
        {
@@ -2887,6 +3083,10 @@ be_spill_remat(const be_chordal_env_t * chordal_env)
 
        kill_all_unused_values_in_schedule(&si);
 
+#if defined(KEEPALIVE_SPILLS) || defined(KEEPALIVE_RELOADS)
+       be_dump(chordal_env->irg, "-spills-placed", dump_ir_block_graph);
+#endif
+
        be_liveness(chordal_env->irg);
        irg_block_walk_graph(chordal_env->irg, walker_pressure_annotator, NULL, &si);
 
@@ -2902,9 +3102,11 @@ be_spill_remat(const be_chordal_env_t * chordal_env)
        dump_pressure_graph(&si, dump_suffix3);
 
        free_dom(chordal_env->irg);
+       del_pset(si.inverse_ops);
        del_pset(si.all_possible_remats);
+       del_pset(si.spills);
 #ifndef EXECFREQ_LOOPDEPH
-       del_set(si.execfreqs);
+       free_execfreq(si.execfreqs);
 #endif
        free_lpp(si.lpp);
        obstack_free(&obst, NULL);