Make Ld/St parallelisation work (but seems to be broken with bit fields).
[libfirm] / ir / opt / ldst2.c
1 /*
2  * Copyright (C) 1995-2008 University of Karlsruhe.  All right reserved.
3  *
4  * This file is part of libFirm.
5  *
6  * This file may be distributed and/or modified under the terms of the
7  * GNU General Public License version 2 as published by the Free Software
8  * Foundation and appearing in the file LICENSE.GPL included in the
9  * packaging of this file.
10  *
11  * Licensees holding valid libFirm Professional Edition licenses may use
12  * this file in accordance with the libFirm Commercial License.
13  * Agreement provided with the Software.
14  *
15  * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
16  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE.
18  */
19
20 /**
21  * @file
22  * @brief   parallelizing Load/Store optimisation
23  * @author  Christoph Mallon
24  * @version $Id: $
25  */
26 #ifdef HAVE_CONFIG_H
27 #include "config.h"
28 #endif
29
30 #include "iroptimize.h"
31
32 #include "array.h"
33 #include "debug.h"
34 #include "ircons.h"
35 #include "irgraph.h"
36 #include "irgmod.h"
37 #include "irgopt.h"
38 #include "irgwalk.h"
39 #include "irmemory.h"
40 #include "irnode.h"
41 #include "irnodeset.h"
42 #include "obst.h"
43 #include "irdump.h"
44 #include "irflag_t.h"
45
/* NOTE: "+0" evaluates to 0, so everything from here up to the matching
 * #endif (which follows Detotalise) is excluded from compilation. */
#if +0
/* When defined, PlaceLoad compiles in its #ifdef'ed Load-after-Load shortcut. */
#define OPTIMISE_LOAD_AFTER_LOAD


/* Marks unhandled cases by aborting. The expansion already contains the
 * terminating ';', so it is used without one (see WalkMem). */
#define UNIMPLEMENTED abort();


/* Debug module handle passed to the DB(()) calls in this file. */
DEBUG_ONLY(static firm_dbg_module_t *dbg);


/* Obstack backing the NEW_ARR_D allocations (addrs and the per-block sets). */
static struct obstack obst;
/* Number of unique Load/Store addresses; set by CollectAddresses. */
static size_t count_addrs;
/* The unique address nodes; an address' index is also stored in its link
 * field by CollectAddresses. */
static ir_node** addrs;
59
60
61 static void AddressCollector(ir_node* node, void* env)
62 {
63         ir_nodeset_t* addrs_set = env;
64         ir_node* addr;
65         if (is_Load(node)) {
66                 addr = get_Load_ptr(node);
67         } else if (is_Store(node)) {
68                 addr = get_Store_ptr(node);
69         } else {
70                 return;
71         }
72         ir_nodeset_insert(addrs_set, addr);
73 }
74
75
76 /* Collects all unique addresses used by load and store nodes of a graph and
77  * puts them into an array for later use */
78 static void CollectAddresses(ir_graph* irg)
79 {
80         ir_nodeset_t addrs_set;
81
82         ir_nodeset_init(&addrs_set);
83         irg_walk_graph(irg, AddressCollector, NULL, &addrs_set);
84
85         count_addrs = ir_nodeset_size(&addrs_set);
86         DB((dbg, LEVEL_1, "===> %+F uses %u unique addresses\n", irg, (unsigned int)count_addrs));
87         if (count_addrs != 0) {
88                 ir_nodeset_iterator_t addr_iter;
89                 size_t i;
90
91                 addrs = NEW_ARR_D(ir_node*, &obst, count_addrs);
92                 ir_nodeset_iterator_init(&addr_iter, &addrs_set);
93                 for (i = 0; i < count_addrs; i++) {
94                         ir_node* addr = ir_nodeset_iterator_next(&addr_iter);
95                         assert(addr != NULL);
96                         set_irn_link(addr, (void *)i);
97                         addrs[i] = addr;
98                         DB((dbg, LEVEL_2, "===> Collected unique symbolic address %+F\n", addr));
99                 }
100         }
101 }
102
103
104 static void AliasSetAdder(ir_node* block, void* env)
105 {
106         ir_nodeset_t* alias_set;
107         size_t i;
108         (void) env;
109
110         alias_set = NEW_ARR_D(ir_nodeset_t, &obst, count_addrs);
111         for (i = 0; i < count_addrs; i++) {
112                 ir_nodeset_init(&alias_set[i]);
113         }
114         set_irn_link(block, alias_set);
115 }
116
117
118 static void SetStartAddressesTop(ir_graph* irg)
119 {
120         ir_node* initial_mem;
121         ir_node* start_block;
122         ir_nodeset_t* start_addrs;
123         size_t i;
124
125         initial_mem = get_irg_initial_mem(irg);
126         start_block = get_irg_start_block(irg);
127         start_addrs = get_irn_link(start_block);
128         for (i = 0; i < count_addrs; i++) {
129                 ir_nodeset_insert(&start_addrs[i], initial_mem);
130         }
131         mark_Block_block_visited(start_block);
132 }
133
134
135 static void AliasSetDestroyer(ir_node* block, void* env)
136 {
137         ir_nodeset_t* alias_set = get_irn_link(block);
138         size_t i;
139         (void) env;
140
141         for (i = 0; i < count_addrs; i++) {
142                 ir_nodeset_destroy(&alias_set[i]);
143         }
144 }
145
146
147 static ir_alias_relation AliasTest(ir_graph* irg, ir_node* addr, ir_mode* mode, ir_node* other)
148 {
149         ir_node* other_addr;
150         ir_mode* other_mode;
151
152         if (is_Proj(other)) other = get_Proj_pred(other);
153
154         if (is_Load(other)) {
155                 other_addr = get_Load_ptr(other);
156         } else if (is_Store(other)) {
157                 other_addr = get_Store_ptr(other);
158         } else {
159                 return may_alias;
160         }
161
162         other_mode = get_irn_mode(other);
163         return get_alias_relation(irg, addr, mode, other_addr, other_mode);
164 }
165
166
167 static ir_node* GenerateSync(ir_graph* irg, ir_node* block, ir_nodeset_t* after_set)
168 {
169         size_t set_size = ir_nodeset_size(after_set);
170         ir_nodeset_iterator_t iter;
171
172         assert(set_size != 0);
173
174         ir_nodeset_iterator_init(&iter, after_set);
175         if (set_size == 1) {
176                 return ir_nodeset_iterator_next(&iter);
177         } else {
178                 ir_node** in;
179                 size_t i;
180
181                 NEW_ARR_A(ir_node*, in, set_size);
182                 for (i = 0; i < set_size; i++) {
183                         in[i] = ir_nodeset_iterator_next(&iter);
184                 }
185                 return new_r_Sync(irg, block, set_size, in);
186         }
187 }
188
189
190 static ir_node** unfinished_phis;
191
192
193 static void PlaceMemPhis(ir_graph* irg, ir_node* block, ir_node* phi)
194 {
195         int unfinished = 0;
196         size_t block_n_preds = get_Block_n_cfgpreds(block);
197         ir_nodeset_t* thissets;
198         ir_node** in;
199         size_t i;
200         size_t j;
201
202         thissets = get_irn_link(block);
203         NEW_ARR_A(ir_node*, in, block_n_preds);
204         for (j = 0; j < count_addrs; j++) {
205                 ir_node* new_phi;
206
207                 for (i = 0; i < block_n_preds; i++) {
208                         ir_node* pred_block = get_nodes_block(get_Phi_pred(phi, i)); // TODO get_Block_cfgpred_block(block, i);
209                         ir_nodeset_t* predsets = get_irn_link(pred_block);
210                         size_t predset_size = ir_nodeset_size(&predsets[j]);
211
212                         if (predset_size == 0) {
213                                 in[i] = new_r_Unknown(irg, mode_M);
214                                 unfinished = 1;
215                         } else {
216                                 in[i] = GenerateSync(irg, pred_block, &predsets[j]);
217                         }
218                 }
219                 new_phi = new_r_Phi(irg, block, block_n_preds, in, mode_M);
220                 if (unfinished) {
221                         set_irn_link(new_phi, unfinished_phis[j]);
222                         unfinished_phis[j] = new_phi;
223                 }
224                 ir_nodeset_insert(&thissets[j], new_phi);
225         }
226 }
227
228
229 static int WalkMem(ir_graph* irg, ir_node* node, ir_node* last_block);
230
231
232 static void WalkMemPhi(ir_graph* irg, ir_node* block, ir_node* phi)
233 {
234         size_t n = get_Phi_n_preds(phi);
235         size_t i;
236
237         for (i = 0; i < n; i++) {
238                 WalkMem(irg, get_Phi_pred(phi, i), block);
239         }
240
241         PlaceMemPhis(irg, block, phi);
242         exchange(phi, new_Bad());
243 }
244
245
246 static void PlaceLoad(ir_graph* irg, ir_node* block, ir_node* load, ir_node* memory)
247 {
248         ir_node* addr = get_Load_ptr(load);
249         size_t addr_idx = (size_t)get_irn_link(addr);
250         ir_nodeset_t* interfere_sets = get_irn_link(block);
251         ir_nodeset_t* interfere_set = &interfere_sets[addr_idx];
252         size_t size = ir_nodeset_size(interfere_set);
253         ir_nodeset_iterator_t interfere_iter;
254         size_t i;
255
256         assert(size > 0);
257         ir_nodeset_iterator_init(&interfere_iter, interfere_set);
258         if (size == 1) {
259                 ir_node* after = ir_nodeset_iterator_next(&interfere_iter);
260                 assert(!is_Proj(after) || !is_Load(get_Proj_pred(after)));
261                 DB((dbg, LEVEL_3, "===> %+F must be executed after %+F\n", load, after));
262                 set_Load_mem(load, after);
263         } else {
264                 ir_node** after_set;
265                 ir_node* after;
266                 ir_node* mem;
267                 size_t i;
268
269                 NEW_ARR_A(ir_node*, after_set, size);
270                 i = 0;
271                 while ((mem = ir_nodeset_iterator_next(&interfere_iter)) != NULL) {
272                         if (is_Proj(mem)) {
273                                 ir_node* pred = get_Proj_pred(mem);
274                                 if (is_Load(pred)) {
275 #ifdef OPTIMISE_LOAD_AFTER_LOAD
276                                         if (get_Load_ptr(pred) == addr && get_Load_mode(pred) == get_Load_mode(load)) {
277                                                 exchange(load, pred);
278                                                 return;
279                                         }
280 #endif
281                                         continue;
282                                 }
283                         }
284                         DB((dbg, LEVEL_3, "===> %+F must be executed after %+F\n", load, mem));
285                         after_set[i++] = mem;
286                 }
287                 assert(i != 0);
288                 if (i == 1) {
289                         after = after_set[0];
290                 } else {
291                         after = new_r_Sync(irg, block, i, after_set);
292                 }
293                 set_Load_mem(load, after);
294         }
295
296         for (i = 0; i < count_addrs; i++) {
297                 ir_mode* mode = get_Load_mode(load);
298                 ir_node* other_addr = addrs[i];
299                 ir_mode* other_mode = mode; // XXX second mode is nonsense
300                 ir_alias_relation rel = get_alias_relation(irg, addr, mode, other_addr, other_mode);
301
302                 DB((dbg, LEVEL_3, "===> Testing for alias between %+F and %+F. Relation is %d\n", addr, other_addr, rel));
303                 if (rel == no_alias) {
304                         continue;
305                 }
306                 DB((dbg, LEVEL_3, "===> %+F potentially aliases address %+F\n", load, other_addr));
307
308                 ir_nodeset_insert(&interfere_sets[i], memory);
309         }
310 }
311
312
313 static void PlaceStore(ir_graph* irg, ir_node* block, ir_node* store, ir_node* memory)
314 {
315         ir_node* addr = get_Store_ptr(store);
316         size_t addr_idx = (size_t)get_irn_link(addr);
317         ir_nodeset_t* interfere_sets = get_irn_link(block);
318         ir_nodeset_t* interfere_set = &interfere_sets[addr_idx];
319         ir_node* after;
320         size_t i;
321
322         after = GenerateSync(irg, block, interfere_set);
323         set_Store_mem(store, after);
324
325         for (i = 0; i < count_addrs; i++) {
326                 ir_nodeset_iterator_t interfere_iter;
327                 ir_mode* mode = get_irn_mode(get_Store_value(store));
328                 ir_node* other_addr = addrs[i];
329                 ir_mode* other_mode = mode; // XXX second mode is nonsense
330                 ir_alias_relation rel = get_alias_relation(irg, addr, mode, other_addr, other_mode);
331                 ir_node* other_node;
332
333                 DB((dbg, LEVEL_3, "===> Testing for alias between %+F and %+F. Relation is %d\n", addr, other_addr, rel));
334                 if (rel == no_alias) {
335                         continue;
336                 }
337                 DB((dbg, LEVEL_3, "===> %+F potentially aliases address %+F\n", store, other_addr));
338
339                 ir_nodeset_iterator_init(&interfere_iter, &interfere_sets[i]);
340                 while ((other_node = ir_nodeset_iterator_next(&interfere_iter)) != NULL) {
341                         if (AliasTest(irg, addr, mode, other_node) != no_alias) {
342                                 DB((dbg, LEVEL_3, "===> Removing %+F from execute-after set of %+F due to %+F\n", other_node, addrs[i], store));
343                                 ir_nodeset_remove_iterator(&interfere_sets[i], &interfere_iter);
344                         }
345                 }
346
347                 ir_nodeset_insert(&interfere_sets[i], memory);
348         }
349 }
350
351
352 static int WalkMem(ir_graph* irg, ir_node* node, ir_node* last_block)
353 {
354         int block_change = 0;
355         ir_node* block = get_nodes_block(node);
356         ir_node* pred;
357         ir_node* memory = node;
358         ir_nodeset_t* addr_sets;
359
360         if (block != last_block) {
361                 DB((dbg, LEVEL_3, "===> Changing block from %+F to %+F\n", last_block, block));
362                 block_change = 1;
363                 if (Block_not_block_visited(block)) {
364                         mark_Block_block_visited(block);
365                 } else {
366                         DB((dbg, LEVEL_2, "===> Hit already visited block at %+F\n", node));
367                         return block_change;
368                 }
369         }
370
371         // Skip projs
372         if (is_Proj(node)) node = get_Proj_pred(node);
373
374         if (is_Phi(node)) {
375                 WalkMemPhi(irg, block, node);
376                 return block_change;
377         } else if (is_Sync(node)) {
378                 UNIMPLEMENTED
379         } else if (is_Return(node)) {
380                 pred = get_Return_mem(node);
381         } else {
382                 pred = get_fragile_op_mem(node);
383         }
384
385         if (WalkMem(irg, pred, block)) {
386                 // There was a block change
387                 size_t block_arity = get_Block_n_cfgpreds(block);
388
389                 DB((dbg, LEVEL_3, "===> There is a block change before %+F\n", node));
390                 if (block_arity == 1) {
391                         // Just one predecessor, inherit its alias sets
392                         ir_node* pred_block = get_nodes_block(pred);
393                         ir_nodeset_t* predsets = get_irn_link(pred_block);
394                         ir_nodeset_t* thissets = get_irn_link(block);
395                         size_t i;
396
397                         DB((dbg, LEVEL_3, "===> Copying the only predecessor's address sets\n"));
398
399                         if (ir_nodeset_size(&predsets[0]) == 0) {
400                                 ir_node* unknown;
401
402                                 DB((dbg, LEVEL_3, "===> The predecessor was not finished yet\n"));
403                                 assert(!Block_not_block_visited(pred_block));
404
405                                 unknown = new_r_Unknown(irg, mode_M);
406                                 for (i = 0; i < count_addrs; i++) {
407                                         ir_node* phi_unk = new_r_Phi(irg, block, 1, &unknown, mode_M);
408                                         DB((dbg, LEVEL_3, "===> Placing unfinished %+F for %+F in %+F\n", phi_unk, addrs[i], block));
409                                         set_irn_link(phi_unk, unfinished_phis[i]);
410                                         unfinished_phis[i] = phi_unk;
411                                         ir_nodeset_insert(&thissets[i], phi_unk);
412                                 }
413                         } else {
414                                 for (i = 0; i < count_addrs; i++) {
415                                         ir_nodeset_iterator_t prediter;
416                                         ir_node* addr;
417
418                                         ir_nodeset_iterator_init(&prediter, &predsets[i]);
419                                         while ((addr = ir_nodeset_iterator_next(&prediter)) != NULL) {
420                                                 ir_nodeset_insert(&thissets[i], addr);
421                                         }
422                                 }
423                         }
424                 }
425         }
426
427         DB((dbg, LEVEL_3, "===> Detotalising %+F\n", node));
428
429         addr_sets = get_irn_link(block);
430
431         if (is_Load(node)) {
432                 PlaceLoad(irg, block, node, memory);
433         } else if (is_Store(node)) {
434                 PlaceStore(irg, block, node, memory);
435         } else {
436                 ir_nodeset_t sync_set;
437                 size_t i;
438                 ir_node* after;
439
440                 DB((dbg, LEVEL_3, "===> Fallback: %+F aliases everything\n", node));
441
442                 ir_nodeset_init(&sync_set);
443                 for (i = 0; i < count_addrs; i++) {
444                         ir_nodeset_iterator_t iter;
445                         ir_node* mem;
446
447                         ir_nodeset_iterator_init(&iter, &addr_sets[i]);
448                         while ((mem = ir_nodeset_iterator_next(&iter)) != NULL) {
449                                 ir_nodeset_insert(&sync_set, mem);
450                         }
451                 }
452
453                 after = GenerateSync(irg, block, &sync_set);
454                 set_irn_n(node, 0, after); // XXX unnice way to set the memory input
455
456                 for (i = 0; i < count_addrs; i++) {
457                         ir_nodeset_iterator_t iter;
458                         ir_nodeset_iterator_init(&iter, &addr_sets[i]);
459                         while (ir_nodeset_iterator_next(&iter) != NULL) {
460                                 ir_nodeset_remove_iterator(&addr_sets[i], &iter);
461                         }
462                         ir_nodeset_insert(&addr_sets[i], memory);
463                 }
464         }
465
466         return block_change;
467 }
468
469
470 static void FinalisePhis(ir_graph* irg)
471 {
472         size_t i;
473
474         for (i = 0; i < count_addrs; i++) {
475                 ir_node* next_phi;
476                 ir_node* phi;
477
478                 for (phi = unfinished_phis[i]; phi != NULL; phi = next_phi) {
479                         ir_node* block = get_nodes_block(phi);
480                         size_t block_n_preds = get_Block_n_cfgpreds(block);
481
482                         next_phi = get_irn_link(phi);
483
484                         DB((dbg, LEVEL_4, "===> Finialising phi %+F in %+F\n", phi, block));
485
486                         if (block_n_preds == 1) {
487                                 ir_node* pred_block = get_Block_cfgpred_block(block, 0);
488                                 ir_nodeset_t* pred_sets = get_irn_link(pred_block);
489                                 ir_node* after = GenerateSync(irg, pred_block, &pred_sets[i]);
490
491                                 assert(is_Unknown(get_Phi_pred(phi, 0)));
492                                 exchange(phi, after);
493                         } else {
494                                 ir_node** in;
495                                 size_t j;
496
497                                 NEW_ARR_A(ir_node*, in, block_n_preds);
498                                 for (j = 0; j < block_n_preds; j++) {
499                                         ir_node* pred_block = get_Block_cfgpred_block(block, j);
500                                         ir_nodeset_t* pred_sets = get_irn_link(pred_block);
501
502                                         if (is_Unknown(get_Phi_pred(phi, j))) {
503                                                 set_Phi_pred(phi, j, GenerateSync(irg, pred_block, &pred_sets[i]));
504                                         }
505                                 }
506                         }
507                 }
508         }
509 }
510
511
512 static void Detotalise(ir_graph* irg)
513 {
514         ir_node* end_block = get_irg_end_block(irg);
515         size_t npreds = get_Block_n_cfgpreds(end_block);
516         size_t i;
517
518         unfinished_phis = xmalloc(sizeof(*unfinished_phis) * count_addrs);
519         for (i = 0; i < count_addrs; i++) {
520                 unfinished_phis[i] = NULL;
521         }
522
523         for (i = 0; i < npreds; i++) {
524                 ir_node* pred = get_Block_cfgpred(end_block, i);
525                 assert(is_Return(pred));
526                 DB((dbg, LEVEL_2, "===> Starting memory walk at %+F\n", pred));
527                 WalkMem(irg, pred, NULL);
528         }
529
530         FinalisePhis(irg);
531         xfree(unfinished_phis);
532 }
533 #endif
534
535
536 static void AddSyncPreds(ir_nodeset_t* preds, ir_node* sync)
537 {
538         size_t n = get_Sync_n_preds(sync);
539         size_t i;
540
541         for (i = 0; i < n; i++) {
542                 ir_node* pred = get_Sync_pred(sync, i);
543                 if (is_Sync(pred)) {
544                         AddSyncPreds(preds, pred);
545                 } else {
546                         ir_nodeset_insert(preds, pred);
547                 }
548         }
549 }
550
551
552 static void NormaliseSync(ir_node* node, void* env)
553 {
554         ir_nodeset_t preds;
555         ir_nodeset_iterator_t iter;
556         ir_node** in;
557         size_t count_preds;
558         size_t i;
559         (void) env;
560
561         if (!is_Sync(node)) return;
562
563         ir_nodeset_init(&preds);
564         AddSyncPreds(&preds, node);
565
566         count_preds = ir_nodeset_size(&preds);
567         if (count_preds != (unsigned)get_Sync_n_preds(node)) {
568                 NEW_ARR_A(ir_node*, in, count_preds);
569                 ir_nodeset_iterator_init(&iter, &preds);
570                 for (i = 0; i < count_preds; i++) {
571                         ir_node* pred = ir_nodeset_iterator_next(&iter);
572                         assert(pred != NULL);
573                         in[i] = pred;
574                 }
575                 set_irn_in(node, count_preds, in);
576         }
577
578         ir_nodeset_destroy(&preds);
579 }
580
581
582 #if 0
583 void opt_ldst2(ir_graph* irg)
584 {
585         FIRM_DBG_REGISTER(dbg, "firm.opt.ldst2");
586         DB((dbg, LEVEL_1, "===> Performing load/store optimisation on %+F\n", irg));
587
588         normalize_one_return(irg);
589         dump_ir_block_graph(irg, "-prefluffig");
590
591         obstack_init(&obst);
592
593         if (1 /* XXX */ || get_opt_alias_analysis()) {
594                 assure_irg_address_taken_computed(irg);
595                 assure_irp_globals_address_taken_computed();
596         }
597
598
599         CollectAddresses(irg);
600         if (count_addrs == 0) return;
601
602         irg_block_walk_graph(irg, AliasSetAdder, NULL, NULL);
603         inc_irg_block_visited(irg);
604         SetStartAddressesTop(irg);
605         Detotalise(irg);
606         dump_ir_block_graph(irg, "-fluffig");
607
608         irg_block_walk_graph(irg, AliasSetDestroyer, NULL, NULL);
609         obstack_free(&obst, NULL);
610
611         normalize_proj_nodes(irg);
612         irg_walk_graph(irg, NormaliseSync, NULL, NULL);
613   optimize_graph_df(irg);
614         irg_walk_graph(irg, NormaliseSync, NULL, NULL);
615         dump_ir_block_graph(irg, "-postfluffig");
616 }
617 #endif
618
619
620 typedef struct parallelise_info
621 {
622         ir_node      *origin_block;
623         ir_node      *origin_ptr;
624         ir_mode      *origin_mode;
625         ir_nodeset_t  this_mem;
626         ir_nodeset_t  user_mem;
627 } parallelise_info;
628
629
630 static void parallelise_load(parallelise_info *pi, ir_node *irn)
631 {
632         //ir_fprintf(stderr, "considering %+F\n", irn);
633         if (get_nodes_block(irn) == pi->origin_block) {
634                 if (is_Proj(irn)) {
635                         ir_node *pred = get_Proj_pred(irn);
636                         if (is_Load(pred) &&
637                                         get_Load_volatility(pred) == volatility_non_volatile) {
638                                 ir_node *mem = get_Load_mem(pred);
639                                 //ir_nodeset_insert(&pi->this_mem, mem);
640                                 ir_nodeset_insert(&pi->user_mem, irn);
641                                 //ir_fprintf(stderr, "adding %+F to user set\n", irn);
642                                 parallelise_load(pi, mem);
643                                 return;
644                         } else if (is_Store(pred) &&
645                                         get_Store_volatility(pred) == volatility_non_volatile) {
646                                 ir_mode *org_mode   = pi->origin_mode;
647                                 ir_node *org_ptr    = pi->origin_ptr;
648                                 ir_mode *store_mode = get_irn_mode(get_Store_value(pred));
649                                 ir_node *store_ptr  = get_Store_ptr(pred);
650                                 if (get_alias_relation(current_ir_graph, org_ptr, org_mode, store_ptr, store_mode) == no_alias) {
651                                         ir_node *mem = get_Store_mem(pred);
652                                         ir_fprintf(stderr, "Ld after St: %+F (%+F) does not alias %+F (%+F)\n", org_ptr, org_mode, store_ptr, store_mode);
653                                         ir_nodeset_insert(&pi->user_mem, irn);
654                                         //ir_fprintf(stderr, "adding %+F to user set\n", irn);
655                                         parallelise_load(pi, mem);
656                                         return;
657                                 }
658                         }
659                 } else if (is_Sync(irn)) {
660                         int n = get_Sync_n_preds(irn);
661                         int i;
662
663                         for (i = 0; i < n; ++i) {
664                                 ir_node *sync_pred = get_Sync_pred(irn, i);
665                                 parallelise_load(pi, sync_pred);
666                         }
667                         return;
668                 }
669         }
670         ir_nodeset_insert(&pi->this_mem, irn);
671         //ir_fprintf(stderr, "adding %+F to this set\n", irn);
672 }
673
674
675 static void parallelise_store(parallelise_info *pi, ir_node *irn)
676 {
677         //ir_fprintf(stderr, "considering %+F\n", irn);
678         if (get_nodes_block(irn) == pi->origin_block) {
679                 if (is_Proj(irn)) {
680                         ir_node *pred = get_Proj_pred(irn);
681                         if (is_Load(pred) &&
682                                         get_Load_volatility(pred) == volatility_non_volatile) {
683                                 ir_mode *org_mode  = pi->origin_mode;
684                                 ir_node *org_ptr   = pi->origin_ptr;
685                                 ir_mode *load_mode = get_Load_mode(pred);
686                                 ir_node *load_ptr  = get_Load_ptr(pred);
687                                 if (get_alias_relation(current_ir_graph, org_ptr, org_mode, load_ptr, load_mode) == no_alias) {
688                                         ir_node *mem = get_Load_mem(pred);
689                                         ir_fprintf(stderr, "St after Ld: %+F (%+F) does not alias %+F (%+F)\n", org_ptr, org_mode, load_ptr, load_mode);
690                                         ir_nodeset_insert(&pi->user_mem, irn);
691                                         //ir_fprintf(stderr, "adding %+F to user set\n", irn);
692                                         parallelise_store(pi, mem);
693                                         return;
694                                 }
695                         } else if (is_Store(pred) &&
696                                         get_Store_volatility(pred) == volatility_non_volatile) {
697                                 ir_mode *org_mode   = pi->origin_mode;
698                                 ir_node *org_ptr    = pi->origin_ptr;
699                                 ir_mode *store_mode = get_irn_mode(get_Store_value(pred));
700                                 ir_node *store_ptr  = get_Store_ptr(pred);
701                                 if (get_alias_relation(current_ir_graph, org_ptr, org_mode, store_ptr, store_mode) == no_alias) {
702                                         ir_fprintf(stderr, "St after St: %+F (%+F) does not alias %+F (%+F)\n", org_ptr, org_mode, store_ptr, store_mode);
703                                         ir_node *mem = get_Store_mem(pred);
704                                         ir_nodeset_insert(&pi->user_mem, irn);
705                                         //ir_fprintf(stderr, "adding %+F to user set\n", irn);
706                                         parallelise_store(pi, mem);
707                                         return;
708                                 }
709                         }
710                 } else if (is_Sync(irn)) {
711                         int n = get_Sync_n_preds(irn);
712                         int i;
713
714                         for (i = 0; i < n; ++i) {
715                                 ir_node *sync_pred = get_Sync_pred(irn, i);
716                                 parallelise_store(pi, sync_pred);
717                         }
718                         return;
719                 }
720         }
721         ir_nodeset_insert(&pi->this_mem, irn);
722         //ir_fprintf(stderr, "adding %+F to this set\n", irn);
723 }
724
725
726 static void walker(ir_node *proj, void *env)
727 {
728         ir_node          *mem_op;
729         ir_node          *pred;
730         ir_node          *block;
731         int               n;
732         parallelise_info  pi;
733
734         (void)env;
735
736         if (!is_Proj(proj)) return;
737         if (get_irn_mode(proj) != mode_M) return;
738
739         mem_op = get_Proj_pred(proj);
740         if (is_Load(mem_op)) {
741                 if (get_Load_volatility(mem_op) != volatility_non_volatile) return;
742
743                 block = get_nodes_block(mem_op);
744                 pred  = get_Load_mem(mem_op);
745                 //ir_fprintf(stderr, "starting parallelise at %+F for %+F\n", pred, proj);
746
747                 pi.origin_block = block,
748                 pi.origin_ptr   = get_Load_ptr(mem_op);
749                 pi.origin_mode  = get_Load_mode(mem_op);
750                 ir_nodeset_init(&pi.this_mem);
751                 ir_nodeset_init(&pi.user_mem);
752
753                 parallelise_load(&pi, pred);
754         } else if (is_Store(mem_op)) {
755                 if (get_Store_volatility(mem_op) != volatility_non_volatile) return;
756
757                 block = get_nodes_block(mem_op);
758                 pred  = get_Store_mem(mem_op);
759                 //ir_fprintf(stderr, "starting parallelise at %+F for %+F\n", pred, proj);
760
761                 pi.origin_block = block,
762                 pi.origin_ptr   = get_Store_ptr(mem_op);
763                 pi.origin_mode  = get_irn_mode(get_Store_value(mem_op));
764                 ir_nodeset_init(&pi.this_mem);
765                 ir_nodeset_init(&pi.user_mem);
766
767                 parallelise_store(&pi, pred);
768         } else {
769                 return;
770         }
771
772         n = ir_nodeset_size(&pi.user_mem);
773         if (n != 0) { /* nothing happend otherwise */
774                 ir_graph               *irg  = current_ir_graph;
775                 ir_node                *sync;
776                 ir_node               **in;
777                 ir_nodeset_iterator_t   iter;
778                 int                     i;
779
780                 ++n;
781                 //ir_fprintf(stderr, "creating sync for users of %+F with %d inputs\n", proj, n);
782                 NEW_ARR_A(ir_node*, in, n);
783                 i = 0;
784                 in[i++] = new_r_Unknown(irg, mode_M);
785                 ir_nodeset_iterator_init(&iter, &pi.user_mem);
786                 for (;;) {
787                         ir_node* p = ir_nodeset_iterator_next(&iter);
788                         if (p == NULL) break;
789                         in[i++] = p;
790                 }
791                 assert(i == n);
792                 sync = new_r_Sync(irg, block, n, in);
793                 exchange(proj, sync);
794
795                 assert(pn_Load_M == pn_Store_M);
796                 proj = new_r_Proj(irg, block, mem_op, mode_M, pn_Load_M);
797                 set_Sync_pred(sync, 0, proj);
798
799                 n = ir_nodeset_size(&pi.this_mem);
800                 //ir_fprintf(stderr, "creating sync for %+F with %d inputs\n", mem_op, n);
801                 ir_nodeset_iterator_init(&iter, &pi.this_mem);
802                 if (n == 1) {
803                         sync = ir_nodeset_iterator_next(&iter);
804                 } else {
805                         NEW_ARR_A(ir_node*, in, n);
806                         i = 0;
807                         for (;;) {
808                                 ir_node* p = ir_nodeset_iterator_next(&iter);
809                                 if (p == NULL) break;
810                                 in[i++] = p;
811                         }
812                         assert(i == n);
813                         sync = new_r_Sync(irg, block, n, in);
814                 }
815                 set_memop_mem(mem_op, sync);
816         }
817
818         ir_nodeset_destroy(&pi.this_mem);
819         ir_nodeset_destroy(&pi.user_mem);
820 }
821
822
823 void opt_ldst2(ir_graph *irg)
824 {
825         assure_irg_address_taken_computed(irg);
826         assure_irp_globals_address_taken_computed();
827
828         irg_walk_graph(irg, NULL, walker, NULL);
829   //optimize_graph_df(irg);
830         //irg_walk_graph(irg, NormaliseSync, NULL, NULL);
831 }