X-Git-Url: http://nsz.repo.hu/git/?a=blobdiff_plain;f=ir%2Fbe%2Fia32%2Fia32_architecture.c;h=d9a81bd51e7fc82365561cef5b9ceac89143b009;hb=2cc8c3b3bb545e020df93fa6bbc880974d66c20b;hp=44b6e924a2c6e828d985067e342ff6334f48ce67;hpb=52bdfefef3eb404178d82dd8946f2c7a5cdc7665;p=libfirm diff --git a/ir/be/ia32/ia32_architecture.c b/ir/be/ia32/ia32_architecture.c index 44b6e924a..d9a81bd51 100644 --- a/ir/be/ia32/ia32_architecture.c +++ b/ir/be/ia32/ia32_architecture.c @@ -41,7 +41,7 @@ ia32_code_gen_config_t ia32_cg_config; * CPU architectures and features. */ enum cpu_arch_features { - arch_generic = 0x00000001, /**< no specific architecture */ + arch_generic32 = 0x00000001, /**< no specific architecture */ arch_i386 = 0x00000002, /**< i386 architecture */ arch_i486 = 0x00000004, /**< i486 architecture */ @@ -63,15 +63,25 @@ enum cpu_arch_features { arch_athlon_plus = arch_athlon | arch_k8 | arch_k10, arch_all_amd = arch_k6 | arch_geode | arch_athlon_plus, - arch_feature_mmx = 0x00004000, /**< MMX instructions */ - arch_feature_p6_insn = 0x00008000, /**< PentiumPro instructions */ - arch_feature_sse1 = 0x00010000 | arch_feature_mmx, /**< SSE1 instructions, include MMX */ - arch_feature_sse2 = 0x00020000 | arch_feature_sse1, /**< SSE2 instructions, include SSE1 */ - arch_feature_sse3 = 0x00040000 | arch_feature_sse2, /**< SSE3 instructions, include SSE2 */ - arch_feature_ssse3 = 0x00080000 | arch_feature_sse3, /**< SSSE3 instructions, include SSE3 */ - arch_feature_3DNow = 0x00100000, /**< 3DNow! instructions */ - arch_feature_3DNowE = 0x00200000 | arch_feature_3DNow, /**< Enhanced 3DNow! instructions */ - arch_feature_64bit = 0x00400000 | arch_feature_sse2, /**< x86_64 support, includes SSE2 */ + arch_feature_mmx = 0x00004000, /**< MMX instructions */ + arch_feature_p6_insn = 0x00008000, /**< PentiumPro instructions */ + arch_feature_sse1 = 0x00010000, /**< SSE1 instructions */ + arch_feature_sse2 = 0x00020000, /**< SSE2 instructions */ + arch_feature_sse3 = 0x00040000, /**< SSE3 instructions */ + arch_feature_ssse3 = 0x00080000, /**< SSSE3 instructions */ + arch_feature_3DNow = 0x00100000, /**< 3DNow! instructions */ + arch_feature_3DNowE = 0x00200000, /**< Enhanced 3DNow! instructions */ + arch_feature_64bit = 0x00400000, /**< x86_64 support */ + + arch_mmx_insn = arch_feature_mmx, /**< MMX instructions */ + arch_sse1_insn = arch_feature_sse1 | arch_mmx_insn, /**< SSE1 instructions, include MMX */ + arch_sse2_insn = arch_feature_sse2 | arch_sse1_insn, /**< SSE2 instructions, include SSE1 */ + arch_sse3_insn = arch_feature_sse3 | arch_sse2_insn, /**< SSE3 instructions, include SSE2 */ + arch_ssse3_insn = arch_feature_ssse3 | arch_sse3_insn, /**< SSSE3 instructions, include SSE3 */ + + arch_3DNow_insn = arch_feature_3DNow | arch_feature_mmx, /**< 3DNow! instructions, including MMX */ + arch_3DNowE_insn = arch_feature_3DNowE | arch_3DNow_insn, /**< Enhanced 3DNow! instructions */ + arch_64bit_insn = arch_feature_64bit | arch_sse2_insn, /**< x86_64 support, includes SSE2 */ }; #define FLAGS(x, f) (((x) & (f)) != 0) @@ -80,41 +90,42 @@ enum cpu_arch_features { * CPU's. 
*/ enum cpu_support { - cpu_generic = arch_generic, + cpu_generic = arch_generic32, /* intel CPU's */ cpu_i386 = arch_i386, cpu_i486 = arch_i486, cpu_pentium = arch_pentium, - cpu_pentium_mmx = arch_pentium | arch_feature_mmx, + cpu_pentium_mmx = arch_pentium | arch_mmx_insn, cpu_pentium_pro = arch_ppro | arch_feature_p6_insn, - cpu_pentium_2 = arch_ppro | arch_feature_p6_insn | arch_feature_mmx, - cpu_pentium_3 = arch_ppro | arch_feature_p6_insn | arch_feature_sse1, - cpu_pentium_m = arch_ppro | arch_feature_p6_insn | arch_feature_sse2, - cpu_pentium_4 = arch_netburst | arch_feature_p6_insn | arch_feature_sse2, - cpu_prescott = arch_nocona | arch_feature_p6_insn | arch_feature_sse3, - cpu_nocona = arch_nocona | arch_feature_p6_insn | arch_feature_64bit | arch_feature_sse3, - cpu_core2 = arch_core2 | arch_feature_p6_insn | arch_feature_64bit | arch_feature_ssse3, + cpu_pentium_2 = arch_ppro | arch_feature_p6_insn | arch_mmx_insn, + cpu_pentium_3 = arch_ppro | arch_feature_p6_insn | arch_sse1_insn, + cpu_pentium_m = arch_ppro | arch_feature_p6_insn | arch_sse2_insn, + cpu_pentium_4 = arch_netburst | arch_feature_p6_insn | arch_sse2_insn, + cpu_prescott = arch_nocona | arch_feature_p6_insn | arch_sse3_insn, + cpu_nocona = arch_nocona | arch_feature_p6_insn | arch_64bit_insn | arch_sse3_insn, + cpu_core2 = arch_core2 | arch_feature_p6_insn | arch_64bit_insn | arch_ssse3_insn, /* AMD CPU's */ - cpu_k6 = arch_k6 | arch_feature_mmx, - cpu_k6_PLUS = arch_k6 | arch_feature_mmx | arch_feature_3DNow, - cpu_geode = arch_geode | arch_feature_sse1 | arch_feature_3DNowE, - cpu_athlon = arch_athlon | arch_feature_sse1 | arch_feature_3DNowE | arch_feature_p6_insn, - cpu_athlon64 = arch_athlon | arch_feature_sse2 | arch_feature_3DNowE | arch_feature_p6_insn | arch_feature_64bit, - cpu_k8 = arch_k8 | arch_feature_sse2 | arch_feature_3DNowE | arch_feature_p6_insn | arch_feature_64bit, - cpu_k8_sse3 = arch_k8 | arch_feature_sse3 | arch_feature_3DNowE | arch_feature_p6_insn | arch_feature_64bit, - cpu_k10 = arch_k10 | arch_feature_sse3 | arch_feature_3DNowE | arch_feature_p6_insn | arch_feature_64bit, + cpu_k6 = arch_k6 | arch_mmx_insn, + cpu_k6_PLUS = arch_k6 | arch_3DNow_insn, + cpu_geode = arch_geode | arch_sse1_insn | arch_3DNowE_insn, + cpu_athlon = arch_athlon | arch_sse1_insn | arch_3DNowE_insn | arch_feature_p6_insn, + cpu_athlon64 = arch_athlon | arch_sse2_insn | arch_3DNowE_insn | arch_feature_p6_insn | arch_64bit_insn, + cpu_k8 = arch_k8 | arch_3DNowE_insn | arch_feature_p6_insn | arch_64bit_insn, + cpu_k8_sse3 = arch_k8 | arch_3DNowE_insn | arch_feature_p6_insn | arch_64bit_insn | arch_sse3_insn, + cpu_k10 = arch_k10 | arch_3DNowE_insn | arch_feature_p6_insn | arch_64bit_insn | arch_sse3_insn, /* other CPU's */ cpu_winchip_c6 = arch_i486 | arch_feature_mmx, cpu_winchip2 = arch_i486 | arch_feature_mmx | arch_feature_3DNow, cpu_c3 = arch_i486 | arch_feature_mmx | arch_feature_3DNow, - cpu_c3_2 = arch_ppro | arch_feature_sse1, /* really no 3DNow! */ + cpu_c3_2 = arch_ppro | arch_sse1_insn, /* really no 3DNow! 
*/ }; +static int opt_size = 0; static cpu_support arch = cpu_generic; -static cpu_support opt_arch = cpu_core2; +static cpu_support opt_arch = cpu_generic; static int use_sse2 = 0; static int opt_cc = 1; static int opt_unsafe_floatconv = 0; @@ -191,6 +202,7 @@ static lc_opt_enum_int_var_t fp_unit_var = { }; static const lc_opt_table_entry_t ia32_architecture_options[] = { + LC_OPT_ENT_BOOL("size", "optimize for size", &opt_size), LC_OPT_ENT_ENUM_INT("arch", "select the instruction architecture", &arch_var), LC_OPT_ENT_ENUM_INT("opt", "optimize for instruction architecture", @@ -205,15 +217,28 @@ static const lc_opt_table_entry_t ia32_architecture_options[] = { }; typedef struct insn_const { - int add_cost; /**< cost of an add instruction */ - int lea_cost; /**< cost of a lea instruction */ - int const_shf_cost; /**< cost of a constant shift instruction */ - int cost_mul_start; /**< starting cost of a multiply instruction */ - int cost_mul_bit; /**< cost of multiply for every set bit */ - unsigned function_alignment; /**< logarithm for alignment of function labels */ - unsigned label_alignment; /**< logarithm for alignment of loops labels */ + int add_cost; /**< cost of an add instruction */ + int lea_cost; /**< cost of a lea instruction */ + int const_shf_cost; /**< cost of a constant shift instruction */ + int cost_mul_start; /**< starting cost of a multiply instruction */ + int cost_mul_bit; /**< cost of multiply for every set bit */ + unsigned function_alignment; /**< logarithm for alignment of function labels */ + unsigned label_alignment; /**< logarithm for alignment of loops labels */ + unsigned label_alignment_max_skip; /**< maximum skip for alignment of loops labels */ } insn_const; +/* costs for optimizing for size */ +static const insn_const size_cost = { + 2, /* cost of an add instruction */ + 3, /* cost of a lea instruction */ + 3, /* cost of a constant shift instruction */ + 4, /* starting cost of a multiply instruction */ + 0, /* cost of multiply for every set bit */ + 0, /* logarithm for alignment of function labels */ + 0, /* logarithm for alignment of loops labels */ + 0, /* maximum skip for alignment of loops labels */ +}; + /* costs for the i386 */ static const insn_const i386_cost = { 1, /* cost of an add instruction */ @@ -223,6 +248,7 @@ static const insn_const i386_cost = { 1, /* cost of multiply for every set bit */ 2, /* logarithm for alignment of function labels */ 2, /* logarithm for alignment of loops labels */ + 3, /* maximum skip for alignment of loops labels */ }; /* costs for the i486 */ @@ -234,6 +260,7 @@ static const insn_const i486_cost = { 1, /* cost of multiply for every set bit */ 4, /* logarithm for alignment of function labels */ 4, /* logarithm for alignment of loops labels */ + 15, /* maximum skip for alignment of loops labels */ }; /* costs for the Pentium */ @@ -245,6 +272,7 @@ static const insn_const pentium_cost = { 0, /* cost of multiply for every set bit */ 4, /* logarithm for alignment of function labels */ 4, /* logarithm for alignment of loops labels */ + 7, /* maximum skip for alignment of loops labels */ }; /* costs for the Pentium Pro */ @@ -256,6 +284,7 @@ static const insn_const pentiumpro_cost = { 0, /* cost of multiply for every set bit */ 4, /* logarithm for alignment of function labels */ 4, /* logarithm for alignment of loops labels */ + 10, /* maximum skip for alignment of loops labels */ }; /* costs for the K6 */ @@ -267,6 +296,7 @@ static const insn_const k6_cost = { 0, /* cost of multiply for every set bit */ 5, /* 
logarithm for alignment of function labels */ 5, /* logarithm for alignment of loops labels */ + 7, /* maximum skip for alignment of loops labels */ }; /* costs for the Geode */ @@ -278,6 +308,7 @@ static const insn_const geode_cost = { 0, /* cost of multiply for every set bit */ 0, /* logarithm for alignment of function labels */ 0, /* logarithm for alignment of loops labels */ + 0, /* maximum skip for alignment of loops labels */ }; /* costs for the Athlon */ @@ -289,6 +320,7 @@ static const insn_const athlon_cost = { 0, /* cost of multiply for every set bit */ 4, /* logarithm for alignment of function labels */ 4, /* logarithm for alignment of loops labels */ + 7, /* maximum skip for alignment of loops labels */ }; /* costs for the Opteron/K8/K10 */ @@ -298,8 +330,15 @@ static const insn_const k8_cost = { 1, /* cost of a constant shift instruction */ 3, /* starting cost of a multiply instruction */ 0, /* cost of multiply for every set bit */ +#if 0 /* TEST */ 4, /* logarithm for alignment of function labels */ 4, /* logarithm for alignment of loops labels */ + 7, /* maximum skip for alignment of loops labels */ +#else + 0, + 0, + 0 +#endif }; /* costs for the K10 */ @@ -311,6 +350,7 @@ static const insn_const k10_cost = { 0, /* cost of multiply for every set bit */ 5, /* logarithm for alignment of function labels */ 5, /* logarithm for alignment of loops labels */ + 7, /* maximum skip for alignment of loops labels */ }; /* costs for the Pentium 4 */ @@ -322,6 +362,7 @@ static const insn_const netburst_cost = { 0, /* cost of multiply for every set bit */ 4, /* logarithm for alignment of function labels */ 4, /* logarithm for alignment of loops labels */ + 7, /* maximum skip for alignment of loops labels */ }; /* costs for the Nocona and Core */ @@ -333,6 +374,7 @@ static const insn_const nocona_cost = { 0, /* cost of multiply for every set bit */ 4, /* logarithm for alignment of function labels */ 4, /* logarithm for alignment of loops labels */ + 7, /* maximum skip for alignment of loops labels */ }; /* costs for the Core2 */ @@ -344,10 +386,11 @@ static const insn_const core2_cost = { 0, /* cost of multiply for every set bit */ 4, /* logarithm for alignment of function labels */ 4, /* logarithm for alignment of loops labels */ + 10, /* maximum skip for alignment of loops labels */ }; -/* costs for the generic */ -static const insn_const generic_cost = { +/* costs for the generic32 */ +static const insn_const generic32_cost = { 1, /* cost of an add instruction */ 2, /* cost of a lea instruction */ 1, /* cost of a constant shift instruction */ @@ -355,12 +398,17 @@ static const insn_const generic_cost = { 0, /* cost of multiply for every set bit */ 4, /* logarithm for alignment of function labels */ 4, /* logarithm for alignment of loops labels */ + 7, /* maximum skip for alignment of loops labels */ }; -static const insn_const *arch_costs = &generic_cost; +static const insn_const *arch_costs = &generic32_cost; static void set_arch_costs(void) { + if (opt_size) { + arch_costs = &size_cost; + return; + } switch (opt_arch & arch_mask) { case arch_i386: arch_costs = &i386_cost; @@ -398,9 +446,9 @@ static void set_arch_costs(void) case arch_k10: arch_costs = &k10_cost; break; - case 0: + case arch_generic32: default: - arch_costs = &generic_cost; + arch_costs = &generic32_cost; } } @@ -445,38 +493,52 @@ void ia32_setup_cg_config(void) set_arch_costs(); + ia32_cg_config.optimize_size = opt_size != 0; /* on newer intel cpus mov, pop is often faster then leave although it has a * longer opcode 
*/ ia32_cg_config.use_leave = FLAGS(opt_arch, arch_i386 | arch_all_amd | arch_core2); /* P4s don't like inc/decs because they only partially write the flags register which produces false dependencies */ - ia32_cg_config.use_incdec = !FLAGS(opt_arch, arch_netburst | arch_nocona | arch_geode); - ia32_cg_config.use_sse2 = use_sse2; + ia32_cg_config.use_incdec = !FLAGS(opt_arch, arch_netburst | arch_nocona | arch_core2 | arch_geode) || opt_size; + ia32_cg_config.use_sse2 = use_sse2 && FLAGS(arch, arch_feature_sse2); ia32_cg_config.use_ffreep = FLAGS(opt_arch, arch_athlon_plus); ia32_cg_config.use_ftst = !FLAGS(arch, arch_feature_p6_insn); + /* valgrind can't cope with femms yet and the usefullness of the optimisation is questionable anyway */ +#if 0 ia32_cg_config.use_femms = FLAGS(opt_arch, arch_athlon_plus) && FLAGS(arch, arch_feature_mmx | arch_all_amd); +#else + ia32_cg_config.use_femms = 0; +#endif ia32_cg_config.use_fucomi = FLAGS(arch, arch_feature_p6_insn); ia32_cg_config.use_cmov = FLAGS(arch, arch_feature_p6_insn); ia32_cg_config.use_modeD_moves = FLAGS(opt_arch, arch_athlon_plus | arch_geode | arch_ppro | - arch_netburst | arch_nocona | arch_core2 | arch_generic); + arch_netburst | arch_nocona | arch_core2 | arch_generic32); ia32_cg_config.use_add_esp_4 = FLAGS(opt_arch, arch_geode | arch_athlon_plus | - arch_netburst | arch_nocona | arch_core2 | arch_generic); + arch_netburst | arch_nocona | arch_core2 | arch_generic32) && + !opt_size; ia32_cg_config.use_add_esp_8 = FLAGS(opt_arch, arch_geode | arch_athlon_plus | arch_i386 | arch_i486 | arch_ppro | arch_netburst | - arch_nocona | arch_core2 | arch_generic); + arch_nocona | arch_core2 | arch_generic32) && + !opt_size; ia32_cg_config.use_sub_esp_4 = FLAGS(opt_arch, arch_athlon_plus | arch_ppro | - arch_netburst | arch_nocona | arch_core2 | arch_generic); + arch_netburst | arch_nocona | arch_core2 | arch_generic32) && + !opt_size; ia32_cg_config.use_sub_esp_8 = FLAGS(opt_arch, arch_athlon_plus | arch_i386 | arch_i486 | - arch_ppro | arch_netburst | arch_nocona | arch_core2 | arch_generic); - ia32_cg_config.use_imul_mem_imm32 = !FLAGS(opt_arch, arch_k8 | arch_k10); - ia32_cg_config.use_mov_0 = FLAGS(opt_arch, arch_k6); - ia32_cg_config.use_pad_return = FLAGS(opt_arch, arch_athlon_plus | cpu_core2 | arch_generic); + arch_ppro | arch_netburst | arch_nocona | arch_core2 | arch_generic32) && + !opt_size; + ia32_cg_config.use_imul_mem_imm32 = !FLAGS(opt_arch, arch_k8 | arch_k10) || opt_size; + ia32_cg_config.use_pxor = FLAGS(opt_arch, arch_netburst); + ia32_cg_config.use_mov_0 = FLAGS(opt_arch, arch_k6) && !opt_size; + ia32_cg_config.use_pad_return = FLAGS(opt_arch, arch_athlon_plus | arch_core2 | arch_generic32) && !opt_size; + ia32_cg_config.use_bt = FLAGS(opt_arch, arch_core2 | arch_athlon_plus) || opt_size; + ia32_cg_config.use_fisttp = FLAGS(opt_arch & arch, arch_feature_sse3); ia32_cg_config.optimize_cc = opt_cc; ia32_cg_config.use_unsafe_floatconv = opt_unsafe_floatconv; - ia32_cg_config.function_alignment = arch_costs->function_alignment; - ia32_cg_config.label_alignment = arch_costs->label_alignment; + ia32_cg_config.function_alignment = arch_costs->function_alignment; + ia32_cg_config.label_alignment = arch_costs->label_alignment; + ia32_cg_config.label_alignment_max_skip = arch_costs->label_alignment_max_skip; if (opt_arch & (arch_i386 | arch_i486)) { ia32_cg_config.label_alignment_factor = 0;
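
As an illustration (not part of the patch itself): the point of the enum rework is that the arch_feature_* constants are single bits again, while the new arch_*_insn masks carry the "implies" relationships (SSE2 implies SSE1 implies MMX, x86_64 implies SSE2, enhanced 3DNow! implies 3DNow! and MMX). A minimal standalone C sketch of that pattern follows; it reuses the names and values from the diff, but the main() driver is hypothetical and exists only to show the FLAGS() tests.

/*
 * Standalone sketch, not libfirm code: arch_feature_* are raw single bits,
 * arch_*_insn are convenience masks that pull in their prerequisites.
 */
#include <stdio.h>

enum {
	arch_feature_mmx   = 0x00004000,
	arch_feature_sse1  = 0x00010000,
	arch_feature_sse2  = 0x00020000,
	arch_feature_64bit = 0x00400000,

	/* each instruction-set level implies the levels below it */
	arch_mmx_insn   = arch_feature_mmx,
	arch_sse1_insn  = arch_feature_sse1  | arch_mmx_insn,
	arch_sse2_insn  = arch_feature_sse2  | arch_sse1_insn,
	arch_64bit_insn = arch_feature_64bit | arch_sse2_insn,
};

#define FLAGS(x, f) (((x) & (f)) != 0)

int main(void)
{
	unsigned cpu = arch_64bit_insn;   /* e.g. a 64-bit capable CPU */

	/* a single feature bit is still testable on its own ... */
	printf("sse2: %d\n", FLAGS(cpu, arch_feature_sse2));              /* 1 */
	/* ... and plain feature bits no longer drag in lower levels */
	printf("mmx:  %d\n", FLAGS(arch_feature_sse1, arch_feature_mmx)); /* 0 */
	return 0;
}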
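
The other recurring change is the new "size" option: set_arch_costs() now short-circuits to size_cost before consulting opt_arch, and the cost tables gain a label_alignment_max_skip field that is copied into ia32_cg_config. A reduced sketch of that control flow, assuming a trimmed insn_const and a hypothetical main() (the real per-CPU switch is elided):

/*
 * Standalone sketch, not libfirm code: when optimizing for size the CPU
 * choice is ignored and all alignment/padding values come out as zero.
 */
#include <stdio.h>

typedef struct {
	unsigned function_alignment;        /* log2 of function label alignment */
	unsigned label_alignment;           /* log2 of loop label alignment */
	unsigned label_alignment_max_skip;  /* max bytes of padding for a loop label */
} insn_const;

static const insn_const size_cost      = { 0, 0, 0 };
static const insn_const generic32_cost = { 4, 4, 7 };

static int opt_size;
static const insn_const *arch_costs = &generic32_cost;

static void set_arch_costs(void)
{
	if (opt_size) {              /* optimizing for size: ignore opt_arch */
		arch_costs = &size_cost;
		return;
	}
	arch_costs = &generic32_cost;   /* per-CPU switch elided in this sketch */
}

int main(void)
{
	opt_size = 1;
	set_arch_costs();
	printf("max skip: %u\n", arch_costs->label_alignment_max_skip);  /* 0 */
	return 0;
}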