- do not use imul mem, imm32 on newer AMD CPUs
[libfirm] / ir / be / ia32 / ia32_architecture.c
index 81b9790..ff4be15 100644 (file)
@@ -60,7 +60,7 @@ enum cpu_arch_features {
  */
 enum cpu_support {
        /* Intel CPUs */
-       arch_generic     =  0,
+       arch_generic          =  0,
 
        arch_i386        =  1,
        arch_i486        =  2,
@@ -70,24 +70,27 @@ enum cpu_support {
        arch_pentium_2   =  6 | arch_feature_intel | arch_feature_p6 | arch_feature_mmx,
        arch_pentium_3   =  7 | arch_feature_intel | arch_feature_p6 | arch_feature_sse1,
        arch_pentium_4   =  8 | arch_feature_netburst | arch_feature_p6 | arch_feature_sse2,
-       arch_pentium_m   =  9 | arch_feature_intel | arch_feature_p6 | arch_feature_sse2,
-       arch_core        = 10 | arch_feature_intel | arch_feature_p6 | arch_feature_sse3,
-       arch_prescott    = 11 | arch_feature_netburst | arch_feature_p6 | arch_feature_sse3,
-       arch_core2       = 12 | arch_feature_intel | arch_feature_p6 | arch_feature_64bit | arch_feature_ssse3,
+       arch_prescott    =  9 | arch_feature_netburst | arch_feature_p6 | arch_feature_sse3,
+       arch_nocona      = 10 | arch_feature_netburst | arch_feature_p6 | arch_feature_64bit | arch_feature_sse3,
+       arch_pentium_m   = 11 | arch_feature_intel | arch_feature_p6 | arch_feature_sse2,
+       arch_core        = 12 | arch_feature_intel | arch_feature_p6 | arch_feature_sse3,
+       arch_core2       = 13 | arch_feature_intel | arch_feature_p6 | arch_feature_64bit | arch_feature_ssse3,
 
        /* AMD CPUs */
-       arch_k6          = 13 | arch_feature_amd | arch_feature_mmx,
-       arch_k6_2        = 14 | arch_feature_amd | arch_feature_mmx | arch_feature_3DNow,
-       arch_k6_3        = 15 | arch_feature_amd | arch_feature_mmx | arch_feature_3DNow,
-       arch_athlon      = 16 | arch_feature_amd | arch_feature_mmx | arch_feature_3DNowE | arch_feature_p6,
-       arch_athlon_xp   = 17 | arch_feature_amd | arch_feature_sse1 | arch_feature_3DNowE | arch_feature_p6,
-       arch_opteron     = 18 | arch_feature_amd | arch_feature_64bit | arch_feature_3DNowE | arch_feature_p6,
+       arch_k6          = 14 | arch_feature_amd | arch_feature_mmx,
+       arch_geode       = 15 | arch_feature_amd | arch_feature_mmx | arch_feature_3DNowE,
+       arch_k6_2        = 16 | arch_feature_amd | arch_feature_mmx | arch_feature_3DNow,
+       arch_k6_3        = 17 | arch_feature_amd | arch_feature_mmx | arch_feature_3DNow,
+       arch_athlon      = 18 | arch_feature_amd | arch_feature_mmx | arch_feature_3DNowE | arch_feature_p6,
+       arch_athlon_xp   = 19 | arch_feature_amd | arch_feature_sse1 | arch_feature_3DNowE | arch_feature_p6,
+       arch_opteron     = 20 | arch_feature_amd | arch_feature_64bit | arch_feature_3DNowE | arch_feature_p6,
+       arch_k10         = 21 | arch_feature_amd | arch_feature_64bit | arch_feature_3DNowE | arch_feature_p6,
 
        /* other */
-       arch_winchip_c6  = 19 | arch_feature_mmx,
-       arch_winchip2    = 20 | arch_feature_mmx | arch_feature_3DNow,
-       arch_c3          = 21 | arch_feature_mmx | arch_feature_3DNow,
-       arch_c3_2        = 22 | arch_feature_sse1,  /* really no 3DNow! */
+       arch_winchip_c6  = 22 | arch_feature_mmx,
+       arch_winchip2    = 23 | arch_feature_mmx | arch_feature_3DNow,
+       arch_c3          = 24 | arch_feature_mmx | arch_feature_3DNow,
+       arch_c3_2        = 25 | arch_feature_sse1,  /* really no 3DNow! */
 };
 
 /** checks for l <= x <= h */
@@ -100,7 +103,10 @@ enum cpu_support {
 #define ARCH_AMD(x)         (((x) & arch_feature_amd) != 0)
 
 /** return true if it's an Athlon/Opteron */
-#define ARCH_ATHLON(x)      _IN_RANGE((x), arch_athlon, arch_opteron)
+#define ARCH_K8(x)          _IN_RANGE((x), arch_athlon, arch_opteron)
+
+/** return true if it's an Athlon or newer */
+#define ARCH_ATHLON_PLUS(x) _IN_RANGE((x), arch_athlon, arch_k10)
 
 /** return true if the CPU has MMX support */
 #define ARCH_MMX(x)         (((x) & arch_feature_mmx) != 0)
@@ -111,6 +117,9 @@ enum cpu_support {
 /** return true if the CPU has P6 features (CMOV) */
 #define IS_P6_ARCH(x)       (((x) & arch_feature_p6) != 0)
 
+/** return true if the CPU has the NetBurst architecture */
+#define IS_NETBURST_ARCH(x) (((x) & arch_feature_netburst) != 0)
+
 static cpu_support arch                 = arch_generic;
 static cpu_support opt_arch             = arch_pentium_4;
 static int         use_sse2             = 0;
@@ -132,8 +141,8 @@ static const lc_opt_enum_int_items_t arch_items[] = {
        { "p3",         arch_pentium_3, },
        { "pentium4",   arch_pentium_4, },
        { "p4",         arch_pentium_4, },
-       { "prescott",   arch_pentium_4, },
-       { "nocona",     arch_pentium_4, },
+       { "prescott",   arch_prescott, },
+       { "nocona",     arch_nocona, },
        { "pentiumm",   arch_pentium_m, },
        { "pm",         arch_pentium_m, },
        /*
@@ -145,17 +154,20 @@ static const lc_opt_enum_int_items_t arch_items[] = {
         * core2 CPUs: Conroe (XE, L), Allendale, Merom (XE),
         * Kentsfield (XE), Yorkfield XE, Penryn, Wolfdale, Yorkfield
         */
+
        { "merom",      arch_core2, },
        { "core2",      arch_core2, },
        { "k6",         arch_k6, },
        { "k6-2",       arch_k6_2, },
        { "k6-3",       arch_k6_2, },
+       { "geode",      arch_geode, },
        { "athlon",     arch_athlon, },
        { "athlon-xp",  arch_athlon_xp, },
        { "athlon-mp",  arch_athlon_xp, },
        { "athlon-4",   arch_athlon_xp, },
        { "athlon64",   arch_opteron, },
        { "k8",         arch_opteron, },
+       { "k10",        arch_k10, },
        { "opteron",    arch_opteron, },
        { "generic",    arch_generic, },
        { NULL,         0 }
@@ -205,8 +217,8 @@ typedef struct insn_const {
 static const insn_const i386_cost = {
        1,   /* cost of an add instruction */
        1,   /* cost of a lea instruction */
-       2,   /* cost of a constant shift instruction */
-       6,   /* starting cost of a multiply instruction */
+       3,   /* cost of a constant shift instruction */
+       9,   /* starting cost of a multiply instruction */
        1    /* cost of multiply for every set bit */
 };
 
@@ -246,6 +258,15 @@ static const insn_const k6_cost = {
        0    /* cost of multiply for every set bit */
 };
 
+/* costs for the Geode */
+static const insn_const geode_cost = {
+       1,   /* cost of an add instruction */
+       1,   /* cost of a lea instruction */
+       1,   /* cost of a constant shift instruction */
+       7,   /* starting cost of a multiply instruction */
+       0    /* cost of multiply for every set bit */
+ };
+
 /* costs for the Athlon */
 static const insn_const athlon_cost = {
        1,   /* cost of an add instruction */
@@ -255,6 +276,15 @@ static const insn_const athlon_cost = {
        0    /* cost of multiply for every set bit */
 };
 
+/* costs for the Opteron/K8/K10 */
+static const insn_const opteron_cost = {
+       1,   /* cost of an add instruction */
+       2,   /* cost of a lea instruction */
+       1,   /* cost of a constant shift instruction */
+       3,   /* starting cost of a multiply instruction */
+       0    /* cost of multiply for every set bit */
+};
+
 /* costs for the Pentium 4 */
 static const insn_const pentium4_cost = {
        1,   /* cost of an add instruction */
@@ -264,8 +294,8 @@ static const insn_const pentium4_cost = {
        0    /* cost of multiply for every set bit */
 };
 
-/* costs for the Core */
-static const insn_const core_cost = {
+/* costs for the Prescott, Nocona and Core */
+static const insn_const nocona_cost = {
        1,   /* cost of an add instruction */
        1,   /* cost of a lea instruction */
        1,   /* cost of a constant shift instruction */
@@ -273,6 +303,15 @@ static const insn_const core_cost = {
        0    /* cost of multiply for every set bit */
 };
 
+/* costs for the Core2 */
+static const insn_const core2_cost = {
+       1,   /* cost of an add instruction */
+       1,   /* cost of a lea instruction */
+       1,   /* cost of a constant shift instruction */
+       3,   /* starting cost of a multiply instruction */
+        0    /* cost of multiply for every set bit */
+ };
+
 /* costs for the generic */
 static const insn_const generic_cost = {
        1,   /* cost of an add instruction */
@@ -308,24 +347,29 @@ static void set_arch_costs(void)
        case arch_pentium_m:
                arch_costs = &pentiumpro_cost;
                break;
-       case arch_core:
-               arch_costs = &core_cost;
-               break;
+       case arch_nocona:
        case arch_prescott:
-               arch_costs = &pentium4_cost;
+       case arch_core:
+               arch_costs = &nocona_cost;
                break;
        case arch_core2:
-               arch_costs = &core_cost;
+               arch_costs = &core2_cost;
                break;
        case arch_k6:
        case arch_k6_2:
                arch_costs = &k6_cost;
                break;
+       case arch_geode:
+               arch_costs = &geode_cost;
+               break;
        case arch_athlon:
        case arch_athlon_xp:
-       case arch_opteron:
                arch_costs = &athlon_cost;
                break;
+       case arch_opteron:
+       case arch_k10:
+               arch_costs = &opteron_cost;
+               break;
        case arch_generic:
        default:
                arch_costs = &generic_cost;
@@ -367,8 +411,6 @@ int ia32_evaluate_insn(insn_kind kind, tarval *tv) {
        }
 }
 
-
-
 void ia32_setup_cg_config(void)
 {
        memset(&ia32_cg_config, 0, sizeof(ia32_cg_config));
@@ -379,14 +421,29 @@ void ia32_setup_cg_config(void)
                                               || !IS_P6_ARCH(opt_arch);
        /* P4s don't like inc/decs because they only partially write the flags
           register which produces false dependencies */
-       ia32_cg_config.use_incdec           = (opt_arch != arch_pentium_4);
+       ia32_cg_config.use_incdec           = !IS_NETBURST_ARCH(opt_arch) && (opt_arch != arch_generic);
        ia32_cg_config.use_sse2             = use_sse2;
-       ia32_cg_config.use_ffreep           = ARCH_ATHLON(opt_arch);
+       ia32_cg_config.use_ffreep           = ARCH_ATHLON_PLUS(opt_arch);
        ia32_cg_config.use_ftst             = !IS_P6_ARCH(arch);
-       ia32_cg_config.use_femms            = ARCH_ATHLON(opt_arch)
+       ia32_cg_config.use_femms            = ARCH_ATHLON_PLUS(opt_arch)
                                              && ARCH_MMX(arch) && ARCH_AMD(arch);
        ia32_cg_config.use_fucomi           = IS_P6_ARCH(arch);
        ia32_cg_config.use_cmov             = IS_P6_ARCH(arch);
+       ia32_cg_config.use_add_esp_4        = ARCH_ATHLON_PLUS(opt_arch) || (opt_arch == arch_geode) ||
+                                             IS_NETBURST_ARCH(opt_arch) || (opt_arch == arch_core2) ||
+                                             (opt_arch == arch_generic);
+       ia32_cg_config.use_add_esp_8        = ARCH_ATHLON_PLUS(opt_arch) || (opt_arch == arch_geode) ||
+                                             IS_P6_ARCH(opt_arch) || IS_NETBURST_ARCH(opt_arch) ||
+                                             (opt_arch == arch_core2) || (opt_arch == arch_generic) ||
+                                             (opt_arch == arch_i386) || (opt_arch == arch_i486);
+       ia32_cg_config.use_sub_esp_4        = ARCH_ATHLON_PLUS(opt_arch) || IS_P6_ARCH(opt_arch) ||
+                                             IS_NETBURST_ARCH(opt_arch) || (opt_arch == arch_core2) ||
+                                             (opt_arch == arch_generic);
+       ia32_cg_config.use_sub_esp_8        = ARCH_ATHLON_PLUS(opt_arch) ||
+                                             IS_P6_ARCH(opt_arch) || IS_NETBURST_ARCH(opt_arch) ||
+                                             (opt_arch == arch_core2) || (opt_arch == arch_generic) ||
+                                             (opt_arch == arch_i386) || (opt_arch == arch_i486);
+       ia32_cg_config.use_imul_mem_imm32   = !(opt_arch == arch_opteron || opt_arch == arch_k10);
        ia32_cg_config.optimize_cc          = opt_cc;
        ia32_cg_config.use_unsafe_floatconv = opt_unsafe_floatconv;